In [1]:
import os

import numpy as np
import polars as pl

In [2]:
# Directory holding the input CSV files and the artifacts written by this notebook.
# Set the INFLUAI_EMBEDDINGS_DIR environment variable to run the notebook on
# another machine; when it is unset the original location is used.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# every path below is built by plain string concatenation (FOLDER + "name.csv").
FOLDER = os.path.join(
    os.environ.get(
        "INFLUAI_EMBEDDINGS_DIR",
        "/home/deniskirbaba/Documents/influai-data/embeddings/",
    ),
    "",
)

## Join

Соединим (join) id из posts_content с id из posts_metadata, чтобы получить id, channel_id, post_date для постов, у которых точно есть что-то в raw_text.

In [5]:
# File-name stems (without the ".csv" extension) of the batched posts metadata exports
posts_meta_batched_names = [
    "posts_meta_5000000",
    "posts_meta_10000000",
    "posts_meta_15000000",
    "posts_meta_20000000",
    "posts_meta_25000000",
    "posts_meta_30000000",
    "posts_meta_35000000",
    "posts_meta_36745798",
]

# Read the first batch to get the initial `posts_meta` frame
first_batch_path = FOLDER + posts_meta_batched_names[0] + ".csv"
posts_meta = pl.read_csv(first_batch_path)
posts_meta

id,channel_id,post_date
i64,i64,str
20163752,1235821477,"""2024-07-05T08:55:10.000000"""
20163753,1235821477,"""2024-07-04T15:52:28.000000"""
20163754,1235821477,"""2024-07-02T09:39:19.000000"""
20163755,1235821477,"""2024-07-02T09:38:44.000000"""
20163756,1235821477,"""2024-07-02T09:35:44.000000"""
…,…,…
25153714,1419580345,"""2024-09-02T16:16:29.000000"""
25153715,1419580345,"""2024-09-02T08:15:26.000000"""
25153716,1419580345,"""2024-09-01T16:11:19.000000"""
25153717,1419580345,"""2024-09-01T08:10:27.000000"""


In [6]:
# Read every batch and concatenate them in a single call.
# The frame is rebuilt from the complete list of stems instead of appending to
# whatever `posts_meta` currently holds, so re-running this cell cannot append
# the same batches a second time (which would duplicate rows in the later join).
# Row order is the order of `posts_meta_batched_names`, first batch first.
posts_meta = pl.concat(
    [pl.read_csv(FOLDER + batch_stem + ".csv") for batch_stem in posts_meta_batched_names],
    how="vertical",
)

posts_meta

id,channel_id,post_date
i64,i64,str
20163752,1235821477,"""2024-07-05T08:55:10.000000"""
20163753,1235821477,"""2024-07-04T15:52:28.000000"""
20163754,1235821477,"""2024-07-02T09:39:19.000000"""
20163755,1235821477,"""2024-07-02T09:38:44.000000"""
20163756,1235821477,"""2024-07-02T09:35:44.000000"""
…,…,…
20163747,1235821477,"""2024-07-07T18:00:27.000000"""
20163748,1235821477,"""2024-07-07T07:11:31.000000"""
20163749,1235821477,"""2024-07-07T07:11:31.000000"""
20163750,1235821477,"""2024-07-05T08:55:10.000000"""


In [7]:
# Single-column table with the ids of posts that have text
ids_with_text_path = FOLDER + "id_posts_with_text.csv"
post_ids_with_text = pl.read_csv(ids_with_text_path)
post_ids_with_text

id
i64
20852619
20852620
20852621
20852622
20852623
…
20852610
20852611
20852612
20852613


In [8]:
%%time
# Inner join: keep only the metadata rows whose id occurs in `post_ids_with_text`
posts_with_text_meta = posts_meta.join(
    post_ids_with_text,
    how="inner",
    on="id",
)
posts_with_text_meta

CPU times: user 10.6 s, sys: 3.6 s, total: 14.2 s
Wall time: 7 s


id,channel_id,post_date
i64,i64,str
20163752,1235821477,"""2024-07-05T08:55:10.000000"""
20163753,1235821477,"""2024-07-04T15:52:28.000000"""
20163761,1235821477,"""2024-07-02T09:33:40.000000"""
20163762,1235821477,"""2024-07-01T11:23:10.000000"""
20163763,1235821477,"""2024-06-30T20:40:22.000000"""
…,…,…
20163742,1235821477,"""2024-07-13T08:55:05.000000"""
20163745,1235821477,"""2024-07-10T09:22:54.000000"""
20163746,1235821477,"""2024-07-09T18:35:10.000000"""
20163747,1235821477,"""2024-07-07T18:00:27.000000"""


In [9]:
# Drop the references to both join inputs; only the joined frame is used below
del post_ids_with_text, posts_meta

In [10]:
# Store the joined metadata on disk as CSV
posts_with_text_meta_path = FOLDER + "posts_with_text_meta.csv"
posts_with_text_meta.write_csv(posts_with_text_meta_path)

In [3]:
# Load the joined metadata back from the CSV written by the previous cell
posts_with_text_meta = pl.read_csv(
    FOLDER + "posts_with_text_meta.csv",
)

## Construct a table with previous post ids

Создадим таблицу, в которой для каждого поста (id, channel_id) определён список id предыдущих постов этого же канала (не более 10 последних).

In [4]:
%%time
# Convert `post_date` from str to Datetime (needed because the frame is sorted by post_date later).
# The strings carry no UTC offset; polars documents that convert_time_zone on a
# time-zone-naive Datetime converts as if the values were already in UTC.
post_date_utc = (
    pl.col("post_date")
    .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f")
    .dt.cast_time_unit("ns")
    .dt.convert_time_zone("UTC")
)
posts_with_text_meta = posts_with_text_meta.with_columns(post_date_utc)
posts_with_text_meta

CPU times: user 21.8 s, sys: 219 ms, total: 22 s
Wall time: 6.85 s


id,channel_id,post_date
i64,i64,"datetime[ns, UTC]"
20163752,1235821477,2024-07-05 08:55:10 UTC
20163753,1235821477,2024-07-04 15:52:28 UTC
20163761,1235821477,2024-07-02 09:33:40 UTC
20163762,1235821477,2024-07-01 11:23:10 UTC
20163763,1235821477,2024-06-30 20:40:22 UTC
…,…,…
20163742,1235821477,2024-07-13 08:55:05 UTC
20163745,1235821477,2024-07-10 09:22:54 UTC
20163746,1235821477,2024-07-09 18:35:10 UTC
20163747,1235821477,2024-07-07 18:00:27 UTC


In [5]:
%%time
# Order the rows by channel and then chronologically, and number them in that order.
# The resulting `index` column is what the rolling window below counts over.
posts_with_text_meta = (
    posts_with_text_meta
    .sort(by=["channel_id", "post_date"])
    .with_row_index(name="index")
)
posts_with_text_meta

CPU times: user 11.4 s, sys: 1.93 s, total: 13.4 s
Wall time: 5.68 s


index,id,channel_id,post_date
u32,i64,i64,"datetime[ns, UTC]"
0,4007211,1000183937,2024-01-09 06:46:26 UTC
1,4007210,1000183937,2024-01-09 15:00:00 UTC
2,4007209,1000183937,2024-01-10 05:02:05 UTC
3,4007208,1000183937,2024-01-10 12:01:35 UTC
4,4007207,1000183937,2024-01-11 07:29:01 UTC
…,…,…,…
21640406,7225243,2493256284,2024-11-12 16:09:23 UTC
21640407,7225239,2493256284,2024-11-18 19:37:44 UTC
21640408,7225238,2493256284,2024-11-23 09:56:06 UTC
21640409,7225233,2493256284,2024-11-26 17:37:24 UTC


In [6]:
%%time

# Number of the most recent posts kept before each target post
N_POSTS_BEFORE_TARGET = 10

# Index-based rolling window per channel. With closed="left" the current row is
# excluded, so each list holds the ids of up to N_POSTS_BEFORE_TARGET preceding
# posts of the same channel (fewer for the first posts of a channel, empty for
# the very first one).
window_size = f"{N_POSTS_BEFORE_TARGET}i"
prev_posts_with_text = posts_with_text_meta.rolling(
    "index",
    period=window_size,
    group_by="channel_id",
    closed="left",
).agg(pl.col("id").alias("prev_id"))
prev_posts_with_text

CPU times: user 4.13 s, sys: 4.51 s, total: 8.63 s
Wall time: 8.12 s


channel_id,index,prev_id
i64,u32,list[i64]
1000183937,0,[]
1000183937,1,[4007211]
1000183937,2,"[4007211, 4007210]"
1000183937,3,"[4007211, 4007210, 4007209]"
1000183937,4,"[4007211, 4007210, … 4007208]"
…,…,…
2493256284,21640406,"[7225268, 7225267, … 7225247]"
2493256284,21640407,"[7225267, 7225263, … 7225243]"
2493256284,21640408,"[7225263, 7225262, … 7225239]"
2493256284,21640409,"[7225262, 7225261, … 7225238]"


In [7]:
# Put the `id` column back. The ids are attached by position, which is only
# valid while the rolling result is still in `index` order, so verify that first.
assert prev_posts_with_text["index"].is_sorted()

post_ids = posts_with_text_meta["id"]
prev_posts_with_text.insert_column(0, post_ids)
# `index` was only needed for the rolling window
prev_posts_with_text.drop_in_place("index")

# The source frame is no longer needed
del post_ids
del posts_with_text_meta

prev_posts_with_text

id,channel_id,prev_id
i64,i64,list[i64]
4007211,1000183937,[]
4007210,1000183937,[4007211]
4007209,1000183937,"[4007211, 4007210]"
4007208,1000183937,"[4007211, 4007210, 4007209]"
4007207,1000183937,"[4007211, 4007210, … 4007208]"
…,…,…
7225243,2493256284,"[7225268, 7225267, … 7225247]"
7225239,2493256284,"[7225267, 7225263, … 7225243]"
7225238,2493256284,"[7225263, 7225262, … 7225239]"
7225233,2493256284,"[7225262, 7225261, … 7225238]"


## Now load the metadata for ad posts and join it with the previously calculated table

In [9]:
%%time
# Metadata of the ad posts (columns: id, channel_id, post_date)
ad_posts_meta_path = FOLDER + "ad_posts_meta.csv"
ad_posts_meta = pl.read_csv(ad_posts_meta_path)
ad_posts_meta

CPU times: user 127 ms, sys: 20.8 ms, total: 148 ms
Wall time: 125 ms


id,channel_id,post_date
i64,i64,str
6901405,1229173666,"""2024-04-27T18:16:09.000000"""
6901333,1229173666,"""2024-05-01T18:15:06.000000"""
6900679,1229173666,"""2024-06-02T18:15:18.000000"""
6900562,1229173666,"""2024-06-07T18:15:10.000000"""
6899803,1229173666,"""2024-07-11T18:15:59.000000"""
…,…,…
6902253,1229173666,"""2024-03-22T18:16:45.000000"""
6902226,1229173666,"""2024-03-23T18:16:01.000000"""
6902013,1229173666,"""2024-03-31T18:15:08.000000"""
6901782,1229173666,"""2024-04-09T18:16:09.000000"""


In [13]:
%%time
# Inner join by id: keep only the rows of the previous-posts table that are ad posts.
# `channel_id` exists in both frames, so the right-hand copy arrives as `channel_id_right`.
ad_prev_posts = prev_posts_with_text.join(
    ad_posts_meta,
    how="inner",
    on="id",
)
ad_prev_posts

CPU times: user 1 s, sys: 20.4 ms, total: 1.02 s
Wall time: 544 ms


id,channel_id,prev_id,channel_id_right,post_date
i64,i64,list[i64],i64,str
4006259,1000183937,"[4006269, 4006268, … 4006260]",1000183937,"""2024-11-06T15:14:08.000000"""
8352825,1000627591,"[8352835, 8352834, … 8352826]",1000627591,"""2024-09-16T09:06:21.000000"""
7074797,1000683515,"[7074807, 7074806, … 7074798]",1000683515,"""2024-08-26T06:02:01.000000"""
9257579,1000735083,"[9257589, 9257588, … 9257580]",1000735083,"""2024-01-23T16:43:51.000000"""
9257507,1000735083,"[9257517, 9257516, … 9257508]",1000735083,"""2024-01-28T10:02:48.000000"""
…,…,…,…,…
12415659,2314282174,"[12415682, 12415676, … 12415660]",2314282174,"""2024-11-27T14:58:15.000000"""
12415580,2314282174,"[12415600, 12415593, … 12415582]",2314282174,"""2024-11-28T09:52:06.000000"""
12415573,2314282174,"[12415585, 12415584, … 12415574]",2314282174,"""2024-11-28T11:33:01.000000"""
4735900,2376767046,"[4735914, 4735913, … 4735901]",2376767046,"""2024-10-14T07:02:44.000000"""


In [14]:
# Remove the columns that came from `ad_posts_meta` and are not part of the result
ad_prev_posts = ad_prev_posts.drop(["channel_id_right", "post_date"])

# Both join inputs can be released now
del prev_posts_with_text, ad_posts_meta

ad_prev_posts

id,channel_id,prev_id
i64,i64,list[i64]
4006259,1000183937,"[4006269, 4006268, … 4006260]"
8352825,1000627591,"[8352835, 8352834, … 8352826]"
7074797,1000683515,"[7074807, 7074806, … 7074798]"
9257579,1000735083,"[9257589, 9257588, … 9257580]"
9257507,1000735083,"[9257517, 9257516, … 9257508]"
…,…,…
12415659,2314282174,"[12415682, 12415676, … 12415660]"
12415580,2314282174,"[12415600, 12415593, … 12415582]"
12415573,2314282174,"[12415585, 12415584, … 12415574]"
4735900,2376767046,"[4735914, 4735913, … 4735901]"


In [15]:
%%time
# Write the table to disk in NumPy .npy format.
# NOTE(review): `prev_id` is a list column, so the converted array is presumably
# object-dtype and will need np.load(..., allow_pickle=True) to read back — confirm.
np.save(
    file=FOLDER + "ad_prev_posts.npy",
    arr=ad_prev_posts.to_numpy(),
)

CPU times: user 1.87 s, sys: 303 ms, total: 2.18 s
Wall time: 2.27 s
