In [1]:
import pandas as pd
import joblib

In [2]:
# Load the two raw tweet parquet parts and stack them into a single frame.
tweet_0_pt_1 = pd.read_parquet("../../02_data/raw/tweet_0_pt_1.parquet", engine="fastparquet")
tweet_0_pt_2 = pd.read_parquet("../../02_data/raw/tweet_0_pt_2.parquet", engine="fastparquet")

# The parts are read independently, so their row labels may overlap; a plain
# concat would then carry repeated index labels into every later label-based
# operation. ignore_index=True gives the combined frame a fresh 0..n-1 index.
tweet_df = pd.concat([tweet_0_pt_1, tweet_0_pt_2], ignore_index=True)
tweet_df.head(1)

Unnamed: 0,author_id,context_annotations,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,referenced_tweets,reply_settings,...,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope
0,391092102.0,,1502784561763295233,2022-03-12 23:12:14+00:00,t1502784561763295233,,en,0.0,,everyone,...,,,,0.0,0.0,0.0,42.0,,,


In [3]:
# Rewrite numeric author ids as "u<digits>" strings and parse created_at as
# timezone-aware UTC timestamps.
# This overwrites tweet_df["author_id"] in place, so the cell is not
# re-runnable without reloading tweet_df first (the Int64 cast would then be
# applied to already-prefixed strings).
# NOTE(review): author_id is displayed as a float (e.g. 391092102.0); ids
# above 2**53 cannot be held exactly in float64 -- confirm the upstream dtype.
raw_author_id = tweet_df["author_id"]
# astype(str) renders a missing nullable integer as the literal "<NA>", which
# would yield a bogus "u<NA>" key; mask those rows so missing ids stay missing.
tweet_df["author_id"] = ("u" + raw_author_id.astype("Int64").astype(str)).where(raw_author_id.notna())
tweet_df["created_at"] = pd.to_datetime(tweet_df["created_at"], utc=True)

In [4]:
# Report which columns of tweet_df contain missing values, with counts and
# percentages of all rows.
# The per-column null counts are computed once and reused for both the
# "any missing?" check and the summary table, instead of scanning the whole
# frame twice.
na_counts = tweet_df.isnull().sum()

if na_counts.gt(0).any():
    print("Missing values found in the dataset.")
    # Keep only the columns that actually have missing entries.
    na_summary = na_counts.loc[lambda x: x > 0].to_frame(name='Missing Count')
    na_summary['Missing Percentage'] = (na_summary['Missing Count'] / tweet_df.shape[0]) * 100
    print("\nSummary of missing values:")
    print(na_summary)
else:
    print("No missing values found in the dataset.")

Missing values found in the dataset.

Summary of missing values:
                            Missing Count  Missing Percentage
context_annotations              10000000           100.00000
in_reply_to_user_id               7360171            73.60171
referenced_tweets                10000000           100.00000
reply_settings                    9175227            91.75227
attachments.media_keys            9785046            97.85046
attachments.poll_ids              9785046            97.85046
entities.annotations             10000000           100.00000
entities.cashtags                10000000           100.00000
entities.hashtags                10000000           100.00000
entities.media                   10000000           100.00000
entities.mentions                10000000           100.00000
entities.symbols                 10000000           100.00000
entities.urls                    10000000           100.00000
entities.user_mentions           10000000           100.00000
geo.c

In [5]:
# Columns removed from the tweet frame; each of these is reported as 100%
# missing in the summary printed above.
tw_cols_to_drop = [
    "context_annotations", "referenced_tweets",
    "entities.annotations", "entities.cashtags",
    "entities.hashtags", "entities.media",
    "entities.mentions", "entities.symbols",
    "entities.urls", "entities.user_mentions",
]

# drop() returns a new frame, so tweet_df itself is left untouched.
tweet_df_1 = tweet_df.drop(tw_cols_to_drop, axis="columns")
tweet_df_1.head(1)

Unnamed: 0,author_id,conversation_id,created_at,id,in_reply_to_user_id,lang,possibly_sensitive,reply_settings,source,text,...,geo.coordinates,geo.place_id,geo.type,public_metrics.like_count,public_metrics.quote_count,public_metrics.reply_count,public_metrics.retweet_count,withheld.copyright,withheld.country_codes,withheld.scope
0,u391092102,1502784561763295233,2022-03-12 23:12:14+00:00,t1502784561763295233,,en,0.0,everyone,Twitter Web App,RT @ChelseaSTrust: Join the CST now.\n\n#Toget...,...,,,,0.0,0.0,0.0,42.0,,,


In [6]:
# Read the raw user table and preview its first row.
user_df = pd.read_parquet(
    "../../02_data/raw/user.parquet",
    engine="fastparquet",
)
user_df.head(1)

Unnamed: 0,created_at,description,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,...,entities.description.cashtags,entities.description.hashtags,entities.description.mentions,entities.description.urls,entities.url.urls,public_metrics.followers_count,public_metrics.following_count,public_metrics.listed_count,public_metrics.tweet_count,withheld.country_codes
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,0.0,https://t.co/BoMip9FF17,boazbaraktcs,...,,,,,,7316.0,215.0,69.0,3098.0,


In [19]:
# Read the per-user labels (columns: id, label) and preview the first rows.
# The path is a plain string literal: the original f-prefix had no
# placeholders to interpolate.
label_df = pd.read_csv("../../02_data/raw/label.csv")
label_df.head()

Unnamed: 0,id,label
0,u1217628182611927040,human
1,u2664730894,human
2,u1266703520205549568,human
3,u1089159225148882949,human
4,u36741729,bot


In [20]:
# Attach each user's label by id. A left join keeps every row of user_df,
# including users that have no entry in label_df.
user_df_1 = pd.merge(user_df, label_df, how="left", on="id")
user_df_1.head(1)

Unnamed: 0,created_at,description,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,...,entities.description.hashtags,entities.description.mentions,entities.description.urls,entities.url.urls,public_metrics.followers_count,public_metrics.following_count,public_metrics.listed_count,public_metrics.tweet_count,withheld.country_codes,label
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,0.0,https://t.co/BoMip9FF17,boazbaraktcs,...,,,,,7316.0,215.0,69.0,3098.0,,human


In [30]:
# Keep only the users whose id appears as an author_id in the tweet frame.
has_tweets = user_df_1["id"].isin(tweet_df_1["author_id"])
user_df_subset = user_df_1.loc[has_tweets].copy()
user_df_subset.shape

(268238, 22)

In [31]:
# For every tweet, store the share of its author's tweets whose lang is "pl".
# transform("mean") broadcasts the per-author mean back onto each tweet row,
# and the result is written onto tweet_df_1 as a new pct_pl column.
is_pl_tweet = tweet_df_1["lang"] == "pl"
tweet_df_1["pct_pl"] = is_pl_tweet.groupby(tweet_df_1["author_id"]).transform("mean")

# pct_pl is constant within an author, so de-duplicating the two columns
# leaves one (author_id, pct_pl) row per author.
pct_pl_by_author = tweet_df_1[["author_id", "pct_pl"]].drop_duplicates()

# Bring pct_pl onto the user rows; author_id only served as the join key.
user_df_subset_1 = pd.merge(
    user_df_subset,
    pct_pl_by_author,
    how="left",
    left_on="id",
    right_on="author_id",
).drop(columns="author_id")
user_df_subset_1.head()

Unnamed: 0,created_at,description,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,...,entities.description.mentions,entities.description.urls,entities.url.urls,public_metrics.followers_count,public_metrics.following_count,public_metrics.listed_count,public_metrics.tweet_count,withheld.country_codes,label,pct_pl
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,0.0,https://t.co/BoMip9FF17,boazbaraktcs,...,,,,7316.0,215.0,69.0,3098.0,,human,0.0
1,2008-06-23 20:59:59+00:00,"Director, Knowledge Ecology International, an ...",u15211869,"√úT: 38.911326,-77.04508",James Love,1.335259e+18,https://pbs.twimg.com/profile_images/126138453...,0.0,https://t.co/mcNZxOR7gv,jamie_love,...,,,,10299.0,2166.0,383.0,57397.0,,human,0.0
2,2014-04-27 00:20:12+00:00,"paper tweets, dms are open",u2465283662,,AK,,https://pbs.twimg.com/profile_images/145119163...,0.0,,ak92501,...,,,,45541.0,1206.0,605.0,9194.0,,bot,0.0
3,2011-04-20 03:30:01+00:00,Assistant prof @NottsPolitics | institutional ...,u284870222,"Nottingham, UK",Anna Meier,1.303136e+18,https://pbs.twimg.com/profile_images/145636471...,0.0,https://t.co/bnWgIr4Oyv,AnnaMeierPS,...,,,,9490.0,2146.0,93.0,19122.0,,human,0.0
4,2009-10-18 16:06:58+00:00,Creative developer ‚Ä¢ Freelancer ‚Ä¢ Teacher ‚Ä¢ We...,u83389771,Paris,Bruno Simon,1.351508e+18,https://pbs.twimg.com/profile_images/123799977...,0.0,https://t.co/UEPMHqG4nV,bruno_simon,...,,,,33173.0,414.0,417.0,5132.0,,human,0.0


In [39]:
# Users for whom at least 90% of their tweets have lang == "pl".
# drop() already returns a new frame, so the helper column is removed in the
# same chain rather than via copy() followed by an inplace mutation.
user_df_subset_pl = (
    user_df_subset_1
    .loc[user_df_subset_1["pct_pl"] >= 0.9]
    .drop(columns="pct_pl")
)

user_df_subset_pl.head(1)

Unnamed: 0,created_at,description,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,...,entities.description.hashtags,entities.description.mentions,entities.description.urls,entities.url.urls,public_metrics.followers_count,public_metrics.following_count,public_metrics.listed_count,public_metrics.tweet_count,withheld.country_codes,label
7685,2009-07-22 23:40:16+00:00,üë®‚Äçüíª Software developer for 20+ years (full-sta...,u59291478,"Cardiff, Wales",Savvas Stephanides,1.48058e+18,https://pbs.twimg.com/profile_images/147983948...,0.0,https://t.co/LbRwXBRHbe,SavvasStephnds,...,,,,,3521.0,423.0,83.0,11818.0,,human


In [40]:
# Users for whom fewer than 90% of their tweets have lang == "pl".
# drop() already returns a new frame, so the helper column is removed in the
# same chain rather than via copy() followed by an inplace mutation.
user_df_subset_non_pl = (
    user_df_subset_1
    .loc[user_df_subset_1["pct_pl"] < 0.9]
    .drop(columns="pct_pl")
)

user_df_subset_non_pl.head(1)

Unnamed: 0,created_at,description,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,...,entities.description.hashtags,entities.description.mentions,entities.description.urls,entities.url.urls,public_metrics.followers_count,public_metrics.following_count,public_metrics.listed_count,public_metrics.tweet_count,withheld.country_codes,label
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,0.0,https://t.co/BoMip9FF17,boazbaraktcs,...,,,,,7316.0,215.0,69.0,3098.0,,human


In [43]:
# Label proportions across all users that have tweets (missing labels are
# excluded by value_counts' default).
all_user_labels = user_df_subset_1["label"]
all_user_labels.value_counts(normalize=True)

label
human    0.936959
bot      0.063041
Name: proportion, dtype: float64

In [44]:
# Label proportions within the pct_pl >= 0.9 user subset (missing labels are
# excluded by value_counts' default).
pl_user_labels = user_df_subset_pl["label"]
pl_user_labels.value_counts(normalize=True)

label
human    0.952703
bot      0.047297
Name: proportion, dtype: float64

In [45]:
# Label proportions within the pct_pl < 0.9 user subset (missing labels are
# excluded by value_counts' default).
non_pl_user_labels = user_df_subset_non_pl["label"]
non_pl_user_labels.value_counts(normalize=True)

label
human    0.93695
bot      0.06305
Name: proportion, dtype: float64

In [41]:
# Tweets written by users in the pct_pl >= 0.9 subset.
by_pl_user = tweet_df_1["author_id"].isin(user_df_subset_pl["id"])
tweet_df_pl = tweet_df_1.loc[by_pl_user].copy()
tweet_df_pl.shape

(2690, 23)

In [42]:
# Every tweet whose author is NOT in the pct_pl >= 0.9 user subset.
# NOTE(review): this is the complement of the pl set rather than membership
# in user_df_subset_non_pl, so tweets from authors absent from the user table
# would also land here -- confirm that is intended.
by_pl_user = tweet_df_1["author_id"].isin(user_df_subset_pl["id"])
tweet_df_non_pl = tweet_df_1.loc[~by_pl_user].copy()
tweet_df_non_pl.shape

(9997310, 23)

In [51]:
# Preview the first row of the non-pl user subset.
user_df_subset_non_pl.iloc[:1]

Unnamed: 0,created_at,description,id,location,name,pinned_tweet_id,profile_image_url,protected,url,username,...,entities.description.hashtags,entities.description.mentions,entities.description.urls,entities.url.urls,public_metrics.followers_count,public_metrics.following_count,public_metrics.listed_count,public_metrics.tweet_count,withheld.country_codes,label
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,0.0,https://t.co/BoMip9FF17,boazbaraktcs,...,,,,,7316.0,215.0,69.0,3098.0,,human
