# Data Exploration


Read the genuine users' data and apply fixes to the columns' names and values.


## Read Data


In [1]:
import re
import emoji
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from arabert.preprocess import ArabertPreprocessor

In [2]:
# we have mutiple raw files according to topics (gov - jour - news - univ).
# change the target topic in the following variable and rerun the notebook.

topic = "univ"
gen_raw_tweets_paths = f"../data/raw/collected_gen/{topic}/*Tweet*.csv"
gen_raw_users_paths = f"../data/raw/collected_gen/{topic}/*User*.csv"

# In case of sports and banking topics uncomment the follwoing cells

topic = "banking"
gen_raw_tweets_paths = f"../data/raw/{topic}_tweets.csv"
gen_raw_users_paths = f"../data/raw/{topic}_users.csv"

prop_processed_data_path = "../data/processed/propaganda.json"
gen_processed_data_path = f"../data/processed/genuine_{topic}.json"

processing_model = "aubmindlab/bert-base-arabertv2"

In [3]:
# we are trying to match the format of this df to concat all later

prop_df = pd.read_json(prop_processed_data_path)
prop_df.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,is_retweet,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,text
0,1161436140945195008,392352672,نجديه عذيه .,Ksa_FO1,نجد العذيه للدعم,مابادل اقل من ٥ الاف ولا أقبل المقدم بدون طلب ...,70616,52577,1318723200000,RT @GROB_07: #حـسـآب_تـمــيزة_بـتـفـاعـل 💫\n⠀┈...,True,0,0,0,0,3,0,2,# حسآب_تميزة_بتفاعل نج م القوآئم لهذآ اليوم # ...
1,764273648496742400,4708503082,ْ‏ོ جود العبداللّهہ|| ོالتنبيهات ما توصل,j06__,,‏‏‏‏‏‏‏‏‏‏‏إنه يستجيب منذُ أن دعوُت ولگِن يُدب...,60061,2617,1451865600000,اللهم اكثر من امثالي ليعم الخير و تعم السعادة ...,False,0,0,0,0,1,0,0,اللهم أكثر من امثالي ليعم الخير و تعم السعادة#...


In [4]:
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

In [5]:
import glob

paths = glob.glob(gen_raw_tweets_paths)
paths

['../data/raw/banking_tweets.csv']

In [6]:
dfs = [pd.read_csv(f) for f in paths]

In [7]:
df1 = pd.concat(dfs, ignore_index=True)
df1.head(2)

Unnamed: 0,in_reply_to_user_id,id,source,text,created_at,referenced_tweets,conversation_id,lang,possibly_sensitive,author_id,entities.urls,entities.hashtags,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,attachments.media_keys,entities.annotations,entities.mentions,geo.place_id
0,947365777.0,1526234562359066624,Twitter for iPhone,الفائز بالمركز الثالث في #هاكثون_الادخار #وزين...,2022-05-16T16:14:10.000Z,"[{'type': 'replied_to', 'id': '152623416602681...",1526222020568023042,ar,False,947365777,"[{'start': 148, 'end': 171, 'url': 'https://t....","[{'start': 25, 'end': 40, 'tag': 'هاكثون_الادخ...",2,1,8,0,['3_1526234554218029059'],,,
1,,1488090545981378568,Twitter for iPhone,استقبل صاحب السمو الملكي الأمير سعود بن نايف\n...,2022-01-31T10:03:28.000Z,,1488090545981378568,ar,False,2321495687,"[{'start': 279, 'end': 302, 'url': 'https://t....","[{'start': 45, 'end': 58, 'tag': 'أمير_الشرقية'}]",27,5,32,0,"['3_1488090518890422276', '3_14880905189114388...","[{'start': 32, 'end': 43, 'probability': 0.813...",,


In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2012 entries, 0 to 2011
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   in_reply_to_user_id           126 non-null    float64
 1   id                            2012 non-null   int64  
 2   source                        2012 non-null   object 
 3   text                          2012 non-null   object 
 4   created_at                    2012 non-null   object 
 5   referenced_tweets             400 non-null    object 
 6   conversation_id               2012 non-null   int64  
 7   lang                          2012 non-null   object 
 8   possibly_sensitive            2012 non-null   bool   
 9   author_id                     2012 non-null   int64  
 10  entities.urls                 1677 non-null   object 
 11  entities.hashtags             1264 non-null   object 
 12  public_metrics.retweet_count  2012 non-null   int64  
 13  pub

## Process Column Names


In [9]:
df1 = df1.dropna(subset=["text"])

In [10]:
is_retweet = []

for ref_tweet in df1.referenced_tweets.values:
    if not pd.isna(ref_tweet) and not ref_tweet.isdigit():
        is_retweet.append(eval(ref_tweet)[0]["type"] == "retweeted")
    else:
        is_retweet.append(False)

df1["is_retweet"] = is_retweet

In [11]:
col_rename = {
    "id": "tweetid",
    "text": "tweet_text",
    "created_at": "tweet_time",
    "author_id": "userid",
    "entities.urls": "urls",
    "entities.hashtags": "hashtags",
    "public_metrics.retweet_count": "retweet_count",
    "public_metrics.reply_count": "reply_count",
    "public_metrics.like_count": "like_count",
    "public_metrics.quote_count": "quote_count",
    "entities.mentions": "user_mentions",
    "is_retweet": "is_retweet",
}

df1.rename(columns=col_rename, inplace=True)
df1 = df1[col_rename.values()]
df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet
0,1526234562359066624,الفائز بالمركز الثالث في #هاكثون_الادخار #وزين...,2022-05-16T16:14:10.000Z,947365777,"[{'start': 148, 'end': 171, 'url': 'https://t....","[{'start': 25, 'end': 40, 'tag': 'هاكثون_الادخ...",2,1,8,0,,False
1,1488090545981378568,استقبل صاحب السمو الملكي الأمير سعود بن نايف\n...,2022-01-31T10:03:28.000Z,2321495687,"[{'start': 279, 'end': 302, 'url': 'https://t....","[{'start': 45, 'end': 58, 'tag': 'أمير_الشرقية'}]",27,5,32,0,,False


In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2012 entries, 0 to 2011
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tweetid        2012 non-null   int64 
 1   tweet_text     2012 non-null   object
 2   tweet_time     2012 non-null   object
 3   userid         2012 non-null   int64 
 4   urls           1677 non-null   object
 5   hashtags       1264 non-null   object
 6   retweet_count  2012 non-null   int64 
 7   reply_count    2012 non-null   int64 
 8   like_count     2012 non-null   int64 
 9   quote_count    2012 non-null   int64 
 10  user_mentions  383 non-null    object
 11  is_retweet     2012 non-null   bool  
dtypes: bool(1), int64(6), object(5)
memory usage: 175.0+ KB


In [13]:
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

## Read Users' Data


In [14]:
paths = glob.glob(gen_raw_users_paths)
paths

['../data/raw/banking_users.csv']

In [15]:
dfs = [pd.read_csv(f, lineterminator="\n") for f in paths]

In [16]:
df2 = pd.concat(dfs, ignore_index=True)
df2.head(2)

Unnamed: 0,profile_image_url,description,id,protected,verified,username,location,url,pinned_tweet_id,name,created_at,public_metrics.followers_count,public_metrics.following_count,public_metrics.tweet_count,public_metrics.listed_count,entities.url.urls,entities.description.mentions,entities.description.hashtags,entities.description.urls\r
0,https://pbs.twimg.com/profile_images/138498510...,بنك التنمية الاجتماعية، أحد الركائز الحكومية ا...,947365777,False,True,SDB_sa,Kingdom of Saudi Arabia,https://t.co/Ddl5jzUUt3,1.55948e+18,بنك التنمية الاجتماعية,2012-11-14T08:50:59.000Z,409379,12,58514,595,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...","[{'start': 146, 'end': 155, 'username': 'SDB_C...",,\r
1,https://pbs.twimg.com/profile_images/144205752...,الحساب الرسمي لـ #إمارة_المنطقة_الشرقية - المم...,2321495687,False,True,emara_sharqia,,https://t.co/u7VbncJEOe,,إمارة المنطقة الشرقية,2014-02-03T08:49:37.000Z,477714,7,12648,351,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...",,"[{'start': 17, 'end': 39, 'tag': 'إمارة_المنطق...",\r


In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393 entries, 0 to 392
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   profile_image_url               393 non-null    object 
 1   description                     383 non-null    object 
 2   id                              393 non-null    int64  
 3   protected                       393 non-null    bool   
 4   verified                        393 non-null    bool   
 5   username                        393 non-null    object 
 6   location                        308 non-null    object 
 7   url                             305 non-null    object 
 8   pinned_tweet_id                 227 non-null    float64
 9   name                            393 non-null    object 
 10  created_at                      393 non-null    object 
 11  public_metrics.followers_count  393 non-null    int64  
 12  public_metrics.following_count  393 

In [18]:
prop_df.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,is_retweet,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,text
0,1161436140945195008,392352672,نجديه عذيه .,Ksa_FO1,نجد العذيه للدعم,مابادل اقل من ٥ الاف ولا أقبل المقدم بدون طلب ...,70616,52577,1318723200000,RT @GROB_07: #حـسـآب_تـمــيزة_بـتـفـاعـل 💫\n⠀┈...,True,0,0,0,0,3,0,2,# حسآب_تميزة_بتفاعل نج م القوآئم لهذآ اليوم # ...
1,764273648496742400,4708503082,ْ‏ོ جود العبداللّهہ|| ོالتنبيهات ما توصل,j06__,,‏‏‏‏‏‏‏‏‏‏‏إنه يستجيب منذُ أن دعوُت ولگِن يُدب...,60061,2617,1451865600000,اللهم اكثر من امثالي ليعم الخير و تعم السعادة ...,False,0,0,0,0,1,0,0,اللهم أكثر من امثالي ليعم الخير و تعم السعادة#...


In [19]:
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

Keep user IDs that already has tweets stored in data frame number 1.


In [20]:
df2 = df2[df2.id.isin(df1["userid"])]
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 190 entries, 0 to 391
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   profile_image_url               190 non-null    object 
 1   description                     190 non-null    object 
 2   id                              190 non-null    int64  
 3   protected                       190 non-null    bool   
 4   verified                        190 non-null    bool   
 5   username                        190 non-null    object 
 6   location                        158 non-null    object 
 7   url                             174 non-null    object 
 8   pinned_tweet_id                 111 non-null    float64
 9   name                            190 non-null    object 
 10  created_at                      190 non-null    object 
 11  public_metrics.followers_count  190 non-null    int64  
 12  public_metrics.following_count  190 non-n

In [21]:
df2.head(2)

Unnamed: 0,profile_image_url,description,id,protected,verified,username,location,url,pinned_tweet_id,name,created_at,public_metrics.followers_count,public_metrics.following_count,public_metrics.tweet_count,public_metrics.listed_count,entities.url.urls,entities.description.mentions,entities.description.hashtags,entities.description.urls\r
0,https://pbs.twimg.com/profile_images/138498510...,بنك التنمية الاجتماعية، أحد الركائز الحكومية ا...,947365777,False,True,SDB_sa,Kingdom of Saudi Arabia,https://t.co/Ddl5jzUUt3,1.55948e+18,بنك التنمية الاجتماعية,2012-11-14T08:50:59.000Z,409379,12,58514,595,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...","[{'start': 146, 'end': 155, 'username': 'SDB_C...",,\r
1,https://pbs.twimg.com/profile_images/144205752...,الحساب الرسمي لـ #إمارة_المنطقة_الشرقية - المم...,2321495687,False,True,emara_sharqia,,https://t.co/u7VbncJEOe,,إمارة المنطقة الشرقية,2014-02-03T08:49:37.000Z,477714,7,12648,351,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...",,"[{'start': 17, 'end': 39, 'tag': 'إمارة_المنطق...",\r


## Align Data Frames


Create missing columns.


In [22]:
names = []
handles = []
locations = []
bios = []
followers = []
following = []
creation_dates = []


for user in df1.userid.values:
    user_df = df2[df2.id == user]
    if len(user_df) == 0:
        names.append(None)
        handles.append(None)
        locations.append(None)
        bios.append(None)
        followers.append(None)
        following.append(None)
        creation_dates.append(None)
        continue
    names.append(user_df.loc[:, "name"].values[0])
    handles.append(user_df.loc[:, "username"].values[0])
    creation_dates.append(user_df.loc[:, "created_at"].values[0])
    locations.append(user_df.loc[:, "location"].values[0])
    bios.append(user_df.loc[:, "description"].values[0])
    followers.append(user_df.loc[:, "public_metrics.followers_count"].values[0])
    following.append(user_df.loc[:, "public_metrics.following_count"].values[0])

df1["user_display_name"] = names
df1["user_screen_name"] = handles
df1["user_reported_location"] = locations
df1["user_profile_description"] = bios
df1["follower_count"] = followers
df1["following_count"] = following
df1["account_creation_date"] = creation_dates

df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date
0,1526234562359066624,الفائز بالمركز الثالث في #هاكثون_الادخار #وزين...,2022-05-16T16:14:10.000Z,947365777,"[{'start': 148, 'end': 171, 'url': 'https://t....","[{'start': 25, 'end': 40, 'tag': 'هاكثون_الادخ...",2,1,8,0,,False,بنك التنمية الاجتماعية,SDB_sa,Kingdom of Saudi Arabia,بنك التنمية الاجتماعية، أحد الركائز الحكومية ا...,409379,12,2012-11-14T08:50:59.000Z
1,1488090545981378568,استقبل صاحب السمو الملكي الأمير سعود بن نايف\n...,2022-01-31T10:03:28.000Z,2321495687,"[{'start': 279, 'end': 302, 'url': 'https://t....","[{'start': 45, 'end': 58, 'tag': 'أمير_الشرقية'}]",27,5,32,0,,False,إمارة المنطقة الشرقية,emara_sharqia,,الحساب الرسمي لـ #إمارة_المنطقة_الشرقية - المم...,477714,7,2014-02-03T08:49:37.000Z


In [23]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2012 entries, 0 to 2011
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   2012 non-null   int64 
 1   tweet_text                2012 non-null   object
 2   tweet_time                2012 non-null   object
 3   userid                    2012 non-null   int64 
 4   urls                      1677 non-null   object
 5   hashtags                  1264 non-null   object
 6   retweet_count             2012 non-null   int64 
 7   reply_count               2012 non-null   int64 
 8   like_count                2012 non-null   int64 
 9   quote_count               2012 non-null   int64 
 10  user_mentions             383 non-null    object
 11  is_retweet                2012 non-null   bool  
 12  user_display_name         2012 non-null   object
 13  user_screen_name          2012 non-null   object
 14  user_reported_location  

In [24]:
df1["urls"] = [
    len(eval(urls)) if not pd.isna(urls) and not urls.isdigit() else 0
    for urls in df1["urls"].values
]
df1["hashtags"] = [
    len(eval(hashtags)) if not pd.isna(hashtags) else 0
    for hashtags in df1["hashtags"].values
]
df1["user_mentions"] = [
    len(eval(mentions)) if not pd.isna(mentions) else 0
    for mentions in df1["user_mentions"].values
]

In [25]:
df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date
0,1526234562359066624,الفائز بالمركز الثالث في #هاكثون_الادخار #وزين...,2022-05-16T16:14:10.000Z,947365777,1,2,2,1,8,0,0,False,بنك التنمية الاجتماعية,SDB_sa,Kingdom of Saudi Arabia,بنك التنمية الاجتماعية، أحد الركائز الحكومية ا...,409379,12,2012-11-14T08:50:59.000Z
1,1488090545981378568,استقبل صاحب السمو الملكي الأمير سعود بن نايف\n...,2022-01-31T10:03:28.000Z,2321495687,2,1,27,5,32,0,0,False,إمارة المنطقة الشرقية,emara_sharqia,,الحساب الرسمي لـ #إمارة_المنطقة_الشرقية - المم...,477714,7,2014-02-03T08:49:37.000Z


### Clean Text


In [26]:
# def clean_tweet_text(text):
#     """Process tweet text by removing links, mentions, and hashtags symbol."""
#     # links
#     clean_text = re.sub(r"http\S+|t\.co/\S+", "", text)
#     # mentions
#     clean_text = re.sub(r"@\w+", "", clean_text)
#     # hashtags
#     clean_text = re.sub(r"#", "", clean_text)
#     clean_text = re.sub(r"_", " ", clean_text)
#     # tashqeel - from @bakriano
#     clean_text = re.sub(r"[\u0617-\u061A\u064B-\u0652]", "", clean_text)
#     # emojis
#     clean_text = emoji.replace_emoji(clean_text, replace="")
#     # remove new lines and normalize white spaces
#     clean_text = re.sub(r"\s+", " ", clean_text)
#     return clean_text.replace("RT :", "").strip()

In [27]:
arabert_prep = ArabertPreprocessor(model_name=processing_model)



In [28]:
def process_text(text):
    """Process tweet text by removing links, mentions, and hashtags symbol."""
    clean_text = arabert_prep.preprocess(text)
    clean_text = arabert_prep.unpreprocess(clean_text)
    clean_text = clean_text.replace("[رابط]", "")
    clean_text = clean_text.replace("[مستخدم]", "")
    clean_text = clean_text.replace("RT", "")
    clean_text = clean_text.replace(":", "")
    return clean_text.strip()

In [29]:
df1["text"] = df1.tweet_text.apply(process_text)
df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,text
0,1526234562359066624,الفائز بالمركز الثالث في #هاكثون_الادخار #وزين...,2022-05-16T16:14:10.000Z,947365777,1,2,2,1,8,0,0,False,بنك التنمية الاجتماعية,SDB_sa,Kingdom of Saudi Arabia,بنك التنمية الاجتماعية، أحد الركائز الحكومية ا...,409379,12,2012-11-14T08:50:59.000Z,الفائز بالمركز الثالث في# هاكثون_الادخار# وزين...
1,1488090545981378568,استقبل صاحب السمو الملكي الأمير سعود بن نايف\n...,2022-01-31T10:03:28.000Z,2321495687,2,1,27,5,32,0,0,False,إمارة المنطقة الشرقية,emara_sharqia,,الحساب الرسمي لـ #إمارة_المنطقة_الشرقية - المم...,477714,7,2014-02-03T08:49:37.000Z,استقبل صاحب السمو الملكي الأمير سعود بن نايف# ...


In [30]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2012 entries, 0 to 2011
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   2012 non-null   int64 
 1   tweet_text                2012 non-null   object
 2   tweet_time                2012 non-null   object
 3   userid                    2012 non-null   int64 
 4   urls                      2012 non-null   int64 
 5   hashtags                  2012 non-null   int64 
 6   retweet_count             2012 non-null   int64 
 7   reply_count               2012 non-null   int64 
 8   like_count                2012 non-null   int64 
 9   quote_count               2012 non-null   int64 
 10  user_mentions             2012 non-null   int64 
 11  is_retweet                2012 non-null   bool  
 12  user_display_name         2012 non-null   object
 13  user_screen_name          2012 non-null   object
 14  user_reported_location  

In [31]:
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

In [32]:
df1 = df1[prop_df.columns]
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2012 entries, 0 to 2011
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   2012 non-null   int64 
 1   userid                    2012 non-null   int64 
 2   user_display_name         2012 non-null   object
 3   user_screen_name          2012 non-null   object
 4   user_reported_location    1846 non-null   object
 5   user_profile_description  2012 non-null   object
 6   follower_count            2012 non-null   int64 
 7   following_count           2012 non-null   int64 
 8   account_creation_date     2012 non-null   object
 9   tweet_text                2012 non-null   object
 10  is_retweet                2012 non-null   bool  
 11  quote_count               2012 non-null   int64 
 12  reply_count               2012 non-null   int64 
 13  like_count                2012 non-null   int64 
 14  retweet_count           

In [33]:
df1.to_json(gen_processed_data_path)