# Data Exploration


Read the genuine users' tweet and profile data, then fix the column names and values so they match the format of the processed propaganda data.


## Read Data


In [1]:
import ast
import glob
import re

import emoji
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# We have multiple raw files per topic (gov - jour - news - univ).
# Change the target topic in the following variable and rerun the notebook.

topic = "univ"
gen_raw_tweets_paths = f"../data/raw/collected_gen/{topic}/*Tweet*.csv"
gen_raw_users_paths = f"../data/raw/collected_gen/{topic}/*User*.csv"

# For the sports and banking topics, uncomment the following lines instead.

# topic = "banking"
# gen_raw_tweets_paths = f"../data/raw/{topic}_tweets.csv"
# gen_raw_users_paths = f"../data/raw/{topic}_users.csv"

# Already-processed propaganda data; used below as the reference format.
prop_processed_data_path = "../data/processed/propaganda.json"
# Output file for the processed genuine tweets of the selected topic.
gen_processed_data_path = f"../data/processed/genuine_{topic}.json"

In [3]:
# Load the processed propaganda data. Its column layout is the target format:
# the genuine data is reshaped to match it so both can be concatenated later.

prop_df = pd.read_json(prop_processed_data_path)
prop_df.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,is_retweet,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,text
0,1161436140945195008,392352672,نجديه عذيه .,Ksa_FO1,نجد العذيه للدعم,مابادل اقل من ٥ الاف ولا أقبل المقدم بدون طلب ...,70616,52577,1318723200000,RT @GROB_07: #حـسـآب_تـمــيزة_بـتـفـاعـل 💫\n⠀┈...,True,0,0,0,0,3,0,2,حـسـآب تـمــيزة بـتـفـاعـل ⠀┈┉━◈♔♚♔◈━┅┄ نجــــ...
1,764273648496742400,4708503082,ْ‏ོ جود العبداللّهہ|| ོالتنبيهات ما توصل,j06__,,‏‏‏‏‏‏‏‏‏‏‏إنه يستجيب منذُ أن دعوُت ولگِن يُدب...,60061,2617,1451865600000,اللهم اكثر من امثالي ليعم الخير و تعم السعادة ...,False,0,0,0,0,1,0,0,اللهم اكثر من امثالي ليعم الخير و تعم السعادة ...


In [4]:
# Inspect the reference schema (column names and dtypes) we need to reproduce.
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

In [5]:
# Collect the raw tweet CSVs that match the glob pattern of the selected topic.
paths = glob.glob(gen_raw_tweets_paths)
paths

['../data/raw/collected_gen/univ\\UniverstiestTweetFigure.csv',
 '../data/raw/collected_gen/univ\\UniverstiestTweetGeneral.csv',
 '../data/raw/collected_gen/univ\\UniverstiestTweetSalah.csv',
 '../data/raw/collected_gen/univ\\UniverstiestTweetSocial.csv',
 '../data/raw/collected_gen/univ\\UnivTweetSport.csv']

In [6]:
# Read every raw tweets CSV into its own frame.
# NOTE(review): with default dtype inference the ID columns (`id`, `author_id`,
# `conversation_id`) come out as float64 (see the `df1.info()` output below),
# which cannot hold 19-digit IDs exactly. Consider reading them as strings or
# nullable Int64 - TODO confirm the raw files parse cleanly that way and that
# the later comparisons against `df2.id` are adjusted accordingly.
dfs = [pd.read_csv(f) for f in paths]

In [7]:
# Stack the per-file frames into one tweets frame with a fresh 0..n-1 index.
df1 = pd.concat(dfs, ignore_index=True)
df1.head(2)

Unnamed: 0,author_id,possibly_sensitive,created_at,source,conversation_id,id,text,lang,entities.hashtags,entities.annotations,...,public_metrics.like_count,public_metrics.quote_count,referenced_tweets,in_reply_to_user_id,entities.mentions,users,tweets,places,geo.place_id,attachments.poll_ids
0,1.171063e+18,False,2021-10-24T12:26:07.000Z,Twitter for iPhone,1.45225e+18,1.45225e+18,سمو رئيس اللجنة الأولمبية السعودية الأمير عبد...,ar,"[{'start': 244, 'end': 254, 'tag': 'واس_رياضي'}]","[{'start': 43, 'end': 66, 'probability': 0.743...",...,4.0,0.0,,,,,,,,
1,1.171056e+18,False,2021-10-02T21:59:59.000Z,Twitter Web App,1.444422e+18,1.444422e+18,تركي آل الشيخ يوقّع روايته الأولى في #معرض_الر...,ar,"[{'start': 37, 'end': 63, 'tag': 'معرض_الرياض_...","[{'start': 0, 'end': 12, 'probability': 0.8053...",...,177.0,54.0,,,,,,,,


In [8]:
# Check column names, null counts and dtypes of the raw tweets.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9792 entries, 0 to 9791
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   author_id                     9787 non-null   float64
 1   possibly_sensitive            9787 non-null   object 
 2   created_at                    9787 non-null   object 
 3   source                        9787 non-null   object 
 4   conversation_id               9787 non-null   float64
 5   id                            9787 non-null   float64
 6   text                          9787 non-null   object 
 7   lang                          9787 non-null   object 
 8   entities.hashtags             9449 non-null   object 
 9   entities.annotations          2989 non-null   object 
 10  entities.urls                 8689 non-null   object 
 11  attachments.media_keys        7556 non-null   object 
 12  public_metrics.retweet_count  9787 non-null   float64
 13  pub

## Process Column Names


In [9]:
# Drop rows without tweet text; the text-cleaning step below needs a string.
df1 = df1.dropna(subset=["text"])

In [10]:
def _is_retweet(ref_tweet):
    """Return True when a raw `referenced_tweets` cell marks a retweet.

    The cell is expected to hold the string form of a list whose first element
    is a dict with a "type" key. Missing values, non-string values, all-digit
    strings and anything that cannot be parsed are treated as "not a retweet".
    """
    # NaN is a float, so the isinstance check also covers missing values.
    if not isinstance(ref_tweet, str) or ref_tweet.isdigit():
        return False
    try:
        # literal_eval only accepts Python literals, so content coming from the
        # raw CSV cannot execute arbitrary code the way eval() would.
        parsed = ast.literal_eval(ref_tweet)
    except (ValueError, SyntaxError, TypeError):
        return False
    if not isinstance(parsed, (list, tuple)) or not parsed:
        return False
    first_ref = parsed[0]
    return isinstance(first_ref, dict) and first_ref.get("type") == "retweeted"


df1["is_retweet"] = [_is_retweet(ref) for ref in df1["referenced_tweets"].values]

In [11]:
# Raw column name -> name used by the propaganda data. Only the columns listed
# here are kept; "is_retweet" maps to itself so that it survives the selection.
col_rename = {
    "id": "tweetid",
    "text": "tweet_text",
    "created_at": "tweet_time",
    "author_id": "userid",
    "entities.urls": "urls",
    "entities.hashtags": "hashtags",
    "public_metrics.retweet_count": "retweet_count",
    "public_metrics.reply_count": "reply_count",
    "public_metrics.like_count": "like_count",
    "public_metrics.quote_count": "quote_count",
    "entities.mentions": "user_mentions",
    "is_retweet": "is_retweet",
}

# rename() returns a new frame, so nothing is mutated in place. The explicit
# list avoids indexing with a dict view, and .copy() makes df1 an independent
# frame so that adding columns in later cells does not trigger pandas'
# SettingWithCopyWarning.
df1 = df1.rename(columns=col_rename)[list(col_rename.values())].copy()
df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet
0,1.45225e+18,سمو رئيس اللجنة الأولمبية السعودية الأمير عبد...,2021-10-24T12:26:07.000Z,1.171063e+18,"[{'start': 255, 'end': 278, 'url': 'https://t....","[{'start': 244, 'end': 254, 'tag': 'واس_رياضي'}]",4.0,1.0,4.0,0.0,,False
1,1.444422e+18,تركي آل الشيخ يوقّع روايته الأولى في #معرض_الر...,2021-10-02T21:59:59.000Z,1.171056e+18,"[{'start': 65, 'end': 88, 'url': 'https://t.co...","[{'start': 37, 'end': 63, 'tag': 'معرض_الرياض_...",71.0,174.0,177.0,54.0,,False


In [12]:
# Verify that only the renamed columns remain and check their dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9787 entries, 0 to 9791
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tweetid        9787 non-null   float64
 1   tweet_text     9787 non-null   object 
 2   tweet_time     9787 non-null   object 
 3   userid         9787 non-null   float64
 4   urls           8689 non-null   object 
 5   hashtags       9449 non-null   object 
 6   retweet_count  9787 non-null   float64
 7   reply_count    9787 non-null   float64
 8   like_count     9787 non-null   float64
 9   quote_count    9787 non-null   float64
 10  user_mentions  1233 non-null   object 
 11  is_retweet     9787 non-null   bool   
dtypes: bool(1), float64(6), object(5)
memory usage: 927.1+ KB


In [13]:
# Compare with the reference schema to see which columns are still missing.
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

## Read Users' Data


In [14]:
# Collect the raw users CSVs for the selected topic.
# Note: this rebinds `paths`, which previously held the tweet file paths.
paths = glob.glob(gen_raw_users_paths)
paths

['../data/raw/collected_gen/univ\\UniverstiestUsersFigure.csv',
 '../data/raw/collected_gen/univ\\UniverstiestUsersGeneral.csv',
 '../data/raw/collected_gen/univ\\UniverstiestUsersSalah.csv',
 '../data/raw/collected_gen/univ\\UniverstiestUsersSocial.csv',
 '../data/raw/collected_gen/univ\\UniverstiestUsersSport.csv']

In [15]:
# Read every raw users CSV into its own frame (rebinds `dfs`).
# lineterminator="\n" makes the parser split records on "\n" only - presumably
# because free-text fields contain other line-break characters; TODO confirm.
dfs = [pd.read_csv(f, lineterminator="\n") for f in paths]

In [16]:
# Stack the per-file user frames into a single users frame.
df2 = pd.concat(dfs, ignore_index=True)
df2.head(2)

Unnamed: 0,description,name,profile_image_url,url,id,username,created_at,protected,verified,public_metrics.followers_count,public_metrics.following_count,public_metrics.tweet_count,public_metrics.listed_count,entities.url.urls,location,entities.description.urls,entities.description.mentions,pinned_tweet_id,entities.description.hashtags
0,حساب يهتم بأخبار الرياضة المحلية والعالمية.,واس الرياضي,https://pbs.twimg.com/profile_images/131274593...,https://t.co/71DZXnWjwI,1171062925488001025,SPA_Spor,2019-09-09T14:09:06.000Z,False,True,158216,8,18252,493,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...",,,,,
1,حساب يهتم بالشأن العام على المستويين المحلي وا...,واس العام,https://pbs.twimg.com/profile_images/151794435...,https://t.co/zMWBLahSfG,1171056150999158791,SPAregions,2019-09-09T13:42:06.000Z,False,True,2165525,8,52420,2031,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...",الرياض,,,,


In [17]:
# Inspect column names, null counts and dtypes of the users frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   description                     1204 non-null   object 
 1   name                            1212 non-null   object 
 2   profile_image_url               1212 non-null   object 
 3   url                             1156 non-null   object 
 4   id                              1212 non-null   int64  
 5   username                        1212 non-null   object 
 6   created_at                      1212 non-null   object 
 7   protected                       1212 non-null   bool   
 8   verified                        1212 non-null   bool   
 9   public_metrics.followers_count  1212 non-null   int64  
 10  public_metrics.following_count  1212 non-null   int64  
 11  public_metrics.tweet_count      1212 non-null   int64  
 12  public_metrics.listed_count     12

In [18]:
# Look at the reference rows again to see which user fields are needed.
prop_df.head(2)

Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,tweet_text,is_retweet,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,text
0,1161436140945195008,392352672,نجديه عذيه .,Ksa_FO1,نجد العذيه للدعم,مابادل اقل من ٥ الاف ولا أقبل المقدم بدون طلب ...,70616,52577,1318723200000,RT @GROB_07: #حـسـآب_تـمــيزة_بـتـفـاعـل 💫\n⠀┈...,True,0,0,0,0,3,0,2,حـسـآب تـمــيزة بـتـفـاعـل ⠀┈┉━◈♔♚♔◈━┅┄ نجــــ...
1,764273648496742400,4708503082,ْ‏ོ جود العبداللّهہ|| ོالتنبيهات ما توصل,j06__,,‏‏‏‏‏‏‏‏‏‏‏إنه يستجيب منذُ أن دعوُت ولگِن يُدب...,60061,2617,1451865600000,اللهم اكثر من امثالي ليعم الخير و تعم السعادة ...,False,0,0,0,0,1,0,0,اللهم اكثر من امثالي ليعم الخير و تعم السعادة ...


In [19]:
# Reference schema once more, for comparison with the users frame above.
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

Keep only the users who already have tweets stored in the tweets data frame (`df1`).


In [20]:
# Keep only the user rows whose id appears as a tweet author in df1.
# NOTE(review): per the info() outputs, `df2.id` is int64 while `df1.userid`
# is float64, so this match is presumably done on rounded float values and
# distinct IDs could collide - TODO confirm and compare on exact IDs instead.
df2 = df2[df2.id.isin(df1["userid"])]
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 664 entries, 0 to 1208
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   description                     664 non-null    object 
 1   name                            664 non-null    object 
 2   profile_image_url               664 non-null    object 
 3   url                             664 non-null    object 
 4   id                              664 non-null    int64  
 5   username                        664 non-null    object 
 6   created_at                      664 non-null    object 
 7   protected                       664 non-null    bool   
 8   verified                        664 non-null    bool   
 9   public_metrics.followers_count  664 non-null    int64  
 10  public_metrics.following_count  664 non-null    int64  
 11  public_metrics.tweet_count      664 non-null    int64  
 12  public_metrics.listed_count     664 non-

In [21]:
# Spot-check the filtered users frame.
df2.head(2)

Unnamed: 0,description,name,profile_image_url,url,id,username,created_at,protected,verified,public_metrics.followers_count,public_metrics.following_count,public_metrics.tweet_count,public_metrics.listed_count,entities.url.urls,location,entities.description.urls,entities.description.mentions,pinned_tweet_id,entities.description.hashtags
0,حساب يهتم بأخبار الرياضة المحلية والعالمية.,واس الرياضي,https://pbs.twimg.com/profile_images/131274593...,https://t.co/71DZXnWjwI,1171062925488001025,SPA_Spor,2019-09-09T14:09:06.000Z,False,True,158216,8,18252,493,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...",,,,,
1,حساب يهتم بالشأن العام على المستويين المحلي وا...,واس العام,https://pbs.twimg.com/profile_images/151794435...,https://t.co/zMWBLahSfG,1171056150999158791,SPAregions,2019-09-09T13:42:06.000Z,False,True,2165525,8,52420,2031,"[{'start': 0, 'end': 23, 'url': 'https://t.co/...",الرياض,,,,


## Align Data Frames


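Create the user-profile columns that the propaganda data has but the tweets frame lacks, filling them from the users frame (`df2`).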


In [22]:
# New column in the tweets frame (df1) -> source column in the users frame (df2).
user_columns = {
    "user_display_name": "name",
    "user_screen_name": "username",
    "user_reported_location": "location",
    "user_profile_description": "description",
    "follower_count": "public_metrics.followers_count",
    "following_count": "public_metrics.following_count",
    "account_creation_date": "created_at",
}

# Build a one-row-per-user lookup. The key is cast to the dtype of df1.userid
# so both sides are compared the same way an element-wise `df2.id == userid`
# would compare them; when several user rows share a key, the first one wins.
user_key_dtype = df1["userid"].dtype
user_lookup = (
    df2.assign(_user_key=df2["id"].astype(user_key_dtype))
    .drop_duplicates(subset="_user_key")
    .set_index("_user_key")
)

# One keyed lookup for all tweets instead of filtering df2 once per tweet.
# Tweets whose author is not in df2 get missing values in every user column.
matched_users = user_lookup.reindex(df1["userid"])

for target_col, source_col in user_columns.items():
    # to_numpy() discards the lookup's index so values are assigned by position.
    df1[target_col] = matched_users[source_col].to_numpy()

df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date
0,1.45225e+18,سمو رئيس اللجنة الأولمبية السعودية الأمير عبد...,2021-10-24T12:26:07.000Z,1.171063e+18,"[{'start': 255, 'end': 278, 'url': 'https://t....","[{'start': 244, 'end': 254, 'tag': 'واس_رياضي'}]",4.0,1.0,4.0,0.0,,False,واس الرياضي,SPA_Spor,,حساب يهتم بأخبار الرياضة المحلية والعالمية.,158216,8,2019-09-09T14:09:06.000Z
1,1.444422e+18,تركي آل الشيخ يوقّع روايته الأولى في #معرض_الر...,2021-10-02T21:59:59.000Z,1.171056e+18,"[{'start': 65, 'end': 88, 'url': 'https://t.co...","[{'start': 37, 'end': 63, 'tag': 'معرض_الرياض_...",71.0,174.0,177.0,54.0,,False,واس العام,SPAregions,الرياض,حساب يهتم بالشأن العام على المستويين المحلي وا...,2165525,8,2019-09-09T13:42:06.000Z


In [23]:
# Confirm the user columns were added and check their null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9787 entries, 0 to 9791
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   9787 non-null   float64
 1   tweet_text                9787 non-null   object 
 2   tweet_time                9787 non-null   object 
 3   userid                    9787 non-null   float64
 4   urls                      8689 non-null   object 
 5   hashtags                  9449 non-null   object 
 6   retweet_count             9787 non-null   float64
 7   reply_count               9787 non-null   float64
 8   like_count                9787 non-null   float64
 9   quote_count               9787 non-null   float64
 10  user_mentions             1233 non-null   object 
 11  is_retweet                9787 non-null   bool   
 12  user_display_name         9787 non-null   object 
 13  user_screen_name          9787 non-null   object 
 14  user_reported

In [24]:
def _count_entities(raw):
    """Return the number of entities stored in a raw `urls`/`hashtags`/`user_mentions` cell.

    A cell is expected to hold the string form of a list (one item per entity).
    Missing values, all-digit strings and unparsable content count as 0. An
    integer is returned unchanged so that re-running this cell is harmless.
    """
    if isinstance(raw, (int, np.integer)) and not isinstance(raw, bool):
        # Already converted to a count by a previous run of this cell.
        return int(raw)
    # NaN is a float, so the isinstance check also covers missing values.
    if not isinstance(raw, str) or raw.isdigit():
        return 0
    try:
        # literal_eval only accepts Python literals, so content coming from the
        # raw CSV cannot execute arbitrary code the way eval() would.
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return 0
    return len(parsed) if isinstance(parsed, (list, tuple)) else 0


# Apply the same rules to all three entity columns (previously only `urls`
# was guarded against all-digit cells).
for entity_col in ("urls", "hashtags", "user_mentions"):
    df1[entity_col] = [_count_entities(raw) for raw in df1[entity_col].values]

In [25]:
# The entity columns should now hold integer counts instead of raw strings.
df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date
0,1.45225e+18,سمو رئيس اللجنة الأولمبية السعودية الأمير عبد...,2021-10-24T12:26:07.000Z,1.171063e+18,4,1,4.0,1.0,4.0,0.0,0,False,واس الرياضي,SPA_Spor,,حساب يهتم بأخبار الرياضة المحلية والعالمية.,158216,8,2019-09-09T14:09:06.000Z
1,1.444422e+18,تركي آل الشيخ يوقّع روايته الأولى في #معرض_الر...,2021-10-02T21:59:59.000Z,1.171056e+18,3,3,71.0,174.0,177.0,54.0,0,False,واس العام,SPAregions,الرياض,حساب يهتم بالشأن العام على المستويين المحلي وا...,2165525,8,2019-09-09T13:42:06.000Z


### Clean Text


In [26]:
def clean_tweet_text(text):
    """Return `text` with tweet markup removed.

    Steps, in order: drop links, drop @mentions, drop the `#` symbol and turn
    underscores into spaces, drop Arabic diacritics (tashkeel), drop emojis,
    collapse every whitespace run (new lines included) into a single space and
    finally remove the "RT :" remnant that is left once a handle is stripped.
    """
    # (pattern, replacement) pairs applied one after another. Mentions must be
    # removed before underscores become spaces, since handles may contain "_".
    substitutions = (
        (r"http\S+|t\.co/\S+", ""),  # links
        (r"@\w+", ""),  # mentions
        (r"#", ""),  # hashtag symbol
        (r"_", " "),  # hashtag word separators
        (r"[\u0617-\u061A\u064B-\u0652]", ""),  # tashkeel - from @bakriano
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    without_emojis = emoji.replace_emoji(cleaned, replace="")
    # Collapse new lines and repeated white space into single spaces.
    single_spaced = re.sub(r"\s+", " ", without_emojis)
    return single_spaced.replace("RT :", "").strip()

In [27]:
# Store the cleaned text in a new `text` column; the raw `tweet_text` is kept.
df1["text"] = df1.tweet_text.apply(clean_tweet_text)
df1.head(2)

Unnamed: 0,tweetid,tweet_text,tweet_time,userid,urls,hashtags,retweet_count,reply_count,like_count,quote_count,user_mentions,is_retweet,user_display_name,user_screen_name,user_reported_location,user_profile_description,follower_count,following_count,account_creation_date,text
0,1.45225e+18,سمو رئيس اللجنة الأولمبية السعودية الأمير عبد...,2021-10-24T12:26:07.000Z,1.171063e+18,4,1,4.0,1.0,4.0,0.0,0,False,واس الرياضي,SPA_Spor,,حساب يهتم بأخبار الرياضة المحلية والعالمية.,158216,8,2019-09-09T14:09:06.000Z,سمو رئيس اللجنة الأولمبية السعودية الأمير عبدا...
1,1.444422e+18,تركي آل الشيخ يوقّع روايته الأولى في #معرض_الر...,2021-10-02T21:59:59.000Z,1.171056e+18,3,3,71.0,174.0,177.0,54.0,0,False,واس العام,SPAregions,الرياض,حساب يهتم بالشأن العام على المستويين المحلي وا...,2165525,8,2019-09-09T13:42:06.000Z,تركي آل الشيخ يوقع روايته الأولى في معرض الريا...


In [28]:
# Check df1 once more before aligning its columns with the reference frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9787 entries, 0 to 9791
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   9787 non-null   float64
 1   tweet_text                9787 non-null   object 
 2   tweet_time                9787 non-null   object 
 3   userid                    9787 non-null   float64
 4   urls                      9787 non-null   int64  
 5   hashtags                  9787 non-null   int64  
 6   retweet_count             9787 non-null   float64
 7   reply_count               9787 non-null   float64
 8   like_count                9787 non-null   float64
 9   quote_count               9787 non-null   float64
 10  user_mentions             9787 non-null   int64  
 11  is_retweet                9787 non-null   bool   
 12  user_display_name         9787 non-null   object 
 13  user_screen_name          9787 non-null   object 
 14  user_reported

In [29]:
# Reference schema used for the column selection in the next cell.
prop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56000 entries, 0 to 55999
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   tweetid                   56000 non-null  int64 
 1   userid                    56000 non-null  object
 2   user_display_name         56000 non-null  object
 3   user_screen_name          56000 non-null  object
 4   user_reported_location    36366 non-null  object
 5   user_profile_description  51749 non-null  object
 6   follower_count            56000 non-null  int64 
 7   following_count           56000 non-null  int64 
 8   account_creation_date     56000 non-null  int64 
 9   tweet_text                56000 non-null  object
 10  is_retweet                56000 non-null  bool  
 11  quote_count               56000 non-null  int64 
 12  reply_count               56000 non-null  int64 
 13  like_count                56000 non-null  int64 
 14  retweet_count             5

In [30]:
# Keep exactly the reference columns, in the reference order; this drops the
# extra `tweet_time` column, which prop_df does not have.
# NOTE(review): the dtypes shown by info() still differ from prop_df
# (e.g. tweetid is float64 vs int64, account_creation_date is object vs int64,
# the count columns are float64 vs int64) - TODO decide whether to cast them
# before the frames are concatenated.
df1 = df1[prop_df.columns]
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9787 entries, 0 to 9791
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   9787 non-null   float64
 1   userid                    9787 non-null   float64
 2   user_display_name         9787 non-null   object 
 3   user_screen_name          9787 non-null   object 
 4   user_reported_location    8769 non-null   object 
 5   user_profile_description  9787 non-null   object 
 6   follower_count            9787 non-null   int64  
 7   following_count           9787 non-null   int64  
 8   account_creation_date     9787 non-null   object 
 9   tweet_text                9787 non-null   object 
 10  is_retweet                9787 non-null   bool   
 11  quote_count               9787 non-null   float64
 12  reply_count               9787 non-null   float64
 13  like_count                9787 non-null   float64
 14  retweet_count

In [31]:
# Write the aligned genuine tweets to the per-topic processed JSON file.
df1.to_json(gen_processed_data_path)