# Code to generate a dataset containing Twitter users' and tweets' information

In [1]:
import json
import pandas as pd
import glob
import random
import os

In [None]:
# Root folder of the downloaded Twitter stream data. It is left empty here and
# has to be set before running the notebook.
PATH_DIR = '' # directory where twitter stream data is located
# Make it the working directory so relative file names used later
# (e.g. "chosen_files") resolve inside it.
os.chdir(PATH_DIR)

## Preparing the dataset

In [2]:
# Merge multiple json files into one list
def merge_JsonFiles(files):
    """Load the JSON objects stored in several files into one flat list.

    Every file is read line by line. A line is parsed only when it starts
    with '{'; any other line (blank lines, stray text) is skipped.

    Args:
        files: iterable of paths to the files to read.

    Returns:
        A list with one decoded object per accepted line, in the order the
        files were given and, within a file, in line order.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a line starting with '{' is not valid JSON.
    """
    records = []
    for path in files:
        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding, which is not UTF-8 on every system.
        with open(path, 'r', encoding='utf-8') as f:
            # Iterate the file object directly so only one line is held in
            # memory at a time (readlines() loads the whole file first).
            for line in f:
                if line.startswith('{'):
                    records.append(json.loads(line))
    return records

In [2]:
# Run this code to merge json files located in the same folder

# The pattern has no recursive wildcard, so only files directly inside PATH_DIR match.
json_files = glob.glob(PATH_DIR + "/*.json") # get all the json files inside the folder

In [3]:
# Run this code to merge a random set of json files located in different folders

days = ['02', '13', '21']
PATH_FILES = [PATH_DIR+'/202211'+x+'/' for x in days] # chosen folders

json_files = []
files_per_folder = 40 # number of files to randomly choose from each folder

for x in PATH_FILES:
    # glob returns matches in arbitrary order; sorting gives a stable listing so
    # the picked files (and their order) do not depend on the file system.
    files = sorted(glob.glob(x + "/*.json")) # get all the json files inside the folder
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more files than the folder actually holds.
    n_files = min(files_per_folder, len(files))
    random_idx = sorted(random.sample(range(len(files)), n_files)) # get index for each randomly chosen file
    json_files.extend(files[y] for y in random_idx)

# Save the chosen files directories for future use
with open("chosen_files", "w") as f:
    json.dump(json_files, f)

In [None]:
# Reload the list of file paths written by an earlier random selection
with open("chosen_files", "r") as f:
    json_files = json.loads(f.read())

In [4]:
# NOTE: despite its name, df is a plain Python list of decoded JSON objects, not a DataFrame.
df = merge_JsonFiles(json_files) # merge the json files into one list

In [5]:
# Report how many JSON objects were loaded from the selected files
print("Number of tweets extracted:", len(df))

Number of tweets extracted: 345594


In [6]:
# We reverse the order of the tweets so they go from newest to oldest.
# This way, we avoid having to update the tweet and user's attributes, since we already have the latest info.
# NOTE(review): this assumes df is in chronological order (oldest first), which
# depends on the order of json_files — confirm.
# list.reverse() reorders the list in place and returns None.
df.reverse()

# Creating the datasets

## Users and tweets datasets

In [7]:
# Run this code if you want to get both users and tweets information

# Fields copied from the 'user' object embedded in each tweet
attr_user = [
    'name', 'screen_name', 'location', 'protected', 'verified',
    'followers_count', 'friends_count', 'listed_count', 'favourites_count',
    'statuses_count', 'created_at', 'geo_enabled', 'default_profile',
    'default_profile_image',
]

# Fields copied from the tweet object itself
attr_tweet = [
    'created_at', 'in_reply_to_status_id', 'in_reply_to_user_id',
    'quote_count', 'reply_count', 'retweet_count', 'favorite_count',
    'entities',
]

tweets = {}  # tweet id -> saved tweet attributes
users = {}   # user id -> saved user attributes plus the list of their tweet ids


def save_user_info(df_2):
    """Register the author of tweet ``df_2`` and link the tweet to them.

    On first sight of a user, the fields listed in ``attr_user`` are copied
    from ``df_2['user']`` into the global ``users`` dict. The tweet id is
    always appended to that user's 'tweets' list.
    """
    author = df_2['user']
    author_id = author['id']
    status_id = df_2['id']

    if author_id not in users:
        profile = {field: author[field] for field in attr_user}
        profile['tweets'] = []
        users[author_id] = profile

    users[author_id]['tweets'].append(status_id)


def save_tweet_info(df_2):
    """Store tweet ``df_2`` in the global ``tweets`` dict and classify it.

    A tweet seen for the first time gets its author id, text and the fields
    listed in ``attr_tweet`` saved, and its author is registered through
    ``save_user_info``. The tweet is then tagged 'RT', 'QRT' or 'tweet';
    an embedded retweeted/quoted tweet whose lang is 'en' is saved recursively.
    """
    author_id = df_2['user']['id']
    status_id = df_2['id']

    if status_id not in tweets:
        # Truncated tweets carry their complete text under 'extended_tweet'
        if df_2['truncated'] == True:
            body = df_2['extended_tweet']['full_text']
        else:
            body = df_2['text']

        record = {'user_id': author_id, 'text': body}
        record.update((field, df_2[field]) for field in attr_tweet)
        tweets[status_id] = record

        save_user_info(df_2)

    # A retweet takes precedence over a quote when both keys are present
    for nested_key, label in (('retweeted_status', 'RT'), ('quoted_status', 'QRT')):
        if nested_key in df_2:
            tweets[status_id]['type'] = label
            nested = df_2[nested_key]
            if nested['lang'] == 'en':
                save_tweet_info(nested)
            return

    tweets[status_id]['type'] = 'tweet'


for i, entry in enumerate(df):
    # Only top-level tweets whose lang is 'en' (or the literal string 'None')
    # are processed; embedded retweeted/quoted tweets are handled inside
    # save_tweet_info.
    # NOTE(review): 'None' is a string; a JSON null lang is decoded as Python
    # None and does not match it — confirm the intent.
    if entry['lang'] in ['en', 'None']:
        df_2 = entry
        save_tweet_info(df_2)

In [8]:
# Report the size of both collections built above
print("Number of users saved:", len(users))
print("Number of tweets saved:", len(tweets))

Number of users saved: 145015
Number of tweets saved: 169675


## Users dataset

In [16]:
# Run this code if you only want to get the users information

# Attributes to get from each user
attributes = ['name', 'screen_name', 'location', 'protected', 'verified', 'followers_count',
              'friends_count', 'listed_count', 'favourites_count', 'statuses_count', 'created_at',
              'geo_enabled', 'default_profile', 'default_profile_image']

# Save the chosen attributes for each user
def save_user_info(df_2):
    """Register the author of tweet ``df_2`` in the global ``users`` dict.

    On first sight of a user, the fields listed in ``attributes`` are copied
    from ``df_2['user']``. The tweet id is added to the user's 'tweets' list
    (once per tweet). If ``df_2`` embeds a retweeted or quoted tweet whose
    lang is 'en', the author of that embedded tweet is registered recursively.

    Args:
        df_2: decoded tweet object (dict) with at least 'id' and 'user' keys.
    """
    user_id = df_2['user']['id']
    tweet_id = df_2['id']

    if (user_id not in users): # check if user info is already saved
        # Read the profile from df_2 itself, not from the top-level tweet being
        # iterated: on a recursive call df_2 is the embedded tweet, and its
        # author must get their own attributes rather than the retweeter's.
        attr = {x: df_2['user'][x] for x in attributes}
        attr['tweets'] = []
        users[user_id] = attr

    # The same original tweet can be embedded in many retweets/quotes;
    # record its id only once per user.
    if tweet_id not in users[user_id]['tweets']:
        users[user_id]['tweets'].append(tweet_id) # save the ids for each tweet we get from the user

    # filter RT and QRT
    if ('retweeted_status' in df_2):
        if (df_2['retweeted_status']['lang'] == 'en'):
            df_3 = df_2['retweeted_status']
            save_user_info(df_3)
    elif ('quoted_status' in df_2):
        if (df_2['quoted_status']['lang'] == 'en'):
            df_3 = df_2['quoted_status']
            save_user_info(df_3)

users = {}

for i, entry in enumerate(df):
    # Only top-level tweets whose lang is 'en' (or the literal string 'None')
    # are processed; embedded retweeted/quoted tweets are followed inside
    # save_user_info.
    if entry['lang'] in ['en', 'None']:
        df_2 = entry
        save_user_info(df_2)

In [17]:
# Report how many distinct users were collected
print("Number of users saved:", len(users))

Number of users saved: 25769


## Tweets dataset

In [12]:
# Run this code if you only want to get the tweets information

# Fields copied from each tweet object
attributes = [
    'created_at', 'in_reply_to_status_id', 'in_reply_to_user_id',
    'quote_count', 'reply_count', 'retweet_count', 'favorite_count',
    'entities',
]

tweets = {}  # tweet id -> saved tweet attributes


def save_tweet_info(df_2):
    """Store tweet ``df_2`` in the global ``tweets`` dict and classify it.

    A tweet seen for the first time gets its author id, text and the fields
    listed in ``attributes`` saved. The tweet is then tagged 'RT', 'QRT' or
    'tweet'; an embedded retweeted/quoted tweet whose lang is 'en' is saved
    recursively.
    """
    owner_id = df_2['user']['id']
    key = df_2['id']

    if key not in tweets:
        # Truncated tweets carry their complete text under 'extended_tweet'
        if df_2['truncated'] == True:
            body = df_2['extended_tweet']['full_text']
        else:
            body = df_2['text']

        entry = {'user_id': owner_id, 'text': body}
        for name in attributes:
            entry[name] = df_2[name]
        tweets[key] = entry

    # Work out the tweet kind first; a retweet wins over a quote
    if 'retweeted_status' in df_2:
        kind, embedded_key = 'RT', 'retweeted_status'
    elif 'quoted_status' in df_2:
        kind, embedded_key = 'QRT', 'quoted_status'
    else:
        kind, embedded_key = 'tweet', None

    tweets[key]['type'] = kind

    # Follow the embedded tweet only when it is in English
    if embedded_key is not None:
        embedded = df_2[embedded_key]
        if embedded['lang'] == 'en':
            save_tweet_info(embedded)

for i, entry in enumerate(df):
    # Only top-level tweets whose lang is 'en' (or the literal string 'None')
    # are processed; embedded retweeted/quoted tweets are handled inside
    # save_tweet_info.
    if entry['lang'] in ['en', 'None']:
        df_2 = entry
        save_tweet_info(df_2)

In [13]:
# Report how many distinct tweets were collected
print("Number of tweets saved:", len(tweets))

Number of tweets saved: 27881


# Modifying and saving the datasets

In [9]:
# Users dataset: the dict is keyed by user id, so the frame is first built with
# one column per user and then transposed to get one row per user.
df_users = pd.DataFrame(users).transpose()
df_users.index.name = 'user_id'
df_users.head()

Unnamed: 0_level_0,name,screen_name,location,protected,verified,followers_count,friends_count,listed_count,favourites_count,statuses_count,created_at,geo_enabled,default_profile,default_profile_image,tweets
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1572625360704905219,دختر ایرانم,im___sweety,,False,False,395,471,0,8050,5578,Wed Sep 21 16:34:54 +0000 2022,False,True,False,[1594800537098420224]
1572993765588992001,خرچنگ موردعلاقه نامجون⁷,MmdNamjwn2009,ایران ازاد,False,False,447,438,1,1620,7126,Thu Sep 22 16:58:42 +0000 2022,False,True,False,[1594789854834991115]
942140043958812672,BLetke,B_Letke,"North Carolina, USA",False,False,19,27,0,795,737,Sat Dec 16 21:11:05 +0000 2017,False,True,False,[1594800537106620416]
1389360201996832771,il Donaldo Trumpo,PapiTrumpo,United States,False,False,590591,8124,875,41960,8397,Mon May 03 23:24:58 +0000 2021,False,True,False,"[1594792366904320012, 1591822794677002241, 159..."
1590236500872105984,Steven Stanley,StevenS75946994,,False,False,126,0,0,669,3512,Wed Nov 09 06:55:02 +0000 2022,False,True,False,[1594800537106808833]


In [10]:
# Tweets dataset: the dict is keyed by tweet id, so the frame is first built with
# one column per tweet and then transposed to get one row per tweet.
df_tweets = pd.DataFrame(tweets).transpose()
df_tweets.index.name = 'tweet_id'
df_tweets.head()

Unnamed: 0_level_0,user_id,text,created_at,in_reply_to_status_id,in_reply_to_user_id,quote_count,reply_count,retweet_count,favorite_count,entities,type
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1594800537098420224,1572625360704905219,RT @MmdNamjwn2009: @BabyMochiMom The green gas...,Mon Nov 21 21:10:52 +0000 2022,,,0,0,0,0,"{'hashtags': [{'text': 'javanroud', 'indices':...",RT
1594789854834991115,1572993765588992001,@BabyMochiMom The green gas is being used by I...,Mon Nov 21 20:28:25 +0000 2022,1.5947881594346903e+18,1.4627133354687693e+18,1,0,36,16,"{'hashtags': [{'text': 'javanroud', 'indices':...",tweet
1594800537106620416,942140043958812672,RT @PapiTrumpo: Who did this??😂😂😂 https://t.co...,Mon Nov 21 21:10:52 +0000 2022,,,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",RT
1594792366904320012,1389360201996832771,Who did this??😂😂😂 https://t.co/m2JCqnhlxL,Mon Nov 21 20:38:24 +0000 2022,,,49,154,298,1210,"{'hashtags': [], 'urls': [], 'user_mentions': ...",tweet
1594800537106808833,1590236500872105984,"Success has life. We're learning technologies,...",Mon Nov 21 21:10:52 +0000 2022,,,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",QRT


In [11]:
# Change boolean attributes to 0 and 1

# 'location' becomes a presence flag: 1 when a value is set, 0 when it is missing.
# The whole column is assigned at once instead of using chained assignment
# (df[col].loc[mask] = value), which writes through an intermediate object and
# is not guaranteed to update the DataFrame.
df_users['location'] = df_users['location'].notnull().astype(int)

for flag in ['protected', 'verified', 'geo_enabled', 'default_profile', 'default_profile_image']:
    df_users[flag] = df_users[flag].astype(int)

# Change string to integer of year
# The year is taken from character 26 onwards of strings shaped like
# 'Wed Sep 21 16:34:54 +0000 2022'.
df_users['created_at'] = df_users['created_at'].str.slice(start=26).astype(int)

In [12]:
# Save the entire datasets
PATH = 'Data/Original Dataset 11-2022'
users_file = 'users'
tweets_file = 'tweets'

# PATH is treated as a folder: build the file names with os.path.join so the
# workbooks land inside it (plain concatenation would fuse the folder and file
# names into 'Data/Original Dataset 11-2022users.xlsx').
os.makedirs(PATH, exist_ok=True)  # to_excel does not create missing folders

# NOTE(review): the ids are 19-digit integers; Excel stores numbers as doubles,
# so they may lose precision in the workbook — confirm whether they should be
# written as text.
df_users.to_excel(os.path.join(PATH, users_file+'.xlsx'), engine='xlsxwriter')
df_tweets.to_excel(os.path.join(PATH, tweets_file+'.xlsx'), engine='xlsxwriter')

In [13]:
# Split the users into consecutive chunks and save each chunk together with
# the tweets written by the users in it.
limit = 45000 # maximum number of users in each subset (tweets follow their users)

aux_users = []
for start in range(0, df_users.shape[0], limit):
    aux_users.append(df_users[start:start+limit])

# For every chunk of users, keep the tweets whose author is in that chunk
aux_tweets = [df_tweets[df_tweets['user_id'].isin(chunk.index)] for chunk in aux_users]

for i, (users_chunk, tweets_chunk) in enumerate(zip(aux_users, aux_tweets)):
    users_chunk.to_excel(users_file+'_'+str(i)+'.xlsx', engine='xlsxwriter')
    tweets_chunk.to_excel(tweets_file+'_'+str(i)+'.xlsx', engine='xlsxwriter')