In [6]:
import os, json
import pandas as pd

### Read source tweets

In [7]:
def fetch_source_df(topic, classtype):
    # Define the directory path
    directory_path = "phemernrdataset/pheme-rnr-dataset/"+topic+"/"+classtype+"/"
    
    #create an empty DataFrame
    source_Tweet_data = pd.DataFrame()
    user_id = []
    tweet_id = []
    tweet_text = []
    retweet_count = []
    favorite_count = []
    followers_count = []
    friends_count = []
    verified = []
    created_at = []
    # Get the list of folders in the directory
    folders = [folder for folder in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, folder))]
    
    # Get the first folder name
    first_folder_name = folders[0] if folders else None
    #print(len(folders))
    print("First folder name:", first_folder_name)
    index = 0
    for file_name in folders:
        new_path = directory_path + file_name+'/' + 'source-tweet/' + file_name + '.json'
        f = open(new_path)
        data = json.load(f)
        #print(data)
        user_id.append(data['user']['id_str'])
        tweet_id.append(data['id_str'])
        tweet_text.append(data['text'])
        retweet_count.append(data['retweet_count'])
        favorite_count.append(data['favorite_count'])
        followers_count.append(data['user']['followers_count'])
        friends_count.append(data['user']['friends_count'])
        verified.append(data['user']['verified'])
        created_at.append(data['created_at'])
    
    #create a df
    data = {'user_id': user_id,
            'tweet_id':tweet_id, 
            'tweet_text':tweet_text, 
            'retweet_count':retweet_count,
            'favorite_count':favorite_count,
            'followers_count':followers_count, 
            'friends_count':friends_count, 
            'verified':verified, 
            'created_at':created_at}
    
    df = pd.DataFrame(data)

    if(classtype=='rumours'):
        df['label'] = 'rumor'

    else:
        df['label'] = 'non-rumor'
    
    df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
    df['verified'] = df['verified'].replace({True: 1, False: 0})
    
    return df

### Read response tweets

In [8]:
def fetch_response_tweets(topic, classtype, res_type):
    response_user_id = []
    response_tweet_id = []
    in_reply_to_status_id = []
    in_reply_to_user_id = []
    response_tweet_text = []
    response_retweet_count = []
    response_favorite_count = []
    response_created_at = []
    main_thread_id = []
    response_directory_path = "phemernrdataset/pheme-rnr-dataset/"+topic+"/"+classtype+"/"
    # Get the list of folders in the directory
    folders = [folder for folder in os.listdir(response_directory_path) 
               if os.path.isdir(os.path.join(response_directory_path, folder))]
    
    #loop through all the folders in the directory
    for folder in folders:
        path = response_directory_path + folder+'/' + 'reactions/'
        json_files = sorted([pos_json for pos_json in os.listdir(path)
                 if pos_json.endswith('.json')])
        
        for index, js in enumerate(json_files):
    
            try:
                #open the file using the given directory
                with open(os.path.join(path, js)) as file:
                    #load data from json file
                    data = json.load(file)
    
                    #convert to string first before storing in array
                    main_thread_id.append(folder)
                    response_user_id.append(data['user']['id_str'])
                    response_tweet_id.append(data['id_str'])
                    in_reply_to_status_id.append(str(data['in_reply_to_status_id_str']))
                    in_reply_to_user_id.append(str(data['in_reply_to_user_id_str']))
                    response_tweet_text.append(data['text'])
                    response_retweet_count.append(data['retweet_count'])
                    response_favorite_count.append(data['favorite_count'])
                    response_created_at.append(data['created_at'])
    
            except Exception as inst:
                print(type(inst))    # the exception type
                print(inst.args)     # arguments stored in .args
                print(inst) 
    
    
    
    #create a df
    data = {'main_tweet_id': main_thread_id,
            'response_tweet_id':response_tweet_id, 
            'user_id': response_user_id,
            'in_reply_to_status_id':in_reply_to_status_id, 
            'in_reply_to_user_id':in_reply_to_user_id, 
            'response_tweet_text':response_tweet_text,
            'retweet_count': response_retweet_count,
            'favorite_count':response_favorite_count,
            'response_created_at':response_created_at}
    
    response_tweet_data = pd.DataFrame(data)
    response_tweet_data['label'] = res_type
    response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})
    return response_tweet_data

In [3]:
def concat_files_and_save(rumor_df, nr_df, topic, category):
    frame = [rumor_df, nr_df]
    df = pd.concat(frame)
    df.to_csv(f'{topic}_{category}.csv', index=False, na_rep='None')

In [139]:
def check_nulls(df):
    # check null rows
    null_mask = df.isnull().any(axis=1)
    null_rows = df[null_mask]
    return null_rows

In [281]:
def replace_vals(df):
    columns = ['in_reply_to_status_id', 'in_reply_to_user_id']
    df[columns] = df[columns].fillna('No ID') 
    return df

In [141]:
def to_str(df):
    rows = ['main_tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id','user_id','response_tweet_id']
    df[rows] = df[rows].astype(str)
    return df

In [321]:
concat_files_and_save(ottawa_st_rumors, ottawa_st_nr, 'ottawa', 'source') 
concat_files_and_save(charliehebdo_st_rumors, charliehebdo_st_nr, 'charliehebdo', 'source') 
concat_files_and_save(ferguson_st_rumors, ferguson_st_nr, 'ferguson', 'source') 
concat_files_and_save(germanwings_crash_st_rumors, germanwingscrash_st_nr, 'germanwings', 'source') 
concat_files_and_save(sydneysiege_st_rumors, sydneysiege_st_nr, 'sydneysiege', 'source') 

In [27]:
concat_files_and_save(ottawa_r_res, ottawa_res_nr, 'ottawa', 'response') 
concat_files_and_save(charliehebdo_r_res, charliehebdo_nr_res, 'charliehebdo', 'response') 
concat_files_and_save(ferguson_r_res, ferguson_nr_res, 'ferguson', 'response') 
concat_files_and_save(germanwings_crash_r_res, germanwings_nr_res, 'germanwings', 'response') 
concat_files_and_save(sydneysiege_r_res, sydneysiege_nr_res, 'sydneysiege', 'response') 

In [20]:
# Convert all columns to appropriate types
frame = [germanwings_crash_r_res, germanwings_nr_res]
df = pd.concat(frame)
df['main_tweet_id'] = df['main_tweet_id'].astype(str)
df['response_tweet_id'] = df['response_tweet_id'].astype(str)
df['user_id'] = df['user_id'].astype(str)
df['in_reply_to_status_id'] = df['in_reply_to_status_id'].astype(str)
df['in_reply_to_user_id'] = df['in_reply_to_user_id'].astype(str)
df['response_tweet_text'] = df['response_tweet_text'].astype(str)
df['retweet_count'] = df['retweet_count'].astype(int)
df['favorite_count'] = df['favorite_count'].astype(int)
df['response_created_at'] = df['response_created_at'].astype(str)
df['label'] = df['label'].astype(int)

# Check for null values
print("Null values before saving:")
print(df.isnull().sum())

Null values before saving:
main_tweet_id            0
response_tweet_id        0
user_id                  0
in_reply_to_status_id    0
in_reply_to_user_id      0
response_tweet_text      0
retweet_count            0
favorite_count           0
response_created_at      0
label                    0
dtype: int64


In [21]:
df.to_csv('german_reponses.csv', index=False, encoding='utf-8', na_rep='')
print('File_saved')

File_saved


In [26]:
print((df['in_reply_to_status_id'][df['in_reply_to_status_id']=='None']).dtype)

object


## Read Rumors

### Source Tweets

In [299]:
ottawa_st_rumors = fetch_source_df('ottawashooting', 'rumours')
print(ottawa_st_rumors.info())

First folder name: 524944399890124801
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          470 non-null    object
 1   tweet_id         470 non-null    object
 2   tweet_text       470 non-null    object
 3   retweet_count    470 non-null    int64 
 4   favorite_count   470 non-null    int64 
 5   followers_count  470 non-null    int64 
 6   friends_count    470 non-null    int64 
 7   verified         470 non-null    int64 
 8   created_at       470 non-null    object
 9   label            470 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 36.8+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [300]:
charliehebdo_st_rumors = fetch_source_df('charliehebdo', 'rumours')
print(charliehebdo_st_rumors.info())

First folder name: 553487971497041920
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          458 non-null    object
 1   tweet_id         458 non-null    object
 2   tweet_text       458 non-null    object
 3   retweet_count    458 non-null    int64 
 4   favorite_count   458 non-null    int64 
 5   followers_count  458 non-null    int64 
 6   friends_count    458 non-null    int64 
 7   verified         458 non-null    int64 
 8   created_at       458 non-null    object
 9   label            458 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 35.9+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [301]:
ferguson_st_rumors = fetch_source_df('ferguson', 'rumours')
print(ferguson_st_rumors.info())

First folder name: 500291013521334272
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          284 non-null    object
 1   tweet_id         284 non-null    object
 2   tweet_text       284 non-null    object
 3   retweet_count    284 non-null    int64 
 4   favorite_count   284 non-null    int64 
 5   followers_count  284 non-null    int64 
 6   friends_count    284 non-null    int64 
 7   verified         284 non-null    int64 
 8   created_at       284 non-null    object
 9   label            284 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 22.3+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [302]:
germanwings_crash_st_rumors = fetch_source_df('germanwings-crash', 'rumours')
print(germanwings_crash_st_rumors.info())

First folder name: 580387098039046145
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          238 non-null    object
 1   tweet_id         238 non-null    object
 2   tweet_text       238 non-null    object
 3   retweet_count    238 non-null    int64 
 4   favorite_count   238 non-null    int64 
 5   followers_count  238 non-null    int64 
 6   friends_count    238 non-null    int64 
 7   verified         238 non-null    int64 
 8   created_at       238 non-null    object
 9   label            238 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 18.7+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [303]:
sydneysiege_st_rumors = fetch_source_df('sydneysiege', 'rumours')
print(sydneysiege_st_rumors.info())

First folder name: 544284823041212416
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          522 non-null    object
 1   tweet_id         522 non-null    object
 2   tweet_text       522 non-null    object
 3   retweet_count    522 non-null    int64 
 4   favorite_count   522 non-null    int64 
 5   followers_count  522 non-null    int64 
 6   friends_count    522 non-null    int64 
 7   verified         522 non-null    int64 
 8   created_at       522 non-null    object
 9   label            522 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 40.9+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


### Rumor Responses

#### Ottawa

In [9]:
ottawa_r_res = fetch_response_tweets('ottawashooting', 'rumours','rumour')
print(ottawa_r_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5966 non-null   object
 1   response_tweet_id      5966 non-null   object
 2   user_id                5966 non-null   object
 3   in_reply_to_status_id  5966 non-null   object
 4   in_reply_to_user_id    5966 non-null   object
 5   response_tweet_text    5966 non-null   object
 6   retweet_count          5966 non-null   int64 
 7   favorite_count         5966 non-null   int64 
 8   response_created_at    5966 non-null   object
 9   label                  5966 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 466.2+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [148]:
# check null rows
display(check_nulls(ottawa_r_res))

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,label
407,524969144832491520,,,1574237179,524972939813605376,@Nduna_lux @SkyNews Religion. People who can't...,Wed Oct 22 17:17:57 +0000 2014,rumour
1870,524941504796962816,,,284006171,524955504758779905,@TheGreatKnuckle @catekustanczy frightening sc...,Wed Oct 22 16:08:40 +0000 2014,rumour
2561,524931913426157568,,,181033896,524944289043066880,@manof1000 @CP24 2 soldiers got run over yeste...,Wed Oct 22 15:24:06 +0000 2014,rumour
3317,525023025792835585,,,2675225370,525040225441951744,@butterfly_LV @Cameron_Gray @globeandmail 😢,Wed Oct 22 21:45:19 +0000 2014,rumour
3323,524949339131904000,,,168697045,524952230324371456,@Nduna_lux @BBCBreaking @BBCWorld It's not Mus...,Wed Oct 22 15:55:39 +0000 2014,rumour
3327,524949339131904000,,,21213776,524954176619831297,@Nduna_lux @BBCBreaking @BBCWorld learn how to...,Wed Oct 22 16:03:23 +0000 2014,rumour
3332,524949339131904000,,,2547509651,524956770268372992,@Nduna_lux @BBCBreaking @BBCWorld @Dowlerjnr b...,Wed Oct 22 16:13:41 +0000 2014,rumour
3340,524949339131904000,,,21213776,524960761677103104,@Nduna_lux @BBCBreaking @BBCWorld go there or ...,Wed Oct 22 16:29:33 +0000 2014,rumour
3343,524949339131904000,,,21213776,524963463689666560,@Nduna_lux @BBCBreaking @BBCWorld look I'm not...,Wed Oct 22 16:40:17 +0000 2014,rumour
3365,524949339131904000,,,325942107,524976393499992064,"@Nduna_lux @sophie__malik It's official, you'r...",Wed Oct 22 17:31:40 +0000 2014,rumour


In [199]:
ottawa_r_res=replace_vals(ottawa_r_res)
ottawa_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5966 non-null   object
 1   in_reply_to_status_id  5966 non-null   object
 2   in_reply_to_user_id    5966 non-null   object
 3   user_id                5966 non-null   int64 
 4   response_tweet_id      5966 non-null   int64 
 5   response_tweet_text    5966 non-null   object
 6   response_created_at    5966 non-null   object
 7   label                  5966 non-null   object
dtypes: int64(2), object(6)
memory usage: 373.0+ KB


In [200]:
ottawa_r_res=to_str(ottawa_r_res)
ottawa_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5966 non-null   object
 1   in_reply_to_status_id  5966 non-null   object
 2   in_reply_to_user_id    5966 non-null   object
 3   user_id                5966 non-null   object
 4   response_tweet_id      5966 non-null   object
 5   response_tweet_text    5966 non-null   object
 6   response_created_at    5966 non-null   object
 7   label                  5966 non-null   object
dtypes: object(8)
memory usage: 373.0+ KB


#### Charlie

In [10]:
charliehebdo_r_res = fetch_response_tweets('charliehebdo', 'rumours','rumour')
print(charliehebdo_r_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6887 entries, 0 to 6886
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6887 non-null   object
 1   response_tweet_id      6887 non-null   object
 2   user_id                6887 non-null   object
 3   in_reply_to_status_id  6887 non-null   object
 4   in_reply_to_user_id    6887 non-null   object
 5   response_tweet_text    6887 non-null   object
 6   retweet_count          6887 non-null   int64 
 7   favorite_count         6887 non-null   int64 
 8   response_created_at    6887 non-null   object
 9   label                  6887 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 538.2+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [272]:
charliehebdo_r_res.isna().sum()

main_tweet_id            0
response_tweet_id        0
user_id                  0
in_reply_to_status_id    0
in_reply_to_user_id      0
response_tweet_text      0
retweet_count            0
favorite_count           0
response_created_at      0
label                    0
dtype: int64

In [273]:
# check null rows
null_mask = charliehebdo_r_res.isnull().any(axis=1)
null_rows = charliehebdo_r_res[null_mask]
display(null_rows)

Unnamed: 0,main_tweet_id,response_tweet_id,user_id,in_reply_to_status_id,in_reply_to_user_id,response_tweet_text,retweet_count,favorite_count,response_created_at,label


In [271]:
columns = ['in_reply_to_status_id', 'in_reply_to_user_id']
charliehebdo_r_res[columns] = charliehebdo_r_res[columns].fillna('No ID') 
charliehebdo_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6887 entries, 0 to 6886
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6887 non-null   object
 1   response_tweet_id      6887 non-null   object
 2   user_id                6887 non-null   object
 3   in_reply_to_status_id  6887 non-null   object
 4   in_reply_to_user_id    6887 non-null   object
 5   response_tweet_text    6887 non-null   object
 6   retweet_count          6887 non-null   int64 
 7   favorite_count         6887 non-null   int64 
 8   response_created_at    6887 non-null   object
 9   label                  6887 non-null   object
dtypes: int64(2), object(8)
memory usage: 538.2+ KB


#### Ferguson

In [11]:
ferguson_r_res = fetch_response_tweets('ferguson', 'rumours','rumour')
print(ferguson_r_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6195 non-null   object
 1   response_tweet_id      6195 non-null   object
 2   user_id                6195 non-null   object
 3   in_reply_to_status_id  6195 non-null   object
 4   in_reply_to_user_id    6195 non-null   object
 5   response_tweet_text    6195 non-null   object
 6   retweet_count          6195 non-null   int64 
 7   favorite_count         6195 non-null   int64 
 8   response_created_at    6195 non-null   object
 9   label                  6195 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 484.1+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [275]:
ferguson_r_res.isna().sum()

main_tweet_id              0
response_tweet_id          0
user_id                    0
in_reply_to_status_id    222
in_reply_to_user_id      222
response_tweet_text        0
retweet_count              0
favorite_count             0
response_created_at        0
label                      0
dtype: int64

In [276]:
# check null rows
null_mask = ferguson_r_res.isnull().any(axis=1)
null_rows = ferguson_r_res[null_mask]
display(null_rows)

Unnamed: 0,main_tweet_id,response_tweet_id,user_id,in_reply_to_status_id,in_reply_to_user_id,response_tweet_text,retweet_count,favorite_count,response_created_at,label
0,500374427448516609,500374427448516609,1291770157,,,.@CNN KEEPS playing the video while saying it ...,227,63,Fri Aug 15 20:12:14 +0000 2014,rumour
4,500277808883830784,500277808883830784,225235528,,,BREAKING: #Ferguson police chief just announce...,122,47,Fri Aug 15 13:48:19 +0000 2014,rumour
18,500280477920796672,500280477920796672,21728303,,,"Because, of course, when someone commits a rob...",539,213,Fri Aug 15 13:58:55 +0000 2014,rumour
57,500326267640905728,500326267640905728,14717197,,,Report: #Ferguson police beat up wrong suspect...,165,74,Fri Aug 15 17:00:52 +0000 2014,rumour
61,500292917957648384,500292917957648384,360129829,,,So in 5mins mike brown shaved his head and cha...,132,60,Fri Aug 15 14:48:21 +0000 2014,rumour
...,...,...,...,...,...,...,...,...,...,...
6068,500300813835988992,500300813835988992,36493412,,,They wanted u 2 believe that #TrayvonMartin wa...,309,86,Fri Aug 15 15:19:43 +0000 2014,rumour
6083,500375545918803970,500375545918803970,23831448,,,This is what #Ferguson PD did to #HenryDavis i...,280,58,Fri Aug 15 20:16:41 +0000 2014,rumour
6105,500375190535421953,500375190535421953,370310957,,,If anyone's looking for a reason to murder me ...,150,205,Fri Aug 15 20:15:16 +0000 2014,rumour
6112,500260514807947264,500260514807947264,972651,,,Developing: Police in #Ferguson expected to an...,100,28,Fri Aug 15 12:39:35 +0000 2014,rumour


In [277]:
columns = ['in_reply_to_status_id', 'in_reply_to_user_id']
ferguson_r_res[columns] = ferguson_r_res[columns].fillna('No ID') 
ferguson_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6195 non-null   object
 1   response_tweet_id      6195 non-null   object
 2   user_id                6195 non-null   object
 3   in_reply_to_status_id  6195 non-null   object
 4   in_reply_to_user_id    6195 non-null   object
 5   response_tweet_text    6195 non-null   object
 6   retweet_count          6195 non-null   int64 
 7   favorite_count         6195 non-null   int64 
 8   response_created_at    6195 non-null   object
 9   label                  6195 non-null   object
dtypes: int64(2), object(8)
memory usage: 484.1+ KB


In [123]:
rows = ['main_tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id','user_id','response_tweet_id']
ferguson_r_res[rows] = ferguson_r_res[rows].astype(str)
ferguson_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6195 non-null   object
 1   in_reply_to_status_id  6195 non-null   object
 2   in_reply_to_user_id    6195 non-null   object
 3   user_id                6195 non-null   object
 4   response_tweet_id      6195 non-null   object
 5   response_tweet_text    6195 non-null   object
 6   response_created_at    6195 non-null   object
 7   lable                  6195 non-null   object
dtypes: object(8)
memory usage: 387.3+ KB


#### German wings crash

In [12]:
germanwings_crash_r_res = fetch_response_tweets('germanwings-crash', 'rumours','rumour')
print(germanwings_crash_r_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          2256 non-null   object
 1   response_tweet_id      2256 non-null   object
 2   user_id                2256 non-null   object
 3   in_reply_to_status_id  2256 non-null   object
 4   in_reply_to_user_id    2256 non-null   object
 5   response_tweet_text    2256 non-null   object
 6   retweet_count          2256 non-null   int64 
 7   favorite_count         2256 non-null   int64 
 8   response_created_at    2256 non-null   object
 9   label                  2256 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 176.4+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [195]:
germanwings_crash_r_res.isna().sum()

main_tweet_id            0
in_reply_to_status_id    0
in_reply_to_user_id      0
user_id                  0
response_tweet_id        0
response_tweet_text      0
response_created_at      0
label                    0
dtype: int64

In [196]:
rows = ['in_reply_to_status_id', 'in_reply_to_user_id','user_id','response_tweet_id']
sydneysiege_r_res[rows] = sydneysiege_r_res[rows].astype(str)
sydneysiege_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          8154 non-null   object
 1   in_reply_to_status_id  8154 non-null   object
 2   in_reply_to_user_id    8154 non-null   object
 3   user_id                8154 non-null   object
 4   response_tweet_id      8154 non-null   object
 5   response_tweet_text    8154 non-null   object
 6   response_created_at    8154 non-null   object
 7   label                  8154 non-null   object
dtypes: object(8)
memory usage: 509.8+ KB


#### Sydney

In [13]:
sydneysiege_r_res = fetch_response_tweets('sydneysiege', 'rumours','rumour')
print(sydneysiege_r_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          8154 non-null   object
 1   response_tweet_id      8154 non-null   object
 2   user_id                8154 non-null   object
 3   in_reply_to_status_id  8154 non-null   object
 4   in_reply_to_user_id    8154 non-null   object
 5   response_tweet_text    8154 non-null   object
 6   retweet_count          8154 non-null   int64 
 7   favorite_count         8154 non-null   int64 
 8   response_created_at    8154 non-null   object
 9   label                  8154 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 637.2+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [189]:
sydneysiege_r_res.isna().sum()

main_tweet_id            0
in_reply_to_status_id    9
in_reply_to_user_id      9
user_id                  0
response_tweet_id        0
response_tweet_text      0
response_created_at      0
label                    0
dtype: int64

In [116]:
# check null rows
null_mask = sydneysiege_r_res.isnull().any(axis=1)
null_rows = sydneysiege_r_res[null_mask]
display(null_rows)

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,lable
1172,544350712365207552,,,611097005,544354905335820288,@Joseph_M20 @dailytelegraph Christians: Timoth...,Mon Dec 15 04:54:57 +0000 2014,rumour
1175,544350712365207552,,,611097005,544356405759995904,@Joseph_M20 @dailytelegraph and yet he claimed...,Mon Dec 15 05:00:55 +0000 2014,rumour
1179,544350712365207552,,,611097005,544362235414720512,@Joseph_M20 @dailytelegraph that's a dime a do...,Mon Dec 15 05:24:05 +0000 2014,rumour
1180,544350712365207552,,,611097005,544363228885966848,@Joseph_M20 @dailytelegraph clearly not the ne...,Mon Dec 15 05:28:02 +0000 2014,rumour
1182,544350712365207552,,,611097005,544365021011390464,@Joseph_M20 @dailytelegraph You do realize Mus...,Mon Dec 15 05:35:09 +0000 2014,rumour
1183,544350712365207552,,,611097005,544366589546864640,@Joseph_M20 @dailytelegraph and by that you me...,Mon Dec 15 05:41:23 +0000 2014,rumour
1184,544350712365207552,,,611097005,544367122202505216,@Joseph_M20 @dailytelegraph In Iran women can ...,Mon Dec 15 05:43:30 +0000 2014,rumour
4242,544492053900836864,,,2848043788,544496313539522561,@Joseph_M20 @SkyNews So now you're saying Iran...,Mon Dec 15 14:16:52 +0000 2014,rumour
7149,544289448150986752,,,20639175,544292218992082944,"@jay_psych Australian radio/TV. I disagree, th...",Mon Dec 15 00:45:52 +0000 2014,rumour


In [190]:
columns = ['in_reply_to_status_id', 'in_reply_to_user_id']
sydneysiege_r_res[columns] = sydneysiege_r_res[columns].fillna('null') 
sydneysiege_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          8154 non-null   object
 1   in_reply_to_status_id  8154 non-null   object
 2   in_reply_to_user_id    8154 non-null   object
 3   user_id                8154 non-null   int64 
 4   response_tweet_id      8154 non-null   int64 
 5   response_tweet_text    8154 non-null   object
 6   response_created_at    8154 non-null   object
 7   label                  8154 non-null   object
dtypes: int64(2), object(6)
memory usage: 509.8+ KB


In [191]:
rows = ['main_tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id','user_id','response_tweet_id']
sydneysiege_r_res[rows] = sydneysiege_r_res[rows].astype(str)
sydneysiege_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          8154 non-null   object
 1   in_reply_to_status_id  8154 non-null   object
 2   in_reply_to_user_id    8154 non-null   object
 3   user_id                8154 non-null   object
 4   response_tweet_id      8154 non-null   object
 5   response_tweet_text    8154 non-null   object
 6   response_created_at    8154 non-null   object
 7   label                  8154 non-null   object
dtypes: object(8)
memory usage: 509.8+ KB


## Read Non-rumors

### Source Tweets

In [311]:
ottawa_st_nr = fetch_source_df('ottawashooting', 'non-rumours')
print(ottawa_st_nr.info())

First folder name: 524941318389501953
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          420 non-null    object
 1   tweet_id         420 non-null    object
 2   tweet_text       420 non-null    object
 3   retweet_count    420 non-null    int64 
 4   favorite_count   420 non-null    int64 
 5   followers_count  420 non-null    int64 
 6   friends_count    420 non-null    int64 
 7   verified         420 non-null    int64 
 8   created_at       420 non-null    object
 9   label            420 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 32.9+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [312]:
charliehebdo_st_nr = fetch_source_df('charliehebdo', 'non-rumours')
print(charliehebdo_st_nr.info())

First folder name: 552829395690201088
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          1621 non-null   object
 1   tweet_id         1621 non-null   object
 2   tweet_text       1621 non-null   object
 3   retweet_count    1621 non-null   int64 
 4   favorite_count   1621 non-null   int64 
 5   followers_count  1621 non-null   int64 
 6   friends_count    1621 non-null   int64 
 7   verified         1621 non-null   int64 
 8   created_at       1621 non-null   object
 9   label            1621 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 126.8+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [313]:
ferguson_st_nr = fetch_source_df('ferguson', 'non-rumours')
print(ferguson_st_nr.info())

First folder name: 499409455814287360
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          859 non-null    object
 1   tweet_id         859 non-null    object
 2   tweet_text       859 non-null    object
 3   retweet_count    859 non-null    int64 
 4   favorite_count   859 non-null    int64 
 5   followers_count  859 non-null    int64 
 6   friends_count    859 non-null    int64 
 7   verified         859 non-null    int64 
 8   created_at       859 non-null    object
 9   label            859 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 67.2+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [314]:
germanwingscrash_st_nr = fetch_source_df('germanwings-crash', 'non-rumours')
print(germanwingscrash_st_nr.info())

First folder name: 580330986220687360
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          231 non-null    object
 1   tweet_id         231 non-null    object
 2   tweet_text       231 non-null    object
 3   retweet_count    231 non-null    int64 
 4   favorite_count   231 non-null    int64 
 5   followers_count  231 non-null    int64 
 6   friends_count    231 non-null    int64 
 7   verified         231 non-null    int64 
 8   created_at       231 non-null    object
 9   label            231 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 18.2+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [315]:
sydneysiege_st_nr = fetch_source_df('sydneysiege', 'non-rumours')
print(sydneysiege_st_nr.info())

First folder name: 544506678922977280
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          699 non-null    object
 1   tweet_id         699 non-null    object
 2   tweet_text       699 non-null    object
 3   retweet_count    699 non-null    int64 
 4   favorite_count   699 non-null    int64 
 5   followers_count  699 non-null    int64 
 6   friends_count    699 non-null    int64 
 7   verified         699 non-null    int64 
 8   created_at       699 non-null    object
 9   label            699 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 54.7+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


### Response Tweets

#### Ottawa

In [14]:
ottawa_res_nr = fetch_response_tweets('ottawashooting', 'non-rumours','non-rumour')
print(ottawa_res_nr.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5428 non-null   object
 1   response_tweet_id      5428 non-null   object
 2   user_id                5428 non-null   object
 3   in_reply_to_status_id  5428 non-null   object
 4   in_reply_to_user_id    5428 non-null   object
 5   response_tweet_text    5428 non-null   object
 6   retweet_count          5428 non-null   int64 
 7   favorite_count         5428 non-null   int64 
 8   response_created_at    5428 non-null   object
 9   label                  5428 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 424.2+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [179]:
ottawa_res_nr.isna().sum()

main_tweet_id            0
in_reply_to_status_id    4
in_reply_to_user_id      4
user_id                  0
response_tweet_id        0
response_tweet_text      0
response_created_at      0
label                    0
dtype: int64

In [178]:
display(check_nulls(ottawa_res_nr))

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,label
1095,524995776578330624,,,569655756,524999366877384706,@ISIS3HunnaSquad @pmharper @nypost garbage Sir...,Wed Oct 22 19:02:57 +0000 2014,non-rumour
1097,524995776578330624,,,569655756,524999824543072256,@ISIS3HunnaSquad @pmharper @nypost What nonsen...,Wed Oct 22 19:04:46 +0000 2014,non-rumour
1099,524995776578330624,,,1382947069,525005026851573761,@ISIS3HunnaSquad @pmharper @nypost @peterskuce...,Wed Oct 22 19:25:27 +0000 2014,non-rumour
3805,524934507892580352,,,35338880,524941095944601600,@HersiHujaleh @dgardner thanks Dan. I didn't r...,Wed Oct 22 15:11:24 +0000 2014,non-rumour


In [154]:
ottawa_res_nr=replace_vals(ottawa_res_nr)
ottawa_res_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5428 non-null   object
 1   in_reply_to_status_id  5428 non-null   object
 2   in_reply_to_user_id    5428 non-null   object
 3   user_id                5428 non-null   int64 
 4   response_tweet_id      5428 non-null   int64 
 5   response_tweet_text    5428 non-null   object
 6   response_created_at    5428 non-null   object
 7   label                  5428 non-null   object
dtypes: int64(2), object(6)
memory usage: 339.4+ KB


In [155]:
ottawa_res_nr=to_str(ottawa_res_nr)
ottawa_res_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5428 non-null   object
 1   in_reply_to_status_id  5428 non-null   object
 2   in_reply_to_user_id    5428 non-null   object
 3   user_id                5428 non-null   object
 4   response_tweet_id      5428 non-null   object
 5   response_tweet_text    5428 non-null   object
 6   response_created_at    5428 non-null   object
 7   label                  5428 non-null   object
dtypes: object(8)
memory usage: 339.4+ KB


#### Charlie

In [15]:
charliehebdo_nr_res = fetch_response_tweets('charliehebdo', 'non-rumours','non-rumour')
print(charliehebdo_nr_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          29302 non-null  object
 1   response_tweet_id      29302 non-null  object
 2   user_id                29302 non-null  object
 3   in_reply_to_status_id  29302 non-null  object
 4   in_reply_to_user_id    29302 non-null  object
 5   response_tweet_text    29302 non-null  object
 6   retweet_count          29302 non-null  int64 
 7   favorite_count         29302 non-null  int64 
 8   response_created_at    29302 non-null  object
 9   label                  29302 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 2.2+ MB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [292]:
charliehebdo_nr_res.isna().sum()

main_tweet_id            0
response_tweet_id        0
user_id                  0
in_reply_to_status_id    0
in_reply_to_user_id      0
response_tweet_text      0
retweet_count            0
favorite_count           0
response_created_at      0
label                    0
dtype: int64

In [157]:
display(check_nulls(charliehebdo_nr_res))

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,label
542,552815443959091200,,,2319710954,552863202238013440,@Slaphappy70 @MaryDram @NewStatesman One thing...,Wed Jan 07 16:23:53 +0000 2015,non-rumour
546,552815443959091200,,,47491845,552867309455695872,@Slaphappy70 @WayneParcheman @NewStatesman nut...,Wed Jan 07 16:40:13 +0000 2015,non-rumour
547,552815443959091200,,,2319710954,552923345042542593,@Slaphappy70 @MaryDram @NewStatesman it's abou...,Wed Jan 07 20:22:52 +0000 2015,non-rumour
548,552815443959091200,,,47491845,553202872830402561,@Slaphappy70 @WayneParcheman @NewStatesman as ...,Thu Jan 08 14:53:37 +0000 2015,non-rumour
735,553590072373968897,,,16285681,553652765508255744,@The_Meme_Artist @ageless_vintage @CBSNews @cr...,Fri Jan 09 20:41:20 +0000 2015,non-rumour
...,...,...,...,...,...,...,...,...
29051,553514329166061568,,,2425223344,553567897076129792,@candymondaytoo @Naila_TRQ @DAVIDMDRAIMAN @Zak...,Fri Jan 09 15:04:06 +0000 2015,non-rumour
29103,553514329166061568,,,2425223344,556208204099878913,@candymondaytoo @Naila_TRQ @DAVIDMDRAIMAN @Zak...,Fri Jan 16 21:55:44 +0000 2015,non-rumour
29104,553514329166061568,,,538818845,556213859917512704,@candymondaytoo @i_luv_ISRAEL @Naila_TRQ @DAVI...,Fri Jan 16 22:18:12 +0000 2015,non-rumour
29106,553514329166061568,,,427655976,556218413752664064,@candymondaytoo @Callitspades @Naila_TRQ @i_lu...,Fri Jan 16 22:36:18 +0000 2015,non-rumour


In [159]:
charliehebdo_nr_res=replace_vals(charliehebdo_nr_res)
charliehebdo_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          29302 non-null  object
 1   in_reply_to_status_id  29302 non-null  object
 2   in_reply_to_user_id    29302 non-null  object
 3   user_id                29302 non-null  int64 
 4   response_tweet_id      29302 non-null  int64 
 5   response_tweet_text    29302 non-null  object
 6   response_created_at    29302 non-null  object
 7   label                  29302 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.8+ MB


In [161]:
charliehebdo_nr_res=to_str(charliehebdo_nr_res)
charliehebdo_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          29302 non-null  object
 1   in_reply_to_status_id  29302 non-null  object
 2   in_reply_to_user_id    29302 non-null  object
 3   user_id                29302 non-null  object
 4   response_tweet_id      29302 non-null  object
 5   response_tweet_text    29302 non-null  object
 6   response_created_at    29302 non-null  object
 7   label                  29302 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB


#### Ferguson

In [16]:
ferguson_nr_res = fetch_response_tweets('ferguson', 'non-rumours','non-rumour')
print(ferguson_nr_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16837 entries, 0 to 16836
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          16837 non-null  object
 1   response_tweet_id      16837 non-null  object
 2   user_id                16837 non-null  object
 3   in_reply_to_status_id  16837 non-null  object
 4   in_reply_to_user_id    16837 non-null  object
 5   response_tweet_text    16837 non-null  object
 6   retweet_count          16837 non-null  int64 
 7   favorite_count         16837 non-null  int64 
 8   response_created_at    16837 non-null  object
 9   label                  16837 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 1.3+ MB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [294]:
ferguson_nr_res.isna().sum()

main_tweet_id            0
response_tweet_id        0
user_id                  0
in_reply_to_status_id    0
in_reply_to_user_id      0
response_tweet_text      0
retweet_count            0
favorite_count           0
response_created_at      0
label                    0
dtype: int64

In [280]:
display(check_nulls(ferguson_nr_res))

Unnamed: 0,main_tweet_id,response_tweet_id,user_id,in_reply_to_status_id,in_reply_to_user_id,response_tweet_text,retweet_count,favorite_count,response_created_at,label
0,499409455814287360,499409455814287360,14090948,,,Crowd peacefully dispersing at the QT tonight....,106,54,Wed Aug 13 04:17:47 +0000 2014,non-rumour
7,499695028407922689,499695028407922689,14782581,,,what the FUCK are you looking at through a MOU...,128,47,Wed Aug 13 23:12:33 +0000 2014,non-rumour
130,500071770192216065,500071770192216065,2097571,,,Are these cops or soldiers? Experts say #Fergu...,845,454,Fri Aug 15 00:09:35 +0000 2014,non-rumour
151,500352655827279874,500352655827279874,195271137,,,Muslims butchering Muslims=no biggie 'til an I...,149,96,Fri Aug 15 18:45:43 +0000 2014,non-rumour
170,499701669437784064,499701669437784064,26559241,,,Just as the Founding Fathers intended. RT @Bmo...,146,57,Wed Aug 13 23:38:56 +0000 2014,non-rumour
...,...,...,...,...,...,...,...,...,...,...
16767,500212911856295940,500212911856295940,2725579568,,,"No matter what we do, it won't be as bad as ki...",123,139,Fri Aug 15 09:30:26 +0000 2014,non-rumour
16770,500385585807499266,500385585807499266,26792275,,,Iran’s Supreme Leader has taken to Twitter to ...,139,51,Fri Aug 15 20:56:35 +0000 2014,non-rumour
16789,498530904139378688,498530904139378688,142034504,,,"""Don't shoot! Don't shoot!"" #ferguson https:/...",130,39,Sun Aug 10 18:06:44 +0000 2014,non-rumour
16795,499705810076053504,499705810076053504,279390084,,,SWAT police just entered a McDonald's in #Ferg...,961,223,Wed Aug 13 23:55:23 +0000 2014,non-rumour


In [282]:
ferguson_nr_res=replace_vals(ferguson_nr_res)
ferguson_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16837 entries, 0 to 16836
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          16837 non-null  object
 1   response_tweet_id      16837 non-null  object
 2   user_id                16837 non-null  object
 3   in_reply_to_status_id  16837 non-null  object
 4   in_reply_to_user_id    16837 non-null  object
 5   response_tweet_text    16837 non-null  object
 6   retweet_count          16837 non-null  int64 
 7   favorite_count         16837 non-null  int64 
 8   response_created_at    16837 non-null  object
 9   label                  16837 non-null  object
dtypes: int64(2), object(8)
memory usage: 1.3+ MB


In [166]:
ferguson_nr_res=to_str(ferguson_nr_res)
ferguson_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16837 entries, 0 to 16836
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          16837 non-null  object
 1   in_reply_to_status_id  16837 non-null  object
 2   in_reply_to_user_id    16837 non-null  object
 3   user_id                16837 non-null  object
 4   response_tweet_id      16837 non-null  object
 5   response_tweet_text    16837 non-null  object
 6   response_created_at    16837 non-null  object
 7   label                  16837 non-null  object
dtypes: object(8)
memory usage: 1.0+ MB


#### Germanwings

In [17]:
germanwings_nr_res = fetch_response_tweets('germanwings-crash', 'non-rumours','non-rumour')
print(germanwings_nr_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          1764 non-null   object
 1   response_tweet_id      1764 non-null   object
 2   user_id                1764 non-null   object
 3   in_reply_to_status_id  1764 non-null   object
 4   in_reply_to_user_id    1764 non-null   object
 5   response_tweet_text    1764 non-null   object
 6   retweet_count          1764 non-null   int64 
 7   favorite_count         1764 non-null   int64 
 8   response_created_at    1764 non-null   object
 9   label                  1764 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 137.9+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [250]:
germanwings_nr_res.tail(20)

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,label
1744,580328029852856320,580328029852856320,15513604,2571860615,580978964832743425,@foxandfriends Inside job.,Thu Mar 26 06:25:53 +0000 2015,non-rumour
1745,580341214098137089,580341214098137089,1453757544,2933587210,580341517438590976,@TZON57 RIP with love to their families and fr...,Tue Mar 24 12:12:54 +0000 2015,non-rumour
1746,580341214098137089,580341214098137089,1453757544,2897867019,580341566340001792,@TZON57 ❤😘💕😍,Tue Mar 24 12:13:06 +0000 2015,non-rumour
1747,580341214098137089,580341214098137089,1453757544,2865632684,580341591497424896,@TZON57 Mein Beileid an alle Angehörigen :( #...,Tue Mar 24 12:13:12 +0000 2015,non-rumour
1748,580341214098137089,580341214098137089,1453757544,2797667877,580341839850524672,@TZON57 Finde es gut das du etwas dazu schreib...,Tue Mar 24 12:14:11 +0000 2015,non-rumour
1749,580341214098137089,580341214098137089,1453757544,2797667877,580341903910146048,@TZON57 wie schlimm muss das für die angehörig...,Tue Mar 24 12:14:26 +0000 2015,non-rumour
1750,580341214098137089,580341214098137089,1453757544,2493061162,580342522028920833,@TZON57 😔,Tue Mar 24 12:16:54 +0000 2015,non-rumour
1751,580341214098137089,580341214098137089,1453757544,3079006463,580342586931593216,@TZON57 r.i.p🙇 mein beileid😩,Tue Mar 24 12:17:09 +0000 2015,non-rumour
1752,580341214098137089,580341214098137089,1453757544,2869054865,580342607110365184,@TZON57 💕,Tue Mar 24 12:17:14 +0000 2015,non-rumour
1753,580341214098137089,580341214098137089,1453757544,2778332944,580343623822245888,@TZON57 #Pray #Germanwings das leben kann so s...,Tue Mar 24 12:21:16 +0000 2015,non-rumour


In [213]:
germanwings_nr_res.isna().sum()

main_tweet_id            0
in_reply_to_status_id    1
in_reply_to_user_id      1
user_id                  0
response_tweet_id        0
response_tweet_text      0
response_created_at      0
label                    0
dtype: int64

In [214]:
display(check_nulls(germanwings_nr_res))

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,label
714,580330078682353665,,,581831550,580331920665526272,@KimJongFunk @ChristianJansse @planefinder Chr...,Tue Mar 24 11:34:46 +0000 2015,non-rumour


In [221]:
germanwings_nr_res=replace_vals(germanwings_nr_res)
germanwings_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          1764 non-null   object
 1   in_reply_to_status_id  1764 non-null   object
 2   in_reply_to_user_id    1764 non-null   object
 3   user_id                1764 non-null   object
 4   response_tweet_id      1764 non-null   object
 5   response_tweet_text    1764 non-null   object
 6   response_created_at    1764 non-null   object
 7   label                  1764 non-null   object
dtypes: object(8)
memory usage: 110.4+ KB


In [228]:
display(germanwings_nr_res[germanwings_nr_res['main_tweet_id']=='580330078682353665'])

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,label
713,580330078682353665,5.803300786823537e+17,100936791.0,19067011,580330648839278592,@planefinder isn’t -5312 ft/min a steep decent...,Tue Mar 24 11:29:43 +0000 2015,non-rumour
714,580330078682353665,,,581831550,580331920665526272,@KimJongFunk @ChristianJansse @planefinder Chr...,Tue Mar 24 11:34:46 +0000 2015,non-rumour


In [227]:
germanwings_nr_res=to_str(germanwings_nr_res)
germanwings_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          1764 non-null   object
 1   in_reply_to_status_id  1764 non-null   object
 2   in_reply_to_user_id    1764 non-null   object
 3   user_id                1764 non-null   object
 4   response_tweet_id      1764 non-null   object
 5   response_tweet_text    1764 non-null   object
 6   response_created_at    1764 non-null   object
 7   label                  1764 non-null   object
dtypes: object(8)
memory usage: 110.4+ KB


#### Sydney

In [18]:
sydneysiege_nr_res = fetch_response_tweets('sydneysiege', 'non-rumours','non-rumour')
print(sydneysiege_nr_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14621 entries, 0 to 14620
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          14621 non-null  object
 1   response_tweet_id      14621 non-null  object
 2   user_id                14621 non-null  object
 3   in_reply_to_status_id  14621 non-null  object
 4   in_reply_to_user_id    14621 non-null  object
 5   response_tweet_text    14621 non-null  object
 6   retweet_count          14621 non-null  int64 
 7   favorite_count         14621 non-null  int64 
 8   response_created_at    14621 non-null  object
 9   label                  14621 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 1.1+ MB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [173]:
sydneysiege_nr_res.isna().sum()

main_tweet_id            0
in_reply_to_status_id    5
in_reply_to_user_id      5
user_id                  0
response_tweet_id        0
response_tweet_text      0
response_created_at      0
label                    0
dtype: int64

In [174]:
display(check_nulls(sydneysiege_nr_res))

Unnamed: 0,main_tweet_id,in_reply_to_status_id,in_reply_to_user_id,user_id,response_tweet_id,response_tweet_text,response_created_at,label
4423,544439380329644032,,,1299846277,544440783924105216,.@MedievalSwan @TelegraphNews Why not condemn ...,Mon Dec 15 10:36:12 +0000 2014,non-rumour
4426,544439380329644032,,,90367713,544441948476825600,@MedievalSwan @TelegraphNews @AHart1974 Ever r...,Mon Dec 15 10:40:50 +0000 2014,non-rumour
4433,544439380329644032,,,2780946635,544466702223044608,"@MedievalSwan haha, what an outrageous comment...",Mon Dec 15 12:19:12 +0000 2014,non-rumour
12898,544424261163569152,,,24140490,544435660946284544,@emotivezarry Fuck you I know more than you an...,Mon Dec 15 10:15:51 +0000 2014,non-rumour
12920,544424261163569152,,,116317899,544628048398925824,@emotivezarry @bigbosco John is a shitstain on...,Mon Dec 15 23:00:20 +0000 2014,non-rumour


In [175]:
sydneysiege_nr_res=replace_vals(sydneysiege_nr_res)
sydneysiege_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14621 entries, 0 to 14620
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          14621 non-null  object
 1   in_reply_to_status_id  14621 non-null  object
 2   in_reply_to_user_id    14621 non-null  object
 3   user_id                14621 non-null  int64 
 4   response_tweet_id      14621 non-null  int64 
 5   response_tweet_text    14621 non-null  object
 6   response_created_at    14621 non-null  object
 7   label                  14621 non-null  object
dtypes: int64(2), object(6)
memory usage: 913.9+ KB


In [176]:
sydneysiege_nr_res=to_str(sydneysiege_nr_res)
sydneysiege_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14621 entries, 0 to 14620
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          14621 non-null  object
 1   in_reply_to_status_id  14621 non-null  object
 2   in_reply_to_user_id    14621 non-null  object
 3   user_id                14621 non-null  object
 4   response_tweet_id      14621 non-null  object
 5   response_tweet_text    14621 non-null  object
 6   response_created_at    14621 non-null  object
 7   label                  14621 non-null  object
dtypes: object(8)
memory usage: 913.9+ KB


## Miscellenious

In [2]:
import os, json
import pandas as pd

# Define the directory path
directory_path = "phemernrdataset/pheme-rnr-dataset/ottawashooting/rumours/"

#create an empty DataFrame
source_Tweet_data = pd.DataFrame()
user_id = []
tweet_id = []
tweet_text = []
favorite_count = []
retweet_count = []
created_at = []
# Get the list of folders in the directory
folders = [folder for folder in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, folder))]

# Get the first folder name
first_folder_name = folders[0] if folders else None
#print(len(folders))
print("First folder name:", first_folder_name)
index = 0
for file_name in folders:
    new_path = directory_path + file_name+'/' + 'source-tweet/' + file_name + '.json'
    f = open(new_path)
    data = json.load(f)
    #print(data)
    user_id.append(data['user']['id'])
    tweet_id.append(data['id'])
    tweet_text.append(data['text'])
    favorite_count.append(data['favorite_count'])
    retweet_count.append(data['retweet_count'])
    created_at.append(data['created_at'])

#create a df
data = {'user_id': user_id,'tweet_id':tweet_id, 'tweet_text':tweet_text, 'favorite_count':favorite_count, 
        'retweet_count':retweet_count, 'created_at':created_at}

source_Tweet_data = pd.DataFrame(data)
source_Tweet_data['label'] = 'rumor'
print("Process Complete")
print(source_Tweet_data.info())

First folder name: 524944399890124801
Process Complete
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         470 non-null    int64 
 1   tweet_id        470 non-null    int64 
 2   tweet_text      470 non-null    object
 3   favorite_count  470 non-null    int64 
 4   retweet_count   470 non-null    int64 
 5   created_at      470 non-null    object
 6   label           470 non-null    object
dtypes: int64(4), object(3)
memory usage: 25.8+ KB
None


In [19]:
response_user_id = []
response_tweet_id = []
in_reply_to_status_id = []
in_reply_to_user_id = []
response_tweet_text = []
response_retweet_count = []
response_created_at = []
main_thread_id = []

#loop through all the folders in the directory
for folder in folders:
    path = directory_path + folder+'/' + 'reactions/'
    json_files = sorted([pos_json for pos_json in os.listdir(path)
             if pos_json.endswith('.json')])
    
    for index, js in enumerate(json_files):

        try:
            #open the file using the given directory
            with open(os.path.join(path, js)) as file:
                #load data from json file
                data = json.load(file)

                #convert to string first before storing in array
                response_user_id.append(data['user']['id'])
                main_thread_id.append(folder)
                response_tweet_id.append(data['id'])
                in_reply_to_status_id.append(data['in_reply_to_status_id'])
                in_reply_to_user_id.append(data['in_reply_to_user_id'])
                response_tweet_text.append(data['text'])
                response_retweet_count.append(data['retweet_count'])
                response_created_at.append(data['created_at'])

        except Exception as inst:
            print(type(inst))    # the exception type
            print(inst.args)     # arguments stored in .args
            print(inst) 


r_user_id = map(str, r_user_id)

#create a df
data = {'main_tweet_id': main_thread_id,
        'in_reply_to_status_id':in_reply_to_status_id, 
        'in_reply_to_user_id':in_reply_to_user_id, 
        'user_id': response_user_id,
        'response_tweet_id':response_tweet_id, 
        'response_tweet_text':response_tweet_text,'response_tweet_text':response_tweet_text,
        'response_created_at':response_created_at}

response_tweet_data = pd.DataFrame(data)
print("Process Complete")
print(response_tweet_data.info())
            

Process Complete
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   main_tweet_id          5966 non-null   object 
 1   in_reply_to_status_id  5950 non-null   float64
 2   in_reply_to_user_id    5950 non-null   float64
 3   user_id                5966 non-null   int64  
 4   response_tweet_id      5966 non-null   int64  
 5   response_tweet_text    5966 non-null   object 
 6   response_created_at    5966 non-null   object 
dtypes: float64(2), int64(2), object(3)
memory usage: 326.4+ KB
None


In [20]:
response_user_id = []
response_tweet_id = []
in_reply_to_status_id = []
in_reply_to_user_id = []
response_tweet_text = []
response_retweet_count = []
response_created_at = []
main_thread_id = []

#loop through all the folders in the directory
for folder in folders:
    path = directory_path + folder+'/' + 'reactions/'
    json_files = sorted([pos_json for pos_json in os.listdir(path)
             if pos_json.endswith('.json')])
    
    for index, js in enumerate(json_files):

        try:
            #open the file using the given directory
            with open(os.path.join(path, js)) as file:
                #load data from json file
                data = json.load(file)

                #convert to string first before storing in array
                response_user_id.append(data['user']['id'])
                main_thread_id.append(folder)
                response_tweet_id.append(data['id'])
                in_reply_to_status_id.append(data['in_reply_to_status_id'])
                in_reply_to_user_id.append(data['in_reply_to_user_id'])
                response_tweet_text.append(data['text'])
                response_retweet_count.append(data['retweet_count'])
                response_created_at.append(data['created_at'])

        except Exception as inst:
            print(type(inst))    # the exception type
            print(inst.args)     # arguments stored in .args
            print(inst) 


r_user_id = map(str, r_user_id)

#create a df
data = {'main_tweet_id': main_thread_id,
        #'in_reply_to_status_id':in_reply_to_status_id, 
        #'in_reply_to_user_id':in_reply_to_user_id, 
        'user_id': response_user_id,
        'response_tweet_id':response_tweet_id, 
        #'response_tweet_text':response_tweet_text,'response_tweet_text':response_tweet_text,
        'response_created_at':response_created_at}

response_tweet_data = pd.DataFrame(data)
print("Process Complete")
print(response_tweet_data.info())

Process Complete
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   main_tweet_id        5966 non-null   object
 1   user_id              5966 non-null   int64 
 2   response_tweet_id    5966 non-null   int64 
 3   response_created_at  5966 non-null   object
dtypes: int64(2), object(2)
memory usage: 186.6+ KB
None


In [21]:
response_tweet_data.to_csv('rumor_response_data.csv', index=False)
print('File created')

File created
