## Data Extraction and Feature Engineering

In [26]:
import pandas as pd # importing the Pandas library
import os, json # the os module and json package

In [None]:
from datetime import datetime

In [40]:
def check_nulls(df):
    # check null rows
    null_mask = df.isnull().any(axis=1)
    null_rows = df[null_mask]
    return null_rows

In [23]:
def replace_vals(df):
    columns = ['in_reply_to_status_id', 'in_reply_to_user_id']
    df[columns] = df[columns].fillna('No ID') 
    return df

In [24]:
def fetch_source_df(topic, classtype):
    # Define the directory path
    directory_path = "phemernrdataset/pheme-rnr-dataset/"+topic+"/"+classtype+"/"
    
    #create an empty DataFrame
    source_Tweet_data = pd.DataFrame()
    user_id = []
    tweet_id = []
    tweet_text = []
    retweet_count = []
    favorite_count = []
    followers_count = []
    friends_count = []
    verified = []
    created_at = []
    # Get the list of folders in the directory
    folders = [folder for folder in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, folder))]
    
    # Get the first folder name
    first_folder_name = folders[0] if folders else None
    #print(len(folders))
    print("First folder name:", first_folder_name)
    index = 0
    for file_name in folders:
        new_path = directory_path + file_name+'/' + 'source-tweet/' + file_name + '.json'
        f = open(new_path)
        data = json.load(f)
        #print(data)
        user_id.append(data['user']['id_str'])
        tweet_id.append(data['id_str'])
        tweet_text.append(data['text'])
        retweet_count.append(data['retweet_count'])
        favorite_count.append(data['favorite_count'])
        followers_count.append(data['user']['followers_count'])
        friends_count.append(data['user']['friends_count'])
        verified.append(data['user']['verified'])
        created_at.append(data['created_at'])
    
    #create a df
    data = {'user_id': user_id,
            'tweet_id':tweet_id, 
            'tweet_text':tweet_text, 
            'retweet_count':retweet_count,
            'favorite_count':favorite_count,
            'followers_count':followers_count, 
            'friends_count':friends_count, 
            'verified':verified, 
            'created_at':created_at}
    
    df = pd.DataFrame(data)

    if(classtype=='rumours'):
        df['label'] = 'rumor'

    else:
        df['label'] = 'non-rumor'
    
    df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
    df['verified'] = df['verified'].replace({True: 1, False: 0})
    
    return df

In [19]:
def fetch_response_tweets(topic, classtype, res_type):
    response_user_id = []
    response_tweet_id = []
    in_reply_to_status_id = []
    in_reply_to_user_id = []
    response_tweet_text = []
    response_retweet_count = []
    response_favorite_count = []
    response_created_at = []
    main_thread_id = []
    response_directory_path = "phemernrdataset/pheme-rnr-dataset/"+topic+"/"+classtype+"/"
    # Get the list of folders in the directory
    folders = [folder for folder in os.listdir(response_directory_path) 
               if os.path.isdir(os.path.join(response_directory_path, folder))]
    
    #loop through all the folders in the directory
    for folder in folders:
        path = response_directory_path + folder+'/' + 'reactions/'
        json_files = sorted([pos_json for pos_json in os.listdir(path)
                 if pos_json.endswith('.json')])
        
        for index, js in enumerate(json_files):
    
            try:
                #open the file using the given directory
                with open(os.path.join(path, js)) as file:
                    #load data from json file
                    data = json.load(file)
    
                    #convert to string first before storing in array
                    main_thread_id.append(folder)
                    response_user_id.append(data['user']['id_str'])
                    response_tweet_id.append(data['id_str'])
                    in_reply_to_status_id.append(str(data['in_reply_to_status_id_str']))
                    in_reply_to_user_id.append(str(data['in_reply_to_user_id_str']))
                    response_tweet_text.append(data['text'])
                    response_retweet_count.append(data['retweet_count'])
                    response_favorite_count.append(data['favorite_count'])
                    response_created_at.append(data['created_at'])
    
            except Exception as inst:
                print(type(inst))    # the exception type
                print(inst.args)     # arguments stored in .args
                print(inst) 
    
    
    
    #create a df
    data = {'main_tweet_id': main_thread_id,
            'response_tweet_id':response_tweet_id, 
            'user_id': response_user_id,
            'in_reply_to_status_id':in_reply_to_status_id, 
            'in_reply_to_user_id':in_reply_to_user_id, 
            'response_tweet_text':response_tweet_text,
            'retweet_count': response_retweet_count,
            'favorite_count':response_favorite_count,
            'response_created_at':response_created_at}
    
    response_tweet_data = pd.DataFrame(data)
    response_tweet_data['label'] = res_type
    response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})
    return response_tweet_data

## Add time_delay column

In [2]:
def add_date_time(df, column_to_drop):
    date = []
    time = []
    
    for i in df.index:
        time_str = df.loc[i, column_to_drop]
        #tweet_id = df.loc[i,'tweet_id']
        try:
            # Parse the timestamp string into a datetime object
            timestamp = datetime.strptime(time_str, "%a %b %d %H:%M:%S %z %Y")
        
            # Extract date and time separately
            date.append(timestamp.date())
            time.append(timestamp.time())
        except Exception as inst:
                #print(f'User: {tweet_id}')
                print(type(inst))    # the exception type
                print(inst.args)     # arguments stored in .args
                print(inst) 

    #add new columns
    df['time_posted'] = time
    df['date_posted'] = date
    #after updating df with new column delete unnecessary column
    df = df.drop(column_to_drop, axis=1)
    
    return df

In [79]:
def set_time_delay(df_source, df_response):
    time_elapsed = []

    for i in df_source.index:
        # Get the tweet_id of the main post
        tweet_id = df_source.loc[i, 'tweet_id']
    
        # Get the index of the first response of the current source tweet
        response_indices = df_response.index[df_response['main_tweet_id'] == tweet_id].tolist()

        if response_indices:
            try:
                # Get the time of the first response
                response_index = response_indices[0]
                response_time = df_response.loc[response_index, 'time_posted']
                source_time = df_source.loc[i, 'time_posted']
                
                # Ensure the time values are in the correct format before combining with datetime.min
                datetime1 = datetime.combine(datetime.min, datetime.strptime(str(response_time), '%H:%M:%S').time())
                datetime2 = datetime.combine(datetime.min, datetime.strptime(str(source_time), '%H:%M:%S').time())
        
                # Calculate the time difference and return in minutes
                time_difference = (datetime1 - datetime2).seconds / 60
                time_difference = "{:.4f}".format(time_difference)
                time_elapsed.append(time_difference)
        
            except Exception as inst:
                print(f'User: {tweet_id}')
                print(type(inst))    # the exception type
                print(inst.args)     # arguments stored in .args
                print(inst) 

        else:
            # If no response is found, set the time delay to 'NaT' (Not a Time)
            time_elapsed.append('NaT')
            #print(f'User {tweet_id} not found')
    
    # Assign the calculated time delays to a new column in the source dataframe
    df_source['time_delay (min)'] = time_elapsed
    # Optionally drop columns you no longer need
    df_source = df_source.drop(['time_posted', 'date_posted'], axis=1)
    
    return df_source

### Rumor

#### Germanwings Crash 

In [74]:
germanwings_s_rumors = fetch_source_df('germanwings-crash', 'rumours')
print(germanwings_s_rumors.info())

First folder name: 580387098039046145
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          238 non-null    object
 1   tweet_id         238 non-null    object
 2   tweet_text       238 non-null    object
 3   retweet_count    238 non-null    int64 
 4   favorite_count   238 non-null    int64 
 5   followers_count  238 non-null    int64 
 6   friends_count    238 non-null    int64 
 7   verified         238 non-null    int64 
 8   created_at       238 non-null    object
 9   label            238 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 18.7+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [70]:
germanwings_crash_r_res = fetch_response_tweets('germanwings-crash', 'rumours','rumour')
print(germanwings_crash_r_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          2256 non-null   object
 1   response_tweet_id      2256 non-null   object
 2   user_id                2256 non-null   object
 3   in_reply_to_status_id  2256 non-null   object
 4   in_reply_to_user_id    2256 non-null   object
 5   response_tweet_text    2256 non-null   object
 6   retweet_count          2256 non-null   int64 
 7   favorite_count         2256 non-null   int64 
 8   response_created_at    2256 non-null   object
 9   label                  2256 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 176.4+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


##### Add date time

In [75]:
germanwings_s_rumors = add_date_time(germanwings_s_rumors, 'created_at')
germanwings_s_rumors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          238 non-null    object
 1   tweet_id         238 non-null    object
 2   tweet_text       238 non-null    object
 3   retweet_count    238 non-null    int64 
 4   favorite_count   238 non-null    int64 
 5   followers_count  238 non-null    int64 
 6   friends_count    238 non-null    int64 
 7   verified         238 non-null    int64 
 8   label            238 non-null    int64 
 9   time_posted      238 non-null    object
 10  date_posted      238 non-null    object
dtypes: int64(6), object(5)
memory usage: 20.6+ KB


In [76]:
germanwings_crash_r_res = add_date_time(germanwings_crash_r_res, 'response_created_at')
germanwings_crash_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          2256 non-null   object
 1   response_tweet_id      2256 non-null   object
 2   user_id                2256 non-null   object
 3   in_reply_to_status_id  2256 non-null   object
 4   in_reply_to_user_id    2256 non-null   object
 5   response_tweet_text    2256 non-null   object
 6   retweet_count          2256 non-null   int64 
 7   favorite_count         2256 non-null   int64 
 8   label                  2256 non-null   int64 
 9   time_posted            2256 non-null   object
 10  date_posted            2256 non-null   object
dtypes: int64(3), object(8)
memory usage: 194.0+ KB


In [43]:
import csv

file_path = 'germanwings_response.csv'
ind = []
# Check the number of columns in each row
with open(file_path, 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    for i, row in enumerate(reader):
        if len(row) != 10:  # Assuming there should be 10 columns
            print(f"Row {i} is malformed: {row}")
            ind.append(i)

Row 168 is malformed: ['581303989406314496', '581305382099779584', '2361372481', '581303989406314496', '112562263', '@GR8_2B_alive']
Row 169 is malformed: []
Row 170 is malformed: [' It makes sense because Obama is flying America into the side of a mountain.', '0', '2', 'Fri Mar 27 04:02:57 +0000 2015', '1']
Row 848 is malformed: ['581287562947395584', '581293812120576000', '2570798579', '581290111410663424', '113655965', '@upma23 @DrShobha If Hindu rapes nun-majority ism']
Row 849 is malformed: ['If moslim rapes nun-secularism ']
Row 850 is malformed: ["Opps....i didn't know!", '0', '0', 'Fri Mar 27 03:16:59 +0000 2015', '1']
Row 2050 is malformed: ['581287795991302144', '581293235290529794', '2570798579', '581287795991302144', '162304163', '@mediacrooks @DrShobha If Hindu rapes nun-majority ism']
Row 2051 is malformed: ['If moslim rapes nun-secularism ']
Row 2052 is malformed: ["Opps....i didn't know!", '0', '0', 'Fri Mar 27 03:14:41 +0000 2015', '1']


In [47]:
display(germanwings_crash_r_res.loc[ind, :])
print(len(ind))

Unnamed: 0,main_tweet_id,response_tweet_id,user_id,in_reply_to_status_id,in_reply_to_user_id,response_tweet_text,retweet_count,favorite_count,label,time_posted,date_posted
168,581303989406314496,581306385247293440,503570249,581303989406314496,112562263,@GR8_2B_alive Too early to make this an issue....,0,1,1,04:06:57,2015-03-27
169,581303989406314496,581307187793956864,277320969,581303989406314496,112562263,@GR8_2B_alive The common denominator for whole...,0,1,1,04:10:08,2015-03-27
170,581303989406314496,581307433680670721,18524109,581303989406314496,112562263,@GR8_2B_alive Read that story. Follow the link...,0,2,1,04:11:07,2015-03-27
848,581287562947395584,581306111422177282,232246010,581290698512596992,87882279,@ganeshmahnar @rvaidya2000 @DrShobha They bel...,0,0,1,04:05:51,2015-03-27
849,581287562947395584,581375064584511488,386089446,581290698512596992,87882279,@ganeshmahnar Peace or pieces? @rvaidya2000 @D...,0,0,1,08:39:51,2015-03-27
850,581287562947395584,581376883935223808,87882279,581375064584511488,386089446,@ShubhamBhuyaat here meaning is eternal peace ...,0,0,1,08:47:05,2015-03-27
2050,581287795991302144,581302424423440384,3037749678,581287795991302144,162304163,@mediacrooks @DrShobha Germany is harbouring k...,0,0,1,03:51:12,2015-03-27
2051,581287795991302144,581306424476618753,1686700296,581287795991302144,162304163,"@mediacrooks Not all, but too many are. Like t...",1,2,1,04:07:06,2015-03-27
2052,581287795991302144,581318336442552320,51310666,581287795991302144,162304163,@mediacrooks @Aks9009Pa @DrShobha oh shit!,1,0,1,04:54:26,2015-03-27


9


##### add time delay

In [57]:
display(germanwings_crash_r_res[germanwings_crash_r_res['main_tweet_id']=='580328783619579904'])

Unnamed: 0,main_tweet_id,response_tweet_id,user_id,in_reply_to_status_id,in_reply_to_user_id,response_tweet_text,retweet_count,favorite_count,label,time_posted,date_posted


In [60]:
df_time = pd.DataFrame()

In [77]:
germanwings_s_rumors = set_time_delay(germanwings_s_rumors, germanwings_crash_r_res)
germanwings_s_rumors.info()

User 581066887200694272 not found
User 580885455257882624 not found
User 580350166005248000 not found
User 580328783619579904 not found
User 580327924978569216 not found
User 580372806178881537 not found
User 580347361039413248 not found
User 580336766663540736 not found
User 581293685041557504 not found
User 580346392431968257 not found
User 580344171019542528 not found
User 580353449721655296 not found
User 580321925265788928 not found
User 581064157870555136 not found
User 580320995266936832 not found
User 580338546856235009 not found
User 580322782392762368 not found
User 580346142183002113 not found
User 580885187837325312 not found
User 581357303405813760 not found
User 580324985148932097 not found
User 580321317527879680 not found
User 580368137796743168 not found
User 580347364562587649 not found
User 581317224096387072 not found
User 580331453889708032 not found
User 580327626419482624 not found
User 580347464907153408 not found
User 580326762673901568 not found
User 580329560

#### Charlie

In [80]:
charlie_source_df = fetch_source_df('charliehebdo', 'rumours')
charlie_source_df.info()

First folder name: 553487971497041920
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          458 non-null    object
 1   tweet_id         458 non-null    object
 2   tweet_text       458 non-null    object
 3   retweet_count    458 non-null    int64 
 4   favorite_count   458 non-null    int64 
 5   followers_count  458 non-null    int64 
 6   friends_count    458 non-null    int64 
 7   verified         458 non-null    int64 
 8   created_at       458 non-null    object
 9   label            458 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 35.9+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [81]:
charlie_r_res = fetch_response_tweets('charliehebdo', 'rumours','rumour')
charlie_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6887 entries, 0 to 6886
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6887 non-null   object
 1   response_tweet_id      6887 non-null   object
 2   user_id                6887 non-null   object
 3   in_reply_to_status_id  6887 non-null   object
 4   in_reply_to_user_id    6887 non-null   object
 5   response_tweet_text    6887 non-null   object
 6   retweet_count          6887 non-null   int64 
 7   favorite_count         6887 non-null   int64 
 8   response_created_at    6887 non-null   object
 9   label                  6887 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 538.2+ KB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [82]:
charlie_source_df = add_date_time(charlie_source_df, 'created_at')
charlie_source_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          458 non-null    object
 1   tweet_id         458 non-null    object
 2   tweet_text       458 non-null    object
 3   retweet_count    458 non-null    int64 
 4   favorite_count   458 non-null    int64 
 5   followers_count  458 non-null    int64 
 6   friends_count    458 non-null    int64 
 7   verified         458 non-null    int64 
 8   label            458 non-null    int64 
 9   time_posted      458 non-null    object
 10  date_posted      458 non-null    object
dtypes: int64(6), object(5)
memory usage: 39.5+ KB


In [85]:
charlie_r_res = add_date_time(charlie_r_res, 'response_created_at')
charlie_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6887 entries, 0 to 6886
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6887 non-null   object
 1   response_tweet_id      6887 non-null   object
 2   user_id                6887 non-null   object
 3   in_reply_to_status_id  6887 non-null   object
 4   in_reply_to_user_id    6887 non-null   object
 5   response_tweet_text    6887 non-null   object
 6   retweet_count          6887 non-null   int64 
 7   favorite_count         6887 non-null   int64 
 8   label                  6887 non-null   int64 
 9   time_posted            6887 non-null   object
 10  date_posted            6887 non-null   object
dtypes: int64(3), object(8)
memory usage: 592.0+ KB


In [87]:
charlie_source_df = set_time_delay(charlie_source_df, charlie_r_res)
charlie_source_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           458 non-null    object
 1   tweet_id          458 non-null    object
 2   tweet_text        458 non-null    object
 3   retweet_count     458 non-null    int64 
 4   favorite_count    458 non-null    int64 
 5   followers_count   458 non-null    int64 
 6   friends_count     458 non-null    int64 
 7   verified          458 non-null    int64 
 8   label             458 non-null    int64 
 9   time_delay (min)  458 non-null    object
dtypes: int64(6), object(4)
memory usage: 35.9+ KB


#### Ottawa

In [88]:
ottawa_rs_df = fetch_source_df('ottawashooting', 'rumours')
ottawa_rs_df.info()

First folder name: 524944399890124801
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          470 non-null    object
 1   tweet_id         470 non-null    object
 2   tweet_text       470 non-null    object
 3   retweet_count    470 non-null    int64 
 4   favorite_count   470 non-null    int64 
 5   followers_count  470 non-null    int64 
 6   friends_count    470 non-null    int64 
 7   verified         470 non-null    int64 
 8   created_at       470 non-null    object
 9   label            470 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 36.8+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [91]:
ottawa_r_res = fetch_response_tweets('ottawashooting', 'rumours','rumour')
ottawa_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5966 non-null   object
 1   response_tweet_id      5966 non-null   object
 2   user_id                5966 non-null   object
 3   in_reply_to_status_id  5966 non-null   object
 4   in_reply_to_user_id    5966 non-null   object
 5   response_tweet_text    5966 non-null   object
 6   retweet_count          5966 non-null   int64 
 7   favorite_count         5966 non-null   int64 
 8   response_created_at    5966 non-null   object
 9   label                  5966 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 466.2+ KB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [93]:
ottawa_rs_df = add_date_time(ottawa_rs_df, 'created_at')
ottawa_rs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          470 non-null    object
 1   tweet_id         470 non-null    object
 2   tweet_text       470 non-null    object
 3   retweet_count    470 non-null    int64 
 4   favorite_count   470 non-null    int64 
 5   followers_count  470 non-null    int64 
 6   friends_count    470 non-null    int64 
 7   verified         470 non-null    int64 
 8   label            470 non-null    int64 
 9   time_posted      470 non-null    object
 10  date_posted      470 non-null    object
dtypes: int64(6), object(5)
memory usage: 40.5+ KB


In [94]:
ottawa_r_res = add_date_time(ottawa_r_res, 'response_created_at')
ottawa_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5966 non-null   object
 1   response_tweet_id      5966 non-null   object
 2   user_id                5966 non-null   object
 3   in_reply_to_status_id  5966 non-null   object
 4   in_reply_to_user_id    5966 non-null   object
 5   response_tweet_text    5966 non-null   object
 6   retweet_count          5966 non-null   int64 
 7   favorite_count         5966 non-null   int64 
 8   label                  5966 non-null   int64 
 9   time_posted            5966 non-null   object
 10  date_posted            5966 non-null   object
dtypes: int64(3), object(8)
memory usage: 512.8+ KB


In [121]:
ottawa_rs_df = set_time_delay(ottawa_rs_df, ottawa_r_res)
ottawa_rs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           470 non-null    object
 1   tweet_id          470 non-null    object
 2   tweet_text        470 non-null    object
 3   retweet_count     470 non-null    int64 
 4   favorite_count    470 non-null    int64 
 5   followers_count   470 non-null    int64 
 6   friends_count     470 non-null    int64 
 7   verified          470 non-null    int64 
 8   label             470 non-null    int64 
 9   time_delay (min)  470 non-null    object
dtypes: int64(6), object(4)
memory usage: 36.8+ KB


#### Ferguson

In [140]:
ferguson_rs_df = fetch_source_df('ferguson', 'rumours')
ferguson_rs_df.info()

First folder name: 500291013521334272
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          284 non-null    object
 1   tweet_id         284 non-null    object
 2   tweet_text       284 non-null    object
 3   retweet_count    284 non-null    int64 
 4   favorite_count   284 non-null    int64 
 5   followers_count  284 non-null    int64 
 6   friends_count    284 non-null    int64 
 7   verified         284 non-null    int64 
 8   created_at       284 non-null    object
 9   label            284 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 22.3+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [141]:
ferguson_r_res = fetch_response_tweets('ferguson', 'rumours','rumour')
ferguson_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6195 non-null   object
 1   response_tweet_id      6195 non-null   object
 2   user_id                6195 non-null   object
 3   in_reply_to_status_id  6195 non-null   object
 4   in_reply_to_user_id    6195 non-null   object
 5   response_tweet_text    6195 non-null   object
 6   retweet_count          6195 non-null   int64 
 7   favorite_count         6195 non-null   int64 
 8   response_created_at    6195 non-null   object
 9   label                  6195 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 484.1+ KB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [142]:
ferguson_rs_df = add_date_time(ferguson_rs_df, 'created_at')
ferguson_rs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          284 non-null    object
 1   tweet_id         284 non-null    object
 2   tweet_text       284 non-null    object
 3   retweet_count    284 non-null    int64 
 4   favorite_count   284 non-null    int64 
 5   followers_count  284 non-null    int64 
 6   friends_count    284 non-null    int64 
 7   verified         284 non-null    int64 
 8   label            284 non-null    int64 
 9   time_posted      284 non-null    object
 10  date_posted      284 non-null    object
dtypes: int64(6), object(5)
memory usage: 24.5+ KB


In [143]:
ferguson_r_res = add_date_time(ferguson_r_res, 'response_created_at')
ferguson_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6195 entries, 0 to 6194
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          6195 non-null   object
 1   response_tweet_id      6195 non-null   object
 2   user_id                6195 non-null   object
 3   in_reply_to_status_id  6195 non-null   object
 4   in_reply_to_user_id    6195 non-null   object
 5   response_tweet_text    6195 non-null   object
 6   retweet_count          6195 non-null   int64 
 7   favorite_count         6195 non-null   int64 
 8   label                  6195 non-null   int64 
 9   time_posted            6195 non-null   object
 10  date_posted            6195 non-null   object
dtypes: int64(3), object(8)
memory usage: 532.5+ KB


In [144]:
ferguson_rs_df = set_time_delay(ferguson_rs_df, ferguson_r_res)
ferguson_rs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           284 non-null    object
 1   tweet_id          284 non-null    object
 2   tweet_text        284 non-null    object
 3   retweet_count     284 non-null    int64 
 4   favorite_count    284 non-null    int64 
 5   followers_count   284 non-null    int64 
 6   friends_count     284 non-null    int64 
 7   verified          284 non-null    int64 
 8   label             284 non-null    int64 
 9   time_delay (min)  284 non-null    object
dtypes: int64(6), object(4)
memory usage: 22.3+ KB


#### Sydney

In [100]:
sydney_sr = fetch_source_df('sydneysiege', 'rumours')
sydney_sr.info()

First folder name: 544284823041212416
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          522 non-null    object
 1   tweet_id         522 non-null    object
 2   tweet_text       522 non-null    object
 3   retweet_count    522 non-null    int64 
 4   favorite_count   522 non-null    int64 
 5   followers_count  522 non-null    int64 
 6   friends_count    522 non-null    int64 
 7   verified         522 non-null    int64 
 8   created_at       522 non-null    object
 9   label            522 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 40.9+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [101]:
sydney_r_res = fetch_response_tweets('sydneysiege', 'rumours','rumour')
sydney_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          8154 non-null   object
 1   response_tweet_id      8154 non-null   object
 2   user_id                8154 non-null   object
 3   in_reply_to_status_id  8154 non-null   object
 4   in_reply_to_user_id    8154 non-null   object
 5   response_tweet_text    8154 non-null   object
 6   retweet_count          8154 non-null   int64 
 7   favorite_count         8154 non-null   int64 
 8   response_created_at    8154 non-null   object
 9   label                  8154 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 637.2+ KB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [104]:
sydney_sr = add_date_time(sydney_sr, 'created_at')
sydney_sr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          522 non-null    object
 1   tweet_id         522 non-null    object
 2   tweet_text       522 non-null    object
 3   retweet_count    522 non-null    int64 
 4   favorite_count   522 non-null    int64 
 5   followers_count  522 non-null    int64 
 6   friends_count    522 non-null    int64 
 7   verified         522 non-null    int64 
 8   label            522 non-null    int64 
 9   time_posted      522 non-null    object
 10  date_posted      522 non-null    object
dtypes: int64(6), object(5)
memory usage: 45.0+ KB


In [103]:
sydney_r_res = add_date_time(sydney_r_res, 'response_created_at')
sydney_r_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8154 entries, 0 to 8153
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          8154 non-null   object
 1   response_tweet_id      8154 non-null   object
 2   user_id                8154 non-null   object
 3   in_reply_to_status_id  8154 non-null   object
 4   in_reply_to_user_id    8154 non-null   object
 5   response_tweet_text    8154 non-null   object
 6   retweet_count          8154 non-null   int64 
 7   favorite_count         8154 non-null   int64 
 8   label                  8154 non-null   int64 
 9   time_posted            8154 non-null   object
 10  date_posted            8154 non-null   object
dtypes: int64(3), object(8)
memory usage: 700.9+ KB


In [105]:
sydney_sr = set_time_delay(sydney_sr, sydney_r_res)
sydney_sr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           522 non-null    object
 1   tweet_id          522 non-null    object
 2   tweet_text        522 non-null    object
 3   retweet_count     522 non-null    int64 
 4   favorite_count    522 non-null    int64 
 5   followers_count   522 non-null    int64 
 6   friends_count     522 non-null    int64 
 7   verified          522 non-null    int64 
 8   label             522 non-null    int64 
 9   time_delay (min)  522 non-null    object
dtypes: int64(6), object(4)
memory usage: 40.9+ KB


In [129]:
rumor_data.head()

Unnamed: 0,user_id,tweet_id,tweet_text,friends_count,followers_count,is_verified,created_at,label
0,19038934,524944399890124801,Recap: Gunman shot dead inside Parliament buil...,264,366215,True,Wed Oct 22 15:24:32 +0000 2014,rumor
1,410334710,525023642007371776,Soldier killed at war memorial identified as N...,674,18803,True,Wed Oct 22 20:39:25 +0000 2014,rumor
2,8736882,525026219100995584,Soldier killed at war memorial identified as C...,415,543305,True,Wed Oct 22 20:49:39 +0000 2014,rumor
3,2097571,524945684966166528,BREAKING: Two or three gunmen were involved in...,373,2735682,True,Wed Oct 22 15:29:38 +0000 2014,rumor
4,19038934,525051365195014144,All 3 patients injured in #OttawaShooting rele...,264,366198,True,Wed Oct 22 22:29:35 +0000 2014,rumor


In [103]:
from datetime import datetime

In [144]:
rumor_data = add_date_time(rumor_data, 'created_at')
rumor_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          470 non-null    int64 
 1   tweet_id         470 non-null    int64 
 2   tweet_text       470 non-null    object
 3   friends_count    470 non-null    int64 
 4   followers_count  470 non-null    int64 
 5   is_verified      470 non-null    bool  
 6   label            470 non-null    object
 7   time_posted      470 non-null    object
 8   date_posted      470 non-null    object
dtypes: bool(1), int64(4), object(4)
memory usage: 30.0+ KB


In [136]:
rumor_data.head()

Unnamed: 0,user_id,tweet_id,tweet_text,friends_count,followers_count,is_verified,label,time_posted,date_posted
0,19038934,524944399890124801,Recap: Gunman shot dead inside Parliament buil...,264,366215,True,rumor,15:24:32,2014-10-22
1,410334710,525023642007371776,Soldier killed at war memorial identified as N...,674,18803,True,rumor,20:39:25,2014-10-22
2,8736882,525026219100995584,Soldier killed at war memorial identified as C...,415,543305,True,rumor,20:49:39,2014-10-22
3,2097571,524945684966166528,BREAKING: Two or three gunmen were involved in...,373,2735682,True,rumor,15:29:38,2014-10-22
4,19038934,525051365195014144,All 3 patients injured in #OttawaShooting rele...,264,366198,True,rumor,22:29:35,2014-10-22


In [145]:
rumor_responses = pd.read_csv('rumor_response_data.csv')
rumor_responses.head()

Unnamed: 0,main_tweet_id,user_id,response_tweet_id,response_created_at
0,524944399890124801,183777979,524945108291297280,Wed Oct 22 15:27:21 +0000 2014
1,524944399890124801,265023219,524945130113863680,Wed Oct 22 15:27:26 +0000 2014
2,524944399890124801,225191038,524945376911314945,Wed Oct 22 15:28:25 +0000 2014
3,524944399890124801,728394116,524946532416884736,Wed Oct 22 15:33:01 +0000 2014
4,524944399890124801,161439565,524946767209459712,Wed Oct 22 15:33:57 +0000 2014


In [139]:
rumor_responses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   main_tweet_id      5966 non-null   int64 
 1   user_id            5966 non-null   int64 
 2   response_tweet_id  5966 non-null   int64 
 3   time_posted        5966 non-null   object
 4   date_posted        5966 non-null   object
dtypes: int64(3), object(2)
memory usage: 233.2+ KB


In [146]:
rumor_responses = add_date_time(rumor_responses, 'response_created_at')
rumor_responses.head()

Unnamed: 0,main_tweet_id,user_id,response_tweet_id,time_posted,date_posted
0,524944399890124801,183777979,524945108291297280,15:27:21,2014-10-22
1,524944399890124801,265023219,524945130113863680,15:27:26,2014-10-22
2,524944399890124801,225191038,524945376911314945,15:28:25,2014-10-22
3,524944399890124801,728394116,524946532416884736,15:33:01,2014-10-22
4,524944399890124801,161439565,524946767209459712,15:33:57,2014-10-22


In [125]:
tweet_ids = [524950428598153216,
524994809912504321,
524949315245322241,
524991452527722496,
524952094986350592,
524977076433321987,
525040509937795072,
525039208139087872,
525046462460477440,
524952407625957376]

vals = rumor_responses[rumor_responses['main_tweet_id'].isin(tweet_ids)]
vals

Unnamed: 0,main_tweet_id,user_id,response_tweet_id,time_posted,date_posted
4684,525046462460477440,2669633142,525080115852873728,00:23:49,2014-10-23
4685,525046462460477440,915909072,525080661569589248,00:25:59,2014-10-23
4686,525046462460477440,249889994,525081424945496064,00:29:01,2014-10-23
4687,525046462460477440,1932263718,525081548983652352,00:29:31,2014-10-23
4688,525046462460477440,569188164,525083618700705792,00:37:44,2014-10-23
4689,525046462460477440,2373123864,525087902485999616,00:54:46,2014-10-23
4690,525046462460477440,383932808,525090768634257408,01:06:09,2014-10-23
4691,525046462460477440,1970429648,525092216772231170,01:11:54,2014-10-23
4692,525046462460477440,298598869,525092713180700672,01:13:53,2014-10-23
4693,525046462460477440,383932808,525093318104219648,01:16:17,2014-10-23


In [126]:
val = rumor_data[rumor_data['tweet_id'].isin(tweet_ids)]
val

Unnamed: 0,user_id,tweet_id,tweet_text,friends_count,followers_count,is_verified,label,time_delay
124,49654153,524950428598153216,.@ctvottawa confirms there were 3 separate sho...,108,19633,True,rumor,85132
178,910828603,524994809912504321,Thoughts &amp; prayers go out to the soldier k...,588,510235,False,rumor,74550
204,980908736,524949315245322241,Gun fire exchange in Parliament Hill building ...,161,7756,False,rumor,85397
248,234874463,524991452527722496,RIP to the Canadian soldier killed today in #O...,69,258187,False,rumor,75351
298,99028224,524952094986350592,Our thoughts and prayers are with the Canadian...,276,11865,True,rumor,84734
307,18999969,524977076433321987,Ottawa police and RCMP will be announcing a jo...,296,63044,False,rumor,78778
350,994782234,525040509937795072,RIP Corporal Nathan Cirillo Thank you for serv...,313,2308,False,rumor,63654
366,4970411,525039208139087872,DEVELOPING: #Canada officials identify dead gu...,136,2134176,True,rumor,63965
372,106272927,525046462460477440,Heartbreaking Photo Shows Nathan Cirillo Momen...,1456,1889,False,rumor,8023
421,18060885,524952407625957376,Seeing reports that soldier shot at War Memori...,791,1592,False,rumor,84660


In [147]:
rumor_data = set_time_delay(rumor_data, rumor_responses)
rumor_data.head()

Unnamed: 0,user_id,tweet_id,tweet_text,friends_count,followers_count,is_verified,label,time_delay (min)
0,19038934,524944399890124801,Recap: Gunman shot dead inside Parliament buil...,264,366215,True,rumor,2.8167
1,410334710,525023642007371776,Soldier killed at war memorial identified as N...,674,18803,True,rumor,3.0167
2,8736882,525026219100995584,Soldier killed at war memorial identified as C...,415,543305,True,rumor,14.0167
3,2097571,524945684966166528,BREAKING: Two or three gunmen were involved in...,373,2735682,True,rumor,1.55
4,19038934,525051365195014144,All 3 patients injured in #OttawaShooting rele...,264,366198,True,rumor,0.4


### Non-rumor

#### Germanwings Crash 

In [106]:
germanwings_s_nr = fetch_source_df('germanwings-crash', 'non-rumours')
print(germanwings_s_nr.info())

First folder name: 580330986220687360
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          231 non-null    object
 1   tweet_id         231 non-null    object
 2   tweet_text       231 non-null    object
 3   retweet_count    231 non-null    int64 
 4   favorite_count   231 non-null    int64 
 5   followers_count  231 non-null    int64 
 6   friends_count    231 non-null    int64 
 7   verified         231 non-null    int64 
 8   created_at       231 non-null    object
 9   label            231 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 18.2+ KB
None


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [107]:
germanwings_nr_res = fetch_response_tweets('germanwings-crash', 'non-rumours','non-rumour')
print(germanwings_nr_res.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          1764 non-null   object
 1   response_tweet_id      1764 non-null   object
 2   user_id                1764 non-null   object
 3   in_reply_to_status_id  1764 non-null   object
 4   in_reply_to_user_id    1764 non-null   object
 5   response_tweet_text    1764 non-null   object
 6   retweet_count          1764 non-null   int64 
 7   favorite_count         1764 non-null   int64 
 8   response_created_at    1764 non-null   object
 9   label                  1764 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 137.9+ KB
None


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [108]:
germanwings_s_nr = add_date_time(germanwings_s_nr, 'created_at')
germanwings_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          231 non-null    object
 1   tweet_id         231 non-null    object
 2   tweet_text       231 non-null    object
 3   retweet_count    231 non-null    int64 
 4   favorite_count   231 non-null    int64 
 5   followers_count  231 non-null    int64 
 6   friends_count    231 non-null    int64 
 7   verified         231 non-null    int64 
 8   label            231 non-null    int64 
 9   time_posted      231 non-null    object
 10  date_posted      231 non-null    object
dtypes: int64(6), object(5)
memory usage: 20.0+ KB


In [109]:
germanwings_nr_res = add_date_time(germanwings_nr_res, 'response_created_at')
germanwings_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          1764 non-null   object
 1   response_tweet_id      1764 non-null   object
 2   user_id                1764 non-null   object
 3   in_reply_to_status_id  1764 non-null   object
 4   in_reply_to_user_id    1764 non-null   object
 5   response_tweet_text    1764 non-null   object
 6   retweet_count          1764 non-null   int64 
 7   favorite_count         1764 non-null   int64 
 8   label                  1764 non-null   int64 
 9   time_posted            1764 non-null   object
 10  date_posted            1764 non-null   object
dtypes: int64(3), object(8)
memory usage: 151.7+ KB


In [110]:
germanwings_s_nr = set_time_delay(germanwings_s_nr, germanwings_nr_res)
germanwings_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           231 non-null    object
 1   tweet_id          231 non-null    object
 2   tweet_text        231 non-null    object
 3   retweet_count     231 non-null    int64 
 4   favorite_count    231 non-null    int64 
 5   followers_count   231 non-null    int64 
 6   friends_count     231 non-null    int64 
 7   verified          231 non-null    int64 
 8   label             231 non-null    int64 
 9   time_delay (min)  231 non-null    object
dtypes: int64(6), object(4)
memory usage: 18.2+ KB


#### Charlie

In [111]:
charlie_s_nr = fetch_source_df('charliehebdo', 'non-rumours')
charlie_s_nr.info()

First folder name: 552829395690201088
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          1621 non-null   object
 1   tweet_id         1621 non-null   object
 2   tweet_text       1621 non-null   object
 3   retweet_count    1621 non-null   int64 
 4   favorite_count   1621 non-null   int64 
 5   followers_count  1621 non-null   int64 
 6   friends_count    1621 non-null   int64 
 7   verified         1621 non-null   int64 
 8   created_at       1621 non-null   object
 9   label            1621 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 126.8+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [112]:
charlie_nr_res = fetch_response_tweets('charliehebdo', 'non-rumours','non-rumour')
charlie_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          29302 non-null  object
 1   response_tweet_id      29302 non-null  object
 2   user_id                29302 non-null  object
 3   in_reply_to_status_id  29302 non-null  object
 4   in_reply_to_user_id    29302 non-null  object
 5   response_tweet_text    29302 non-null  object
 6   retweet_count          29302 non-null  int64 
 7   favorite_count         29302 non-null  int64 
 8   response_created_at    29302 non-null  object
 9   label                  29302 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 2.2+ MB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [113]:
charlie_s_nr = add_date_time(charlie_s_nr, 'created_at')
charlie_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          1621 non-null   object
 1   tweet_id         1621 non-null   object
 2   tweet_text       1621 non-null   object
 3   retweet_count    1621 non-null   int64 
 4   favorite_count   1621 non-null   int64 
 5   followers_count  1621 non-null   int64 
 6   friends_count    1621 non-null   int64 
 7   verified         1621 non-null   int64 
 8   label            1621 non-null   int64 
 9   time_posted      1621 non-null   object
 10  date_posted      1621 non-null   object
dtypes: int64(6), object(5)
memory usage: 139.4+ KB


In [114]:
charlie_nr_res = add_date_time(charlie_nr_res, 'response_created_at')
charlie_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          29302 non-null  object
 1   response_tweet_id      29302 non-null  object
 2   user_id                29302 non-null  object
 3   in_reply_to_status_id  29302 non-null  object
 4   in_reply_to_user_id    29302 non-null  object
 5   response_tweet_text    29302 non-null  object
 6   retweet_count          29302 non-null  int64 
 7   favorite_count         29302 non-null  int64 
 8   label                  29302 non-null  int64 
 9   time_posted            29302 non-null  object
 10  date_posted            29302 non-null  object
dtypes: int64(3), object(8)
memory usage: 2.5+ MB


In [115]:
charlie_s_nr = set_time_delay(charlie_s_nr, charlie_nr_res)
charlie_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           1621 non-null   object
 1   tweet_id          1621 non-null   object
 2   tweet_text        1621 non-null   object
 3   retweet_count     1621 non-null   int64 
 4   favorite_count    1621 non-null   int64 
 5   followers_count   1621 non-null   int64 
 6   friends_count     1621 non-null   int64 
 7   verified          1621 non-null   int64 
 8   label             1621 non-null   int64 
 9   time_delay (min)  1621 non-null   object
dtypes: int64(6), object(4)
memory usage: 126.8+ KB


#### Ottawa

In [116]:
ottawa_s_nr = fetch_source_df('ottawashooting', 'non-rumours')
ottawa_s_nr.info()

First folder name: 524941318389501953
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          420 non-null    object
 1   tweet_id         420 non-null    object
 2   tweet_text       420 non-null    object
 3   retweet_count    420 non-null    int64 
 4   favorite_count   420 non-null    int64 
 5   followers_count  420 non-null    int64 
 6   friends_count    420 non-null    int64 
 7   verified         420 non-null    int64 
 8   created_at       420 non-null    object
 9   label            420 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 32.9+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [117]:
ottawa_nr_res = fetch_response_tweets('ottawashooting', 'non-rumours','non-rumour')
ottawa_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5428 non-null   object
 1   response_tweet_id      5428 non-null   object
 2   user_id                5428 non-null   object
 3   in_reply_to_status_id  5428 non-null   object
 4   in_reply_to_user_id    5428 non-null   object
 5   response_tweet_text    5428 non-null   object
 6   retweet_count          5428 non-null   int64 
 7   favorite_count         5428 non-null   int64 
 8   response_created_at    5428 non-null   object
 9   label                  5428 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 424.2+ KB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [118]:
ottawa_s_nr = add_date_time(ottawa_s_nr, 'created_at')
ottawa_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          420 non-null    object
 1   tweet_id         420 non-null    object
 2   tweet_text       420 non-null    object
 3   retweet_count    420 non-null    int64 
 4   favorite_count   420 non-null    int64 
 5   followers_count  420 non-null    int64 
 6   friends_count    420 non-null    int64 
 7   verified         420 non-null    int64 
 8   label            420 non-null    int64 
 9   time_posted      420 non-null    object
 10  date_posted      420 non-null    object
dtypes: int64(6), object(5)
memory usage: 36.2+ KB


In [119]:
ottawa_nr_res = add_date_time(ottawa_nr_res, 'response_created_at')
ottawa_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          5428 non-null   object
 1   response_tweet_id      5428 non-null   object
 2   user_id                5428 non-null   object
 3   in_reply_to_status_id  5428 non-null   object
 4   in_reply_to_user_id    5428 non-null   object
 5   response_tweet_text    5428 non-null   object
 6   retweet_count          5428 non-null   int64 
 7   favorite_count         5428 non-null   int64 
 8   label                  5428 non-null   int64 
 9   time_posted            5428 non-null   object
 10  date_posted            5428 non-null   object
dtypes: int64(3), object(8)
memory usage: 466.6+ KB


In [120]:
ottawa_s_nr = set_time_delay(ottawa_s_nr, ottawa_nr_res)
ottawa_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           420 non-null    object
 1   tweet_id          420 non-null    object
 2   tweet_text        420 non-null    object
 3   retweet_count     420 non-null    int64 
 4   favorite_count    420 non-null    int64 
 5   followers_count   420 non-null    int64 
 6   friends_count     420 non-null    int64 
 7   verified          420 non-null    int64 
 8   label             420 non-null    int64 
 9   time_delay (min)  420 non-null    object
dtypes: int64(6), object(4)
memory usage: 32.9+ KB


#### Ferguson

In [122]:
ferguson_s_nr = fetch_source_df('ferguson', 'non-rumours')
ferguson_s_nr.info()

First folder name: 499409455814287360
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          859 non-null    object
 1   tweet_id         859 non-null    object
 2   tweet_text       859 non-null    object
 3   retweet_count    859 non-null    int64 
 4   favorite_count   859 non-null    int64 
 5   followers_count  859 non-null    int64 
 6   friends_count    859 non-null    int64 
 7   verified         859 non-null    int64 
 8   created_at       859 non-null    object
 9   label            859 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 67.2+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [123]:
ferguson_nr_res = fetch_response_tweets('ferguson', 'non-rumours','non-rumour')
ferguson_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16837 entries, 0 to 16836
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          16837 non-null  object
 1   response_tweet_id      16837 non-null  object
 2   user_id                16837 non-null  object
 3   in_reply_to_status_id  16837 non-null  object
 4   in_reply_to_user_id    16837 non-null  object
 5   response_tweet_text    16837 non-null  object
 6   retweet_count          16837 non-null  int64 
 7   favorite_count         16837 non-null  int64 
 8   response_created_at    16837 non-null  object
 9   label                  16837 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 1.3+ MB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [125]:
ferguson_s_nr = add_date_time(ferguson_s_nr, 'created_at')
ferguson_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          859 non-null    object
 1   tweet_id         859 non-null    object
 2   tweet_text       859 non-null    object
 3   retweet_count    859 non-null    int64 
 4   favorite_count   859 non-null    int64 
 5   followers_count  859 non-null    int64 
 6   friends_count    859 non-null    int64 
 7   verified         859 non-null    int64 
 8   label            859 non-null    int64 
 9   time_posted      859 non-null    object
 10  date_posted      859 non-null    object
dtypes: int64(6), object(5)
memory usage: 73.9+ KB


In [124]:
ferguson_nr_res = add_date_time(ferguson_nr_res, 'response_created_at')
ferguson_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16837 entries, 0 to 16836
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          16837 non-null  object
 1   response_tweet_id      16837 non-null  object
 2   user_id                16837 non-null  object
 3   in_reply_to_status_id  16837 non-null  object
 4   in_reply_to_user_id    16837 non-null  object
 5   response_tweet_text    16837 non-null  object
 6   retweet_count          16837 non-null  int64 
 7   favorite_count         16837 non-null  int64 
 8   label                  16837 non-null  int64 
 9   time_posted            16837 non-null  object
 10  date_posted            16837 non-null  object
dtypes: int64(3), object(8)
memory usage: 1.4+ MB


In [126]:
ferguson_s_nr = set_time_delay(ferguson_s_nr, ferguson_nr_res)
ferguson_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           859 non-null    object
 1   tweet_id          859 non-null    object
 2   tweet_text        859 non-null    object
 3   retweet_count     859 non-null    int64 
 4   favorite_count    859 non-null    int64 
 5   followers_count   859 non-null    int64 
 6   friends_count     859 non-null    int64 
 7   verified          859 non-null    int64 
 8   label             859 non-null    int64 
 9   time_delay (min)  859 non-null    object
dtypes: int64(6), object(4)
memory usage: 67.2+ KB


#### Sydney

In [127]:
sydney_s_nr = fetch_source_df('sydneysiege', 'non-rumours')
sydney_s_nr.info()

First folder name: 544506678922977280
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          699 non-null    object
 1   tweet_id         699 non-null    object
 2   tweet_text       699 non-null    object
 3   retweet_count    699 non-null    int64 
 4   favorite_count   699 non-null    int64 
 5   followers_count  699 non-null    int64 
 6   friends_count    699 non-null    int64 
 7   verified         699 non-null    int64 
 8   created_at       699 non-null    object
 9   label            699 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 54.7+ KB


  df['label'] = df['label'].replace({'rumor': 1, 'non-rumor': 0})
  df['verified'] = df['verified'].replace({True: 1, False: 0})


In [128]:
sydney_nr_res = fetch_response_tweets('sydneysiege', 'non-rumours','non-rumour')
sydney_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14621 entries, 0 to 14620
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          14621 non-null  object
 1   response_tweet_id      14621 non-null  object
 2   user_id                14621 non-null  object
 3   in_reply_to_status_id  14621 non-null  object
 4   in_reply_to_user_id    14621 non-null  object
 5   response_tweet_text    14621 non-null  object
 6   retweet_count          14621 non-null  int64 
 7   favorite_count         14621 non-null  int64 
 8   response_created_at    14621 non-null  object
 9   label                  14621 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 1.1+ MB


  response_tweet_data['label'] = response_tweet_data['label'].replace({'rumour': 1, 'non-rumour': 0})


In [129]:
sydney_s_nr = add_date_time(sydney_s_nr, 'created_at')
sydney_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          699 non-null    object
 1   tweet_id         699 non-null    object
 2   tweet_text       699 non-null    object
 3   retweet_count    699 non-null    int64 
 4   favorite_count   699 non-null    int64 
 5   followers_count  699 non-null    int64 
 6   friends_count    699 non-null    int64 
 7   verified         699 non-null    int64 
 8   label            699 non-null    int64 
 9   time_posted      699 non-null    object
 10  date_posted      699 non-null    object
dtypes: int64(6), object(5)
memory usage: 60.2+ KB


In [130]:
sydney_nr_res = add_date_time(sydney_nr_res, 'response_created_at')
sydney_nr_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14621 entries, 0 to 14620
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   main_tweet_id          14621 non-null  object
 1   response_tweet_id      14621 non-null  object
 2   user_id                14621 non-null  object
 3   in_reply_to_status_id  14621 non-null  object
 4   in_reply_to_user_id    14621 non-null  object
 5   response_tweet_text    14621 non-null  object
 6   retweet_count          14621 non-null  int64 
 7   favorite_count         14621 non-null  int64 
 8   label                  14621 non-null  int64 
 9   time_posted            14621 non-null  object
 10  date_posted            14621 non-null  object
dtypes: int64(3), object(8)
memory usage: 1.2+ MB


In [131]:
sydney_s_nr = set_time_delay(sydney_s_nr, sydney_nr_res)
sydney_s_nr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           699 non-null    object
 1   tweet_id          699 non-null    object
 2   tweet_text        699 non-null    object
 3   retweet_count     699 non-null    int64 
 4   favorite_count    699 non-null    int64 
 5   followers_count   699 non-null    int64 
 6   friends_count     699 non-null    int64 
 7   verified          699 non-null    int64 
 8   label             699 non-null    int64 
 9   time_delay (min)  699 non-null    object
dtypes: int64(6), object(4)
memory usage: 54.7+ KB


## Sentiment analysis with VADER

In [78]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m968.9 kB/s[0m eta [36m0:00:00[0mB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [68]:
#download the VADER package
import nltk 
nltk.downloader.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/sct/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [132]:
# import the SentimentIntensityAnalyzer class from the nltk.sentiment.vader module
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [133]:
def get_sentiments(df_in):
    
    neg_sent = []
    pos_sent = []
    neu_sent = []
    comp_sent = []

    #get neg, pos, neu and compound sentiment for each tweet in the df
    for i in df_in.index:
        tweet = df_in.loc[i, 'tweet_text']
        sentiment_dict = analyzer.polarity_scores(tweet)
        neg_sent.append(sentiment_dict['neg'])
        pos_sent.append(sentiment_dict['pos'])
        neu_sent.append(sentiment_dict['neu'])
        comp_sent.append(sentiment_dict['compound'])
    
    
    data = {'neg_sent':neg_sent,'pos_sent':pos_sent, 
            'neu_sent':neu_sent, 'comp_sent':comp_sent}
    df_out = pd.DataFrame(data)
    df = pd.concat([df_in, df_out], axis=1)
    return df


In [134]:
germanwings_s_rumors = get_sentiments(germanwings_s_rumors)
germanwings_s_rumors.head()

Unnamed: 0,user_id,tweet_id,tweet_text,retweet_count,favorite_count,followers_count,friends_count,verified,label,time_delay (min),neg_sent,pos_sent,neu_sent,comp_sent
0,16887175,580387098039046145,"Reports of ""moving body"" amidst #Germanwings w...",38,15,337960,6384,1,1,1.2833,0.0,0.0,1.0,0.0
1,328541805,580320242020290560,BREAKING:148passengers were on board #GermanWi...,43,15,52815,293,0,1,1.5,0.0,0.14,0.86,0.3818
2,7309052,580699430690656256,BREAKING: #Germanwings crash victims include 7...,31,5,893549,2312,1,1,4.1333,0.278,0.0,0.722,-0.6124
3,15513604,580340476949086208,BREAKING: 148 feared dead in crashed #Germanwi...,167,32,418641,1859,1,1,3.5167,0.517,0.0,0.483,-0.8176
4,93735135,580320893668364288,Terrible news... Airbus A320 from Barcelona to...,26,1,11062,233,0,1,3.3667,0.171,0.0,0.829,-0.4767


In [135]:
germanwings_s_nr = get_sentiments(germanwings_s_nr)
germanwings_s_nr.head()

Unnamed: 0,user_id,tweet_id,tweet_text,retweet_count,favorite_count,followers_count,friends_count,verified,label,time_delay (min),neg_sent,pos_sent,neu_sent,comp_sent
0,16973333,580330986220687360,#BREAKING: A helicopter has located the Airbus...,94,9,1103558,1758,1,0,3.8,0.184,0.0,0.816,-0.4019
1,64643056,580343329096904704,#Germanwings crash: Arrivals board at #Dusseld...,141,36,891630,482,1,0,0.5167,0.252,0.0,0.748,-0.4019
2,227837742,580883100533690368,"If one pilot leaves, somebody else is supposed...",121,49,525630,139,1,0,2.65,0.0,0.0,1.0,0.0
3,54756055,580355037504909312,"My thoughts, prayers, and deepest condolences ...",1545,1123,183150,68,0,0,88.3333,0.0,0.0,1.0,0.0
4,16664681,581295422414680065,A look at the cockpit-security measures that p...,49,25,1324856,10772,1,0,0.7167,0.0,0.084,0.916,0.0258


In [136]:
germanwings_df = pd.concat([germanwings_s_rumors, germanwings_s_nr], axis=0, ignore_index=True)
germanwings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469 entries, 0 to 468
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           469 non-null    object 
 1   tweet_id          469 non-null    object 
 2   tweet_text        469 non-null    object 
 3   retweet_count     469 non-null    int64  
 4   favorite_count    469 non-null    int64  
 5   followers_count   469 non-null    int64  
 6   friends_count     469 non-null    int64  
 7   verified          469 non-null    int64  
 8   label             469 non-null    int64  
 9   time_delay (min)  469 non-null    object 
 10  neg_sent          469 non-null    float64
 11  pos_sent          469 non-null    float64
 12  neu_sent          469 non-null    float64
 13  comp_sent         469 non-null    float64
dtypes: float64(4), int64(6), object(4)
memory usage: 51.4+ KB


In [137]:
charlie_source_df = get_sentiments(charlie_source_df)
charlie_s_nr = get_sentiments(charlie_s_nr)
charlie_df = pd.concat([charlie_source_df, charlie_s_nr], axis=0, ignore_index=True)
charlie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           2079 non-null   object 
 1   tweet_id          2079 non-null   object 
 2   tweet_text        2079 non-null   object 
 3   retweet_count     2079 non-null   int64  
 4   favorite_count    2079 non-null   int64  
 5   followers_count   2079 non-null   int64  
 6   friends_count     2079 non-null   int64  
 7   verified          2079 non-null   int64  
 8   label             2079 non-null   int64  
 9   time_delay (min)  2079 non-null   object 
 10  neg_sent          2079 non-null   float64
 11  pos_sent          2079 non-null   float64
 12  neu_sent          2079 non-null   float64
 13  comp_sent         2079 non-null   float64
dtypes: float64(4), int64(6), object(4)
memory usage: 227.5+ KB


In [138]:
ottawa_rs_df = get_sentiments(ottawa_rs_df)
ottawa_s_nr = get_sentiments(ottawa_s_nr)
ottawa_df = pd.concat([ottawa_rs_df, ottawa_s_nr], axis=0, ignore_index=True)
ottawa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           890 non-null    object 
 1   tweet_id          890 non-null    object 
 2   tweet_text        890 non-null    object 
 3   retweet_count     890 non-null    int64  
 4   favorite_count    890 non-null    int64  
 5   followers_count   890 non-null    int64  
 6   friends_count     890 non-null    int64  
 7   verified          890 non-null    int64  
 8   label             890 non-null    int64  
 9   time_delay (min)  890 non-null    object 
 10  neg_sent          890 non-null    float64
 11  pos_sent          890 non-null    float64
 12  neu_sent          890 non-null    float64
 13  comp_sent         890 non-null    float64
dtypes: float64(4), int64(6), object(4)
memory usage: 97.5+ KB


In [147]:
ferguson_rs_df = get_sentiments(ferguson_rs_df)
ferguson_s_nr = get_sentiments(ferguson_s_nr)
ferguson_df = pd.concat([ferguson_rs_df, ferguson_s_nr], axis=0, ignore_index=True)
ferguson_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           1143 non-null   object 
 1   tweet_id          1143 non-null   object 
 2   tweet_text        1143 non-null   object 
 3   retweet_count     1143 non-null   int64  
 4   favorite_count    1143 non-null   int64  
 5   followers_count   1143 non-null   int64  
 6   friends_count     1143 non-null   int64  
 7   verified          1143 non-null   int64  
 8   label             1143 non-null   int64  
 9   time_delay (min)  1143 non-null   object 
 10  neg_sent          1143 non-null   float64
 11  pos_sent          1143 non-null   float64
 12  neu_sent          1143 non-null   float64
 13  comp_sent         1143 non-null   float64
dtypes: float64(4), int64(6), object(4)
memory usage: 125.1+ KB


In [148]:
sydney_sr = get_sentiments(sydney_sr)
sydney_s_nr = get_sentiments(sydney_s_nr)
sydney_df = pd.concat([sydney_sr, sydney_s_nr], axis=0, ignore_index=True)
sydney_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1221 entries, 0 to 1220
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           1221 non-null   object 
 1   tweet_id          1221 non-null   object 
 2   tweet_text        1221 non-null   object 
 3   retweet_count     1221 non-null   int64  
 4   favorite_count    1221 non-null   int64  
 5   followers_count   1221 non-null   int64  
 6   friends_count     1221 non-null   int64  
 7   verified          1221 non-null   int64  
 8   label             1221 non-null   int64  
 9   time_delay (min)  1221 non-null   object 
 10  neg_sent          1221 non-null   float64
 11  pos_sent          1221 non-null   float64
 12  neu_sent          1221 non-null   float64
 13  comp_sent         1221 non-null   float64
dtypes: float64(4), int64(6), object(4)
memory usage: 133.7+ KB


## Save datasets

In [149]:
germanwings_df.to_csv('germanwings.csv', index=False)
print('File saved')

File saved


In [150]:
charlie_df.to_csv('charlie.csv', index=False)
print('File saved')

File saved


In [151]:
ottawa_df.to_csv('ottawa.csv', index=False)
print('File saved')

File saved


In [152]:
ferguson_df.to_csv('ferguson.csv', index=False)
print('File saved')

File saved


In [153]:
sydney_df.to_csv('sydney.csv', index=False)
print('File saved')

File saved
