In [253]:
import pandas as pd
import numpy as np
import re
import nltk
import joblib
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler

**Reading CSV and processing text**

In [254]:
def pre_process_text(df):
    text = list(df['text'])
    sentiment = []
    for i in range(len(text)):
        if type(text[i]) != float:
            text[i] = " ".join(filter(lambda x:x[0]!='@', text[i].split()))
            text[i] = " ".join(filter(lambda x:x[0:4]!='http', text[i].split()))
            text[i] = " ".join(filter(lambda x:x[0:4]!='&amp', text[i].split()))
            text[i] = re.sub(r'[^a-zA-Z ]','',text[i])
            text[i] = " ".join(filter(lambda x:x[0:1]!='Q', text[i].split()))
            text[i] = " ".join(filter(lambda x:x[0:1]!='A', text[i].split()))
        sentiment.append(TextBlob(str(text[i])).sentiment.polarity)
    return text, sentiment

In [255]:
def prepare_df(path):
    sources = joblib.load("Data/" + path)
    sources['text'], sources['polarity'] = pre_process_text(sources)
    sources.text = sources.text + " " + sources.hashtag
    return sources


In [256]:
train_sources = prepare_df('train_sources.data')
train_replies = prepare_df('train_replies.data')
dev_sources = prepare_df('dev_sources.data')
dev_replies = prepare_df('dev_replies.data')
test_sources = prepare_df('test_sources.data')
test_replies = prepare_df('test_replies.data')
train_sources

Unnamed: 0_level_0,user,verified,followers,text,retweets,favorites,statuses_count,hashtag,class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1218255692831903744,WHO,True,11173284.0,Is there a treatment for a novel coronavirus i...,202.0,265.0,69221.0,coronavirus,nonrumour,0.25
1218269428166602753,WHO,True,11173285.0,What can I do to protect myself from coronavir...,2007.0,2057.0,69221.0,coronavirus,nonrumour,0.00
1219979825848442880,WHO,True,11173284.0,Who are the members of an International Health...,20.0,43.0,69221.0,coronavirus,nonrumour,0.00
1219981282928668672,WHO,True,11173285.0,How are the members of an International Health...,21.0,49.0,69221.0,coronavirus,nonrumour,0.00
1219982106757148673,WHO,True,11173285.0,How are people appointed to the expert advisor...,19.0,44.0,69221.0,coronavirus,nonrumour,0.00
...,...,...,...,...,...,...,...,...,...,...
676367888543031296,nycjim,True,211869.0,North Carolina town rejects solar panels for s...,329.0,170.0,137128.0,,nonrumour,0.00
676718762830221312,Complex,True,2574209.0,Texas plumber sues car dealership after his tr...,170.0,258.0,375499.0,,rumour,0.00
676870737932742656,ABC,True,17466165.0,Plumber suing car dealership for M after truck...,157.0,99.0,384381.0,,rumour,0.00
677099574855639044,ABC,True,17466165.0,Plumber suing car dealership for M after truck...,104.0,80.0,384381.0,,rumour,0.00


Checking data is in correct format/dtypes

In [257]:
for df in [train_sources, train_replies, dev_sources, dev_replies, test_sources, test_replies]:
    print(df.dtypes)



user               object
verified           object
followers         float64
text               object
retweets          float64
favorites         float64
statuses_count    float64
hashtag            object
class              object
polarity          float64
dtype: object
user               object
verified             bool
followers           int64
text               object
retweets            int64
favorites           int64
statuses_count      int64
hashtag            object
source id          object
source class       object
polarity          float64
dtype: object
user               object
verified           object
followers         float64
text               object
retweets          float64
favorites         float64
statuses_count    float64
hashtag            object
class              object
polarity          float64
dtype: object
user               object
verified             bool
followers           int64
text               object
retweets            int64
favorites           in

In [258]:
train_replies['source class'].value_counts()


nonrumour    15785
rumour       12700
Name: source class, dtype: int64

**Tokenize and lemmatize text**

In [259]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet

tt = TweetTokenizer()
stopwords = set(stopwords.words('english')) #note: stopwords are all in lowercase
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def create_tokens(df):
    tokens_list = []
    for _id, row in df.iterrows():
        text = row['text']
        # tokenize tweet
        tokens = tt.tokenize(text)
        # convert to lowercase
        tokens = [tok.lower() for tok in tokens]
        # remove stopwords
        tokens = [tok for tok in tokens if tok not in stopwords]
        # lemmatize
        tokens = [lemmatize(tok) for tok in tokens]
        
        tokens_list.append(tokens)

    return tokens_list

In [260]:
train_sources.insert(4, 'tokens', create_tokens(train_sources))
dev_sources.insert(4, 'tokens', create_tokens(dev_sources))
test_sources.insert(4, 'tokens', create_tokens(test_sources))
train_sources.head()

Unnamed: 0_level_0,user,verified,followers,text,tokens,retweets,favorites,statuses_count,hashtag,class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1218255692831903744,WHO,True,11173284.0,Is there a treatment for a novel coronavirus i...,"[treatment, novel, coronavirus, infection, spe...",202.0,265.0,69221.0,coronavirus,nonrumour,0.25
1218269428166602753,WHO,True,11173285.0,What can I do to protect myself from coronavir...,"[protect, coronavirus, coronavirus]",2007.0,2057.0,69221.0,coronavirus,nonrumour,0.0
1219979825848442880,WHO,True,11173284.0,Who are the members of an International Health...,"[member, international, health, regulation, em...",20.0,43.0,69221.0,coronavirus,nonrumour,0.0
1219981282928668672,WHO,True,11173285.0,How are the members of an International Health...,"[member, international, health, regulation, em...",21.0,49.0,69221.0,coronavirus,nonrumour,0.0
1219982106757148673,WHO,True,11173285.0,How are people appointed to the expert advisor...,"[people, appoint, expert, advisory, panel, int...",19.0,44.0,69221.0,coronavirus,nonrumour,0.0


In [261]:
train_replies.insert(4, 'tokens', create_tokens(train_replies))
dev_replies.insert(4, 'tokens', create_tokens(dev_replies))
test_replies.insert(4, 'tokens', create_tokens(test_replies))
train_replies.head()

Unnamed: 0_level_0,user,verified,followers,text,tokens,retweets,favorites,statuses_count,hashtag,source id,source class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1250219116993974272,ucoptempe,False,410,Can eating garlic help prevent infection with ...,"[eat, garlic, help, prevent, infection, new, c...",0,0,26613,COVID19Malaysia,1250219300389974016,nonrumour,0.136364
1250219437027766273,ucoptempe,False,410,Do vaccines against pneumonia protect you agai...,"[vaccine, pneumonia, protect, new, coronavirus]",0,0,26613,,1250219300389974016,nonrumour,0.136364
1250219620939657216,ucoptempe,False,410,Can spraying alcohol or chlorine all over your...,"[spray, alcohol, chlorine, body, kill, new, co...",0,0,26613,Chamber,1250219300389974016,nonrumour,0.136364
1250219777185873922,ucoptempe,False,410,How effective are thermal scanners in detectin...,"[effective, thermal, scanner, detect, people, ...",0,0,26613,,1250219300389974016,nonrumour,0.368182
1250219894429208577,ucoptempe,False,410,Can an ultraviolet disinfection lamp kill the ...,"[ultraviolet, disinfection, lamp, kill, new, c...",0,0,26613,,1250219300389974016,nonrumour,0.136364


**Counting users**

In [262]:
# Finds users with multiple tweets in training data.
source_user_counts = train_sources.user.value_counts()
replies_user_counts = train_replies.user.value_counts()
source_user_counts

WHO              22
ABC              16
Independent      15
WHOWPRO          13
mashable         11
                 ..
ClarkeMicah       1
kamalkano         1
62jerseygirl      1
aparanjape        1
topislamicnet     1
Name: user, Length: 1099, dtype: int64

**Finding users with multiple tweets and removing users with only one tweet**

In [263]:
def find_multi_tweet_users(counts):
    multiple_tweet_users = []
    for id, count in counts.iteritems():
        if count > 1:
            multiple_tweet_users.append(id)
    return multiple_tweet_users

source_multiple_tweet_users = find_multi_tweet_users(source_user_counts)
replies_multiple_tweet_users = find_multi_tweet_users(replies_user_counts)
source_multiple_tweet_users


['WHO',
 'ABC',
 'Independent',
 'WHOWPRO',
 'mashable',
 'FoxNews',
 'ucoptempe',
 'RT_com',
 'BuzzFeed',
 'WSJ',
 'AP',
 'TIME',
 'pepesikanteri',
 'teawithdev',
 'CNN',
 'HuffPost',
 'IshiwiMyVoice',
 'OzgurTufekci',
 'eteobong_edem',
 'cnnbrk',
 'Fact',
 'CianSOBrien',
 'lutzmache',
 'Ausbones',
 'NBCNews',
 'eric_tablizo',
 'enews',
 'Wissenzoro',
 'UNICEFJamaica',
 'cnni',
 'TMZ',
 'gustasticgus',
 'deray',
 'YoungIndiaFDN',
 'BamboVaneHope',
 'nytimes',
 'Complex',
 'BreakingNews',
 'ndmaindia',
 'WHOSEARO',
 'CBCNews',
 'sagadkopaba',
 'RepUrJersey',
 'Agalib48',
 'ThePoke',
 'WHOUganda',
 'Agali_GCFR',
 'CBCAlerts',
 'AVONHMO',
 'UmerBasharat7',
 'BBCBreaking',
 'RoySparringa',
 'WHONigeria',
 'YourAnonNews',
 'KKMPutrajaya',
 'usweekly',
 'CP24',
 'Cosmopolitan',
 'fursid',
 'washingtonpost',
 'AlbuniDouaa',
 'TwitchyTeam',
 'Telegraph',
 'CNNPolitics',
 'ALOKdelhi6',
 'EyeRadioJuba',
 'CBCTheNational',
 'SavioSt',
 'glowyrosecare',
 'guardian',
 'ChinaDaily',
 'CBSNews',
 'D

In [264]:
len(source_multiple_tweet_users)

156

In [265]:
single_name = 'Single or unseen user'
train_sources.loc[train_sources['user'].isin(source_multiple_tweet_users) == False, 'user'] = single_name
train_replies.loc[train_replies['user'].isin(replies_multiple_tweet_users) == False, 'user'] = single_name
dev_sources.loc[dev_sources['user'].isin(source_multiple_tweet_users) == False, 'user'] = single_name
dev_replies.loc[dev_replies['user'].isin(replies_multiple_tweet_users) == False, 'user'] = single_name
test_sources.loc[test_sources['user'].isin(source_multiple_tweet_users) == False, 'user'] = single_name
test_replies.loc[test_replies['user'].isin(replies_multiple_tweet_users) == False, 'user'] = single_name
train_sources

Unnamed: 0_level_0,user,verified,followers,text,tokens,retweets,favorites,statuses_count,hashtag,class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1218255692831903744,WHO,True,11173284.0,Is there a treatment for a novel coronavirus i...,"[treatment, novel, coronavirus, infection, spe...",202.0,265.0,69221.0,coronavirus,nonrumour,0.25
1218269428166602753,WHO,True,11173285.0,What can I do to protect myself from coronavir...,"[protect, coronavirus, coronavirus]",2007.0,2057.0,69221.0,coronavirus,nonrumour,0.00
1219979825848442880,WHO,True,11173284.0,Who are the members of an International Health...,"[member, international, health, regulation, em...",20.0,43.0,69221.0,coronavirus,nonrumour,0.00
1219981282928668672,WHO,True,11173285.0,How are the members of an International Health...,"[member, international, health, regulation, em...",21.0,49.0,69221.0,coronavirus,nonrumour,0.00
1219982106757148673,WHO,True,11173285.0,How are people appointed to the expert advisor...,"[people, appoint, expert, advisory, panel, int...",19.0,44.0,69221.0,coronavirus,nonrumour,0.00
...,...,...,...,...,...,...,...,...,...,...,...
676367888543031296,nycjim,True,211869.0,North Carolina town rejects solar panels for s...,"[north, carolina, town, reject, solar, panel, ...",329.0,170.0,137128.0,,nonrumour,0.00
676718762830221312,Complex,True,2574209.0,Texas plumber sues car dealership after his tr...,"[texas, plumber, sue, car, dealership, truck, ...",170.0,258.0,375499.0,,rumour,0.00
676870737932742656,ABC,True,17466165.0,Plumber suing car dealership for M after truck...,"[plumber, sue, car, dealership, truck, trade, ...",157.0,99.0,384381.0,,rumour,0.00
677099574855639044,ABC,True,17466165.0,Plumber suing car dealership for M after truck...,"[plumber, sue, car, dealership, truck, trade, ...",104.0,80.0,384381.0,,rumour,0.00


**Feature Encoding**

In [266]:
def encode_df(df):
    encoded = df
    encoded['verified']=encoded['verified'].map({True:1, False:0})

    followers_encoded = []
    list_followers = list(encoded.followers)

    for n in list_followers:
        if n <1000:
            followers_encoded.append(1)
        elif n<5000:
            followers_encoded.append(2)
        elif n<20000:
            followers_encoded.append(3)
        elif n<50000:
            followers_encoded.append(4)
        elif n<200000:
            followers_encoded.append(5)
        else:
            followers_encoded.append(6)
        
    encoded.followers = followers_encoded
    
    return encoded

In [267]:
train_sources = encode_df(train_sources)
dev_sources = encode_df(dev_sources)
test_sources = encode_df(test_sources)
train_sources.head()

Unnamed: 0_level_0,user,verified,followers,text,tokens,retweets,favorites,statuses_count,hashtag,class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1218255692831903744,WHO,1,6,Is there a treatment for a novel coronavirus i...,"[treatment, novel, coronavirus, infection, spe...",202.0,265.0,69221.0,coronavirus,nonrumour,0.25
1218269428166602753,WHO,1,6,What can I do to protect myself from coronavir...,"[protect, coronavirus, coronavirus]",2007.0,2057.0,69221.0,coronavirus,nonrumour,0.0
1219979825848442880,WHO,1,6,Who are the members of an International Health...,"[member, international, health, regulation, em...",20.0,43.0,69221.0,coronavirus,nonrumour,0.0
1219981282928668672,WHO,1,6,How are the members of an International Health...,"[member, international, health, regulation, em...",21.0,49.0,69221.0,coronavirus,nonrumour,0.0
1219982106757148673,WHO,1,6,How are people appointed to the expert advisor...,"[people, appoint, expert, advisory, panel, int...",19.0,44.0,69221.0,coronavirus,nonrumour,0.0


In [268]:
train_replies = encode_df(train_replies)
dev_replies = encode_df(dev_replies)
test_replies = encode_df(test_replies)
train_replies.head()

Unnamed: 0_level_0,user,verified,followers,text,tokens,retweets,favorites,statuses_count,hashtag,source id,source class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1250219116993974272,ucoptempe,0,1,Can eating garlic help prevent infection with ...,"[eat, garlic, help, prevent, infection, new, c...",0,0,26613,COVID19Malaysia,1250219300389974016,nonrumour,0.136364
1250219437027766273,ucoptempe,0,1,Do vaccines against pneumonia protect you agai...,"[vaccine, pneumonia, protect, new, coronavirus]",0,0,26613,,1250219300389974016,nonrumour,0.136364
1250219620939657216,ucoptempe,0,1,Can spraying alcohol or chlorine all over your...,"[spray, alcohol, chlorine, body, kill, new, co...",0,0,26613,Chamber,1250219300389974016,nonrumour,0.136364
1250219777185873922,ucoptempe,0,1,How effective are thermal scanners in detectin...,"[effective, thermal, scanner, detect, people, ...",0,0,26613,,1250219300389974016,nonrumour,0.368182
1250219894429208577,ucoptempe,0,1,Can an ultraviolet disinfection lamp kill the ...,"[ultraviolet, disinfection, lamp, kill, new, c...",0,0,26613,,1250219300389974016,nonrumour,0.136364


In [269]:
train_sources.drop(['text','hashtag'],axis=1,inplace=True)
dev_sources.drop(['text','hashtag'],axis=1,inplace=True)
test_sources.drop(['text','hashtag'],axis=1,inplace=True)
train_sources

Unnamed: 0_level_0,user,verified,followers,tokens,retweets,favorites,statuses_count,class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1218255692831903744,WHO,1,6,"[treatment, novel, coronavirus, infection, spe...",202.0,265.0,69221.0,nonrumour,0.25
1218269428166602753,WHO,1,6,"[protect, coronavirus, coronavirus]",2007.0,2057.0,69221.0,nonrumour,0.00
1219979825848442880,WHO,1,6,"[member, international, health, regulation, em...",20.0,43.0,69221.0,nonrumour,0.00
1219981282928668672,WHO,1,6,"[member, international, health, regulation, em...",21.0,49.0,69221.0,nonrumour,0.00
1219982106757148673,WHO,1,6,"[people, appoint, expert, advisory, panel, int...",19.0,44.0,69221.0,nonrumour,0.00
...,...,...,...,...,...,...,...,...,...
676367888543031296,nycjim,1,6,"[north, carolina, town, reject, solar, panel, ...",329.0,170.0,137128.0,nonrumour,0.00
676718762830221312,Complex,1,6,"[texas, plumber, sue, car, dealership, truck, ...",170.0,258.0,375499.0,rumour,0.00
676870737932742656,ABC,1,6,"[plumber, sue, car, dealership, truck, trade, ...",157.0,99.0,384381.0,rumour,0.00
677099574855639044,ABC,1,6,"[plumber, sue, car, dealership, truck, trade, ...",104.0,80.0,384381.0,rumour,0.00


In [270]:
train_replies.drop(['text','hashtag','source id'],axis=1,inplace=True)
dev_replies.drop(['text','hashtag','source id'],axis=1,inplace=True)
test_replies.drop(['text','hashtag'],axis=1,inplace=True)
train_replies

Unnamed: 0_level_0,user,verified,followers,tokens,retweets,favorites,statuses_count,source class,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1250219116993974272,ucoptempe,0,1,"[eat, garlic, help, prevent, infection, new, c...",0,0,26613,nonrumour,0.136364
1250219437027766273,ucoptempe,0,1,"[vaccine, pneumonia, protect, new, coronavirus]",0,0,26613,nonrumour,0.136364
1250219620939657216,ucoptempe,0,1,"[spray, alcohol, chlorine, body, kill, new, co...",0,0,26613,nonrumour,0.136364
1250219777185873922,ucoptempe,0,1,"[effective, thermal, scanner, detect, people, ...",0,0,26613,nonrumour,0.368182
1250219894429208577,ucoptempe,0,1,"[ultraviolet, disinfection, lamp, kill, new, c...",0,0,26613,nonrumour,0.136364
...,...,...,...,...,...,...,...,...,...
1241078443179155457,lynneSimpkin,0,1,"[good, work, new, home, school, timetable]",0,1,5118,nonrumour,0.418182
1241085785060847617,lynneSimpkin,0,1,"[best, timetable, ive, hear, minecraft, bad, r...",0,1,5118,nonrumour,-0.066667
1241086391267872770,rosierawle,0,2,"[exactly, need, come, class, haha]",0,1,5507,nonrumour,0.225000
1241086505566732289,lynneSimpkin,0,1,"[might, send, twice, accident]",0,1,5118,nonrumour,0.000000


In [271]:
def count_tokens(df):
    token_counts = {}
    for token_list in list(df['tokens']):
        for token in token_list:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1
    return {k: v for k, v in sorted(token_counts.items(), key=lambda item: item[1],reverse=True)}
            
replies_uni_counts = count_tokens(train_replies)
source_uni_counts = count_tokens(train_sources)
replies_uni_counts

{'covid': 2338,
 'coronavirus': 2237,
 'people': 1976,
 'new': 1668,
 'get': 1275,
 'like': 1197,
 'virus': 1104,
 'say': 1101,
 'dont': 1014,
 'go': 1005,
 'make': 917,
 'one': 914,
 'know': 881,
 'u': 814,
 'think': 792,
 'rt': 768,
 'would': 737,
 'need': 732,
 'see': 712,
 'good': 702,
 'take': 669,
 'kill': 635,
 'im': 635,
 'thats': 632,
 'test': 628,
 'time': 606,
 'prevent': 605,
 'work': 603,
 'hand': 598,
 'day': 554,
 'die': 547,
 'use': 530,
 'spread': 527,
 'someone': 520,
 'spider': 513,
 'disease': 507,
 'care': 501,
 'death': 491,
 'give': 487,
 'want': 477,
 'look': 476,
 'come': 473,
 'even': 460,
 'still': 444,
 'right': 444,
 'protect': 440,
 'person': 435,
 'stop': 433,
 'report': 432,
 'infect': 429,
 'help': 424,
 'infection': 420,
 'thank': 419,
 'treat': 418,
 'find': 417,
 'really': 414,
 'case': 410,
 'million': 410,
 'well': 404,
 'way': 403,
 'news': 400,
 'wear': 400,
 'may': 396,
 'woman': 395,
 'fuck': 388,
 'oh': 383,
 '19': 379,
 'thing': 377,
 'paul':

Create new columns for token one-hot encoding

In [272]:

source_top_tokens = list(source_uni_counts.keys())[0:500]

new_cols = [x for x in list(train_sources.columns) if x != 'class'] + source_top_tokens + ['class']
train_sources = train_sources.reindex(columns=new_cols, fill_value=0)
dev_sources = dev_sources.reindex(columns=new_cols, fill_value=0)
test_sources = test_sources.reindex(columns=new_cols, fill_value=0)
train_sources.head()

Unnamed: 0_level_0,user,verified,followers,tokens,retweets,favorites,statuses_count,polarity,covid,coronavirus,...,extremely,frediboat,likely,coronavirusupdates,bad,coronafreepakistan,th,sir,wrong,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1218255692831903744,WHO,1,6,"[treatment, novel, coronavirus, infection, spe...",202.0,265.0,69221.0,0.25,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1218269428166602753,WHO,1,6,"[protect, coronavirus, coronavirus]",2007.0,2057.0,69221.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1219979825848442880,WHO,1,6,"[member, international, health, regulation, em...",20.0,43.0,69221.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1219981282928668672,WHO,1,6,"[member, international, health, regulation, em...",21.0,49.0,69221.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1219982106757148673,WHO,1,6,"[people, appoint, expert, advisory, panel, int...",19.0,44.0,69221.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour


In [273]:

reply_top_tokens = list(replies_uni_counts.keys())[0:500]

new_cols = [x for x in list(train_replies.columns) if x != 'source class']+ reply_top_tokens +['source class']
train_replies = train_replies.reindex(columns=new_cols, fill_value=0)
dev_replies = dev_replies.reindex(columns=new_cols, fill_value=0)
test_replies = test_replies.reindex(columns=new_cols, fill_value=0)
train_replies

Unnamed: 0_level_0,user,verified,followers,tokens,retweets,favorites,statuses_count,polarity,covid,coronavirus,...,chlorine,weather,turn,study,kind,saw,provide,behead,hospital,source class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1250219116993974272,ucoptempe,0,1,"[eat, garlic, help, prevent, infection, new, c...",0,0,26613,0.136364,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1250219437027766273,ucoptempe,0,1,"[vaccine, pneumonia, protect, new, coronavirus]",0,0,26613,0.136364,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1250219620939657216,ucoptempe,0,1,"[spray, alcohol, chlorine, body, kill, new, co...",0,0,26613,0.136364,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1250219777185873922,ucoptempe,0,1,"[effective, thermal, scanner, detect, people, ...",0,0,26613,0.368182,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1250219894429208577,ucoptempe,0,1,"[ultraviolet, disinfection, lamp, kill, new, c...",0,0,26613,0.136364,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241078443179155457,lynneSimpkin,0,1,"[good, work, new, home, school, timetable]",0,1,5118,0.418182,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1241085785060847617,lynneSimpkin,0,1,"[best, timetable, ive, hear, minecraft, bad, r...",0,1,5118,-0.066667,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1241086391267872770,rosierawle,0,2,"[exactly, need, come, class, haha]",0,1,5507,0.225000,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1241086505566732289,lynneSimpkin,0,1,"[might, send, twice, accident]",0,1,5118,0.000000,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour


In [274]:

def onehot_encode_tokens(df, top_users):
    for i, row in df.iterrows():
        for token in row['tokens']:
            if token in top_users:
                df.at[i, token] = 1
    df.drop(['tokens'],axis=1,inplace=True)
    return df
train_sources = onehot_encode_tokens(train_sources, source_top_tokens)
dev_sources = onehot_encode_tokens(dev_sources, source_top_tokens)
test_sources = onehot_encode_tokens(test_sources, source_top_tokens)

train_replies = onehot_encode_tokens(train_replies, reply_top_tokens)
dev_replies = onehot_encode_tokens(dev_replies, reply_top_tokens)
test_replies = onehot_encode_tokens(test_replies, reply_top_tokens)
train_replies

Unnamed: 0_level_0,user,verified,followers,retweets,favorites,statuses_count,polarity,covid,coronavirus,people,...,chlorine,weather,turn,study,kind,saw,provide,behead,hospital,source class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1250219116993974272,ucoptempe,0,1,0,0,26613,0.136364,1,1,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1250219437027766273,ucoptempe,0,1,0,0,26613,0.136364,0,1,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1250219620939657216,ucoptempe,0,1,0,0,26613,0.136364,0,1,0,...,1,0,0,0,0,0,0,0,0,nonrumour
1250219777185873922,ucoptempe,0,1,0,0,26613,0.368182,0,1,1,...,0,0,0,0,0,0,0,0,0,nonrumour
1250219894429208577,ucoptempe,0,1,0,0,26613,0.136364,0,1,0,...,0,0,0,0,0,0,0,0,0,nonrumour
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1241078443179155457,lynneSimpkin,0,1,0,1,5118,0.418182,0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1241085785060847617,lynneSimpkin,0,1,0,1,5118,-0.066667,0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1241086391267872770,rosierawle,0,2,0,1,5507,0.225000,0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1241086505566732289,lynneSimpkin,0,1,0,1,5118,0.000000,0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour


In [275]:

print(len(train_sources.columns))
print(len(dev_sources.columns))
print(len(test_sources.columns))


508
508
508


One-hot encoding for users

In [276]:
"""
def add_dummies(df, dummy_columns):

    dummies = pd.get_dummies(df['user'], columns=dummy_columns)
    
    df = pd.concat([df, dummies], axis=1)
    
    return df
source_dummies = pd.get_dummies(train_sources['user'])
#train_sources = add_dummies(train_sources, source_dummies.columns)

"""


"\ndef add_dummies(df, dummy_columns):\n\n    dummies = pd.get_dummies(df['user'], columns=dummy_columns)\n    \n    df = pd.concat([df, dummies], axis=1)\n    \n    return df\nsource_dummies = pd.get_dummies(train_sources['user'])\n#train_sources = add_dummies(train_sources, source_dummies.columns)\n\n"

In [277]:
"""
dev_sources = add_dummies(dev_sources, source_dummies)
test_sources = add_dummies(test_sources, source_dummies)

reply_dummies = pd.get_dummies(train_replies['user'])

train_replies = add_dummies(train_replies, reply_dummies.columns)
dev_replies = add_dummies(dev_replies, reply_dummies.columns)
test_replies = add_dummies(test_replies, reply_dummies.columns)
train_replies

"""

"\ndev_sources = add_dummies(dev_sources, source_dummies)\ntest_sources = add_dummies(test_sources, source_dummies)\n\nreply_dummies = pd.get_dummies(train_replies['user'])\n\ntrain_replies = add_dummies(train_replies, reply_dummies.columns)\ndev_replies = add_dummies(dev_replies, reply_dummies.columns)\ntest_replies = add_dummies(test_replies, reply_dummies.columns)\ntrain_replies\n\n"

In [278]:
print(len(train_sources.columns))
print(len(dev_sources.columns))
print(len(test_sources.columns))

508
508
508


In [279]:
train_sources.drop(['user'],axis=1,inplace=True)
dev_sources.drop(['user'],axis=1,inplace=True)
test_sources.drop(['user'],axis=1,inplace=True)
train_replies.drop(['user'],axis=1,inplace=True)
dev_replies.drop(['user'],axis=1,inplace=True)
test_replies.drop(['user'],axis=1,inplace=True)

In [280]:
def count_rumours(df):
    rumour_count = {'nonrumour': 0, 'rumour':0}
    for id, x in df['source class'].iteritems():
        rumour_count[x]+=1
    return rumour_count
print("Train replies rumors: ")   
print(count_rumours(train_replies))
print("Dev replies rumors: ")   
print(count_rumours(dev_replies))

Train replies rumors: 
{'nonrumour': 15785, 'rumour': 12700}
Dev replies rumors: 
{'nonrumour': 6195, 'rumour': 3727}


In [281]:
test_sources

Unnamed: 0_level_0,verified,followers,retweets,favorites,statuses_count,polarity,covid,coronavirus,new,virus,...,extremely,frediboat,likely,coronavirusupdates,bad,coronafreepakistan,th,sir,wrong,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1249529725019738113,0,1,0,0,1297,0.000000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
663722803489808384,1,6,80,74,466664,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1245172237977767936,0,1,0,0,708,0.000000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1244004581010550785,0,2,0,0,69134,0.333333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1245764856835342338,1,5,14,32,18066,0.000000,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553061846081896449,1,6,3042,2369,57041,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
523863683928457216,1,6,1169,626,334838,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1278345269751492611,0,2,10,54,16214,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1249209381813714944,0,1,0,0,2175,0.230000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Scaling**

In [282]:
scaler = StandardScaler()
for feature in ['retweets', 'favorites', 'statuses_count']:
    train_sources[feature] = scaler.fit_transform(train_sources[feature].values.reshape(-1,1))
    dev_sources[feature] = scaler.transform(dev_sources[feature].values.reshape(-1,1))
    test_sources[feature] = scaler.transform(test_sources[feature].values.reshape(-1,1))

    train_replies[feature] = scaler.fit_transform(train_replies[feature].values.reshape(-1,1))
    dev_replies[feature] = scaler.transform(dev_replies[feature].values.reshape(-1,1))
    test_replies[feature] = scaler.transform(test_replies[feature].values.reshape(-1,1))
train_sources

Unnamed: 0_level_0,verified,followers,retweets,favorites,statuses_count,polarity,covid,coronavirus,new,virus,...,extremely,frediboat,likely,coronavirusupdates,bad,coronafreepakistan,th,sir,wrong,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1218255692831903744,1,6,0.207176,0.394158,-0.241400,0.25,0,1,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1218269428166602753,1,6,5.190195,4.844430,-0.241400,0.00,0,1,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1219979825848442880,1,6,-0.295267,-0.157160,-0.241400,0.00,0,1,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1219981282928668672,1,6,-0.292506,-0.142259,-0.241400,0.00,0,1,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
1219982106757148673,1,6,-0.298027,-0.154676,-0.241400,0.00,0,1,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676367888543031296,1,6,0.557782,0.158234,0.101724,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonrumour
676718762830221312,1,6,0.118835,0.376774,1.306176,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,rumour
676870737932742656,1,6,0.082946,-0.018089,1.351055,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,rumour
677099574855639044,1,6,-0.063370,-0.065273,1.351055,0.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,rumour


In [283]:
indicies = joblib.load('Data/indexs.data')
indicies

['1246482832316301319',
 '1252279738099433473',
 '1236050255394877440',
 '1235582115900796928',
 '1258787515592572928',
 '518827403452637184',
 '489829414704648192',
 '580348081100734464',
 '1248121143808098305',
 '1258487399014858752',
 '555072815154475008',
 '629503919098429440',
 '1229732608889802753',
 '489836441120145408',
 '1240570885662289920',
 '1234884616479051777',
 '1244004581010550785',
 '1248902780556693506',
 '1249529725019738113',
 '1248769432748466177',
 '1250659810044829696',
 '1271506581352562688',
 '525049639016615937',
 '376874273601630208',
 '1268842488980221953',
 '524923462398513152',
 '1251522113141215232',
 '524975705206304769',
 '1242485261734301702',
 '633949800761700352',
 '1267237406056558592',
 '550038665653542913',
 '535252460148113409',
 '1274828323508621312',
 '1230539788488495108',
 '1240721296511598592',
 '1241107812551458816',
 '1269660260299669504',
 '1240341381773549568',
 '1259739644981215235',
 '1278262917360693248',
 '663722803489808384',
 '5249

In [284]:
test_sources.index = test_sources.index.map(str)
test_sources2 = test_sources.reindex(indicies)
test_sources2

Unnamed: 0_level_0,verified,followers,retweets,favorites,statuses_count,polarity,covid,coronavirus,new,virus,...,extremely,frediboat,likely,coronavirusupdates,bad,coronafreepakistan,th,sir,wrong,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1246482832316301319,1,6,-0.350480,-0.254013,0.236958,0.000000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1252279738099433473,0,1,-0.350480,-0.263946,-0.330739,-0.800000,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1236050255394877440,0,3,-0.350480,-0.258980,-0.583543,0.000000,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1235582115900796928,0,1,-0.339437,-0.256496,-0.590637,0.258333,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1258787515592572928,0,1,-0.350480,-0.263946,-0.246094,0.200000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427944719612915712,1,6,0.013929,-0.201861,1.563664,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
531206167302012929,1,6,0.113313,0.510878,-0.100541,-0.200000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
553099685888790528,1,6,0.582628,0.284888,0.249009,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1222928724112396288,1,6,-0.298027,-0.144743,-0.241400,0.000000,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [285]:
working_path = "Processed Data/"
joblib.dump(train_sources, working_path + "pp_train_sources.data")
joblib.dump(dev_sources, working_path + "pp_dev_sources.data")
joblib.dump(test_sources2, working_path + "pp_test_sources.data")

joblib.dump(train_replies, working_path + "pp_train_replies.data")
joblib.dump(dev_replies, working_path + "pp_dev_replies.data")
joblib.dump(test_replies, working_path + "pp_test_replies.data")

['Processed Data/pp_test_replies.data']