# 1. CLEAN AND COMBINE DATA SOURCES

In [406]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import utils
import data_cleaning

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### a. Disinfo and Infodemic Data
Disinfo: https://github.com/gtziafas/nlp4ifchallenge/tree/main/data/english

Infodemic: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/XYK2UE

In [407]:
disinfo = pd.read_csv('../data/covid19_disinfo/covid19_disinfo_binary_english_train_old.tsv', sep='\t')
infodemic = pd.read_csv('../data/covid19_infordemic/covid19_infodemic_english_data.tsv', sep='\t')
disinfodemic = pd.concat([disinfo, infodemic], axis=0).drop_duplicates()
#utils.check_tweet_len(disinfodemic)
#utils.check_tweet_len(disinfodemic, labels=['q2_label'])

In [408]:
disinfo.q2_label.value_counts()

no     460
yes     39
Name: q2_label, dtype: int64

In [409]:
infodemic.q2_label.value_counts()

2_no_probably_contains_no_false_info      177
1_no_definitely_contains_no_false_info     46
3_not_sure                                 45
4_yes_probably_contains_false_info         25
5_yes_definitely_contains_false_info       12
Name: q2_label, dtype: int64

### b. Cassie and Linh's manual labelling

In [410]:
manual_labeled_data = data_cleaning.clean_manual_data()
utils.check_tweet_len(manual_labeled_data)
utils.check_tweet_len(manual_labeled_data, labels=['q2_label'])

                  len_tweet                                               \
                      count        mean         std    min    25%    50%   
q2_label q4_label                                                          
0        0             82.0  208.804878   56.159262   72.0  168.0  227.0   
         1              3.0  161.333333  107.584076   83.0  100.0  117.0   
1        0              1.0  199.000000         NaN  199.0  199.0  199.0   
         1             46.0  197.500000   68.263786   54.0  156.0  204.5   

                                  
                      75%    max  
q2_label q4_label                 
0        0         256.75  291.0  
         1         200.50  284.0  
1        0         199.00  199.0  
         1         257.00  290.0  
         len_tweet                                                         
             count        mean        std   min    25%    50%    75%    max
q2_label                                                              

  manual_labeled_data = manual_labeled_data.append(temp)
  manual_labeled_data = manual_labeled_data.append(temp)
  manual_labeled_data = manual_labeled_data.append(temp)
  manual_labeled_data = manual_labeled_data.append(temp)
  manual_labeled_data = manual_labeled_data.append(temp)
  manual_labeled_data = manual_labeled_data.append(temp)
  manual_labeled_data = manual_labeled_data.append(temp)


In [411]:
manual_labeled_data.head()

Unnamed: 0,tweet_text,q2_label,q4_label,tidy_tweet,len_tweet
0,Social media posts have claimed without eviden...,0,0,social media posts have claimed without eviden...,244
11,An image made to look like a statement from fo...,0,0,an image made to look like a statement from fo...,167
26,A baseless social media post ties the assassin...,0,0,a baseless social media post ties the assassin...,215
83,"Monkeypox is not the same as shingles, nor is ...",0,0,monkeypox is not the same as shingles nor is i...,153
87,As global health authorities investigate an un...,0,0,as global health authorities investigate an un...,244


### Merge disinfo and B for training data

In [412]:
manual_labeled_data['tweet_text'] = manual_labeled_data.tidy_tweet

In [413]:
manual_labeled_data.columns

Index(['tweet_text', 'q2_label', 'q4_label', 'tidy_tweet', 'len_tweet'], dtype='object')

In [414]:
manual_labeled_data_new = manual_labeled_data[['tweet_text', 'q2_label', 'q4_label']]
for col in disinfo:
    if col not in manual_labeled_data_new.columns:
        print(f"adding column: {col}")
        manual_labeled_data_new[col] = np.nan

adding column: tweet_no
adding column: q1_label
adding column: q3_label
adding column: q5_label
adding column: q6_label
adding column: q7_label


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manual_labeled_data_new[col] = np.nan


In [415]:
disinfo.columns

Index(['tweet_no', 'tweet_text', 'q1_label', 'q2_label', 'q3_label',
       'q4_label', 'q5_label', 'q6_label', 'q7_label'],
      dtype='object')

In [416]:
manual_labeled_data_new_2 = manual_labeled_data_new[['tweet_no', 'tweet_text', 'q1_label', 'q2_label', 'q3_label',
       'q4_label', 'q5_label', 'q6_label', 'q7_label']]

In [417]:
manual_labeled_data_new_2.columns

Index(['tweet_no', 'tweet_text', 'q1_label', 'q2_label', 'q3_label',
       'q4_label', 'q5_label', 'q6_label', 'q7_label'],
      dtype='object')

In [418]:
manual_labeled_data_new_2.shape, disinfo.shape

((132, 9), (869, 9))

In [419]:
disinfo_manual_merge = pd.concat([disinfo, manual_labeled_data_new_2], axis=0)

In [420]:
disinfo_manual_merge.shape

(1001, 9)

In [421]:
disinfo.dtypes

tweet_no       int64
tweet_text    object
q1_label      object
q2_label      object
q3_label      object
q4_label      object
q5_label      object
q6_label      object
q7_label      object
dtype: object

In [422]:
disinfo_manual_merge.dtypes

tweet_no      float64
tweet_text     object
q1_label       object
q2_label       object
q3_label       object
q4_label       object
q5_label       object
q6_label       object
q7_label       object
dtype: object

In [423]:
disinfo_manual_merge['tweet_no'] = disinfo_manual_merge.tweet_no.fillna(999).astype('int')

In [424]:
disinfo_manual_merge.dtypes

tweet_no       int64
tweet_text    object
q1_label      object
q2_label      object
q3_label      object
q4_label      object
q5_label      object
q6_label      object
q7_label      object
dtype: object

In [425]:
disinfo_manual_merge.reset_index(inplace=True)

In [426]:
disinfo_manual_merge['index'].describe()

count    1001.000000
mean      432.584416
std       301.806844
min         0.000000
25%       193.000000
50%       405.000000
75%       643.000000
max      1989.000000
Name: index, dtype: float64

In [427]:
disinfo_manual_merge.index = pd.RangeIndex(len(disinfo_manual_merge.index))

In [428]:
disinfo_manual_merge['tweet_no'] = disinfo_manual_merge.index + 1

In [429]:
#disinfo_manual_merge.drop([877], axis=0, inplace=True)

In [430]:
disinfo_manual_merge.drop('index', axis=1, inplace=True)

In [431]:
disinfo.dtypes

tweet_no       int64
tweet_text    object
q1_label      object
q2_label      object
q3_label      object
q4_label      object
q5_label      object
q6_label      object
q7_label      object
dtype: object

In [432]:
disinfo_manual_merge.tweet_no.describe()

count    1001.000000
mean      501.000000
std       289.108111
min         1.000000
25%       251.000000
50%       501.000000
75%       751.000000
max      1001.000000
Name: tweet_no, dtype: float64

In [433]:
for i in range(len(disinfo_manual_merge)):
    if disinfo_manual_merge.tweet_no.values[i]=='':
        print(i)

In [434]:
np.where(disinfo_manual_merge.applymap(lambda x: x == ''))

(array([], dtype=int64), array([], dtype=int64))

In [435]:
disinfo_manual_merge.q1_label.value_counts()

yes    569
no     300
Name: q1_label, dtype: int64

In [436]:
encode_di ={1: 'yes', 0: 'no', 'yes': 'yes', 'no': 'no'}

In [437]:
disinfo_manual_merge['q2_label'] = disinfo_manual_merge['q2_label'].map(encode_di)
disinfo_manual_merge['q3_label'] = disinfo_manual_merge['q3_label'].map(encode_di)
disinfo_manual_merge['q4_label'] = disinfo_manual_merge['q4_label'].map(encode_di)
disinfo_manual_merge['q5_label'] = disinfo_manual_merge['q5_label'].map(encode_di)
disinfo_manual_merge['q6_label'] = disinfo_manual_merge['q6_label'].map(encode_di)

In [438]:
disinfo_manual_merge.q4_label.value_counts()

no     492
yes    205
Name: q4_label, dtype: int64

In [439]:
disinfo.q4_label.value_counts()

no     409
yes    156
Name: q4_label, dtype: int64

In [440]:
#disinfo_manual_merge.drop(876, axis=0, inplace=True)

In [441]:
disinfo_manual_merge.to_csv('../data/covid19_disinfo/disinfo_manual.tsv', sep = '\t', index=False, na_rep=np.nan)

In [321]:
disinfo_manual_merge.iloc[876,:].tweet_text

'what do we know about the new omicron mutant it s a descendent of the earlier super contagious stealth omicron and has quickly gained ground in the united states t co zhbebh zl'

--- The END --