# COVID-19 News Dataset
#### 10k+ social media posts and articles on COVID-19 manually labelled as true or false by a team
https://arxiv.org/abs/2011.03327

### Necessary Imports

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly
# Show up to 500 rows and never truncate column text, so whole tweets stay readable
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
# Initialise plotly's offline notebook mode so figures can be shown in the notebook
plotly.offline.init_notebook_mode(connected=True)

## 1. Loading the datasets and merging them into one

In [2]:
# Read the three dataset files, keeping only the tweet text and its label
COVID_df_1 = pd.read_csv(
    'Initial_datasets/COVID_train.csv',
    usecols=['tweet', 'label'],
    low_memory=False,
)
COVID_df_2 = pd.read_csv(
    'Initial_datasets/english_test_with_labels.csv',
    usecols=['tweet', 'label'],
    low_memory=False,
)
COVID_df_3 = pd.read_csv(
    'Initial_datasets/Constraint_val.csv',
    usecols=['tweet', 'label'],
    low_memory=False,
)

# Stack the three frames into one and renumber the rows from 0
all_parts = [COVID_df_1, COVID_df_2, COVID_df_3]
COVID_df = pd.concat(all_parts, ignore_index=True)
print("size od dataset: ", COVID_df.shape)
COVID_df.head()

size od dataset:  (10700, 2)


Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today.,real
1,States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of those deaths. https://t.co/YASGRTT4ux,real
2,Politically Correct Woman (Almost) Uses Pandemic as Excuse Not to Reuse Plastic Bag https://t.co/thF8GuNFPe #coronavirus #nashville,fake
3,#IndiaFightsCorona: We have 1524 #COVID testing laboratories in India and as on 25th August 2020 36827520 tests have been done : @ProfBhargava DG @ICMRDELHI #StaySafe #IndiaWillWin https://t.co/Yh3ZxknnhZ,real
4,Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC. https://t.co/1pYW6cWRaS,real


## 2. Examining the dataset

In [None]:
# Bar chart of how many entries carry each label, with 'real' listed before 'fake'
fig = px.histogram(COVID_df, x='label')
fig.update_xaxes(categoryarray=['real', 'fake'])
fig.update_layout(bargap=0.2)
fig.show()

### 2.1 URLs, Hashtags and Mentions

#### Delete Links, Hashtags and @ (mentions) at the end of tweets:

Many entries include not only the claim we are interested in but also links to some resources, mentions and hashtags. Big portions of those come at the end of the tweet, so by deleting them, the claim is cleaned of unnecessary characters

In [5]:
COVID_df['modified_tweet'] = COVID_df['tweet']

# Keep deleting the last space-separated word of a tweet while it contains a link
# ('http', any case), a hashtag (#) or a mention (@).
# This usually does not influence the claim.
while True:
    current_tweets = COVID_df['modified_tweet']
    # `n` must be passed by keyword: pandas 2.0+ rejects it as a positional argument
    head_and_last = current_tweets.str.rsplit(' ', n=1)
    last_word = head_and_last.str[1].str.lower()
    # Tweets without any space have no "last word" (NaN) and are left untouched
    ends_with_noise = last_word.str.contains('http|#|@', regex=True, na=False)

    # Nothing left to strip anywhere -> the column has stopped changing
    if not ends_with_noise.any():
        break

    COVID_df['modified_tweet'] = current_tweets.where(~ends_with_noise, head_and_last.str[0])

#### Delete remaining entries with mentions (@) and links
These entries have mentions and links in the middle of the tweet, so they wouldn't be useful when training our model, as we assume the input won't include any arbitrary text

In [6]:
# Link still present (in the middle or at the front): 167 rows, delete
has_link = COVID_df['modified_tweet'].str.lower().str.contains('http') == True
COVID_df = COVID_df.drop(index=COVID_df.index[has_link])

# Mention (@) still in the modified_tweet: 1443 rows, delete - hard to encode
has_mention = COVID_df['modified_tweet'].str.lower().str.contains('@') == True
COVID_df = COVID_df.drop(index=COVID_df.index[has_mention])

#### Examining remaining hashtags
After the trailing hashtags have been stripped, some hashtags remain inside the claims. Depending on their form, they are rewritten as plain words, the whole entry is dropped, or only the '#' character is removed.

In [7]:
# Rewrite the most frequent COVID hashtag spellings as the plain word 'COVID-19'
# before further analysis and formatting. Each variant is only matched when it
# is followed by a space.
for covid_hashtag in ('#COVID19 ', '#COVID-19 ', '#COVID_19 ', '#COVID '):
    COVID_df['modified_tweet'] = COVID_df['modified_tweet'].str.replace(covid_hashtag, 'COVID-19 ')

In [8]:
# Drop the entries whose hashtags chain several capitalised words together,
# e.g. "#CoronaVirusUpdate" (around 800 rows)
has_chained_hashtag = COVID_df['modified_tweet'].str.contains(r'#[A-Z][a-z]+[A-Z]')
COVID_df = COVID_df.drop(index=COVID_df.index[has_chained_hashtag])

In [9]:
# Drop entries where '#covid19' (any case) is directly followed by a character
# other than space, ',', '?', '.' or '-', i.e. has other text attached to it
has_compound_covid_tag = COVID_df['modified_tweet'].str.lower().str.contains('#covid19[^ ,?.-]')
COVID_df = COVID_df.drop(index=COVID_df.index[has_compound_covid_tag])

In [10]:
# Drop entries that begin with one or more hashtags (80): the claim starts with
# '#word' followed by a space
starts_with_hashtag = COVID_df['modified_tweet'].str.match(r'#[a-zA-Z\d]+ #*')
COVID_df = COVID_df.drop(index=COVID_df.index[starts_with_hashtag])

In [11]:
# Entries that start with a hashtag immediately followed by ":" (35) -> drop the first word
starts_with_tag_colon = COVID_df['modified_tweet'].str.match(r'#[a-zA-Z\d]+:', na=False)
# `n` must be passed by keyword: pandas 2.0+ rejects it as a positional argument
text_after_first_word = COVID_df['modified_tweet'].str.split(' ', n=1).str[1]
# A tweet without any space has nothing after its first word (NaN); leave such
# rows unchanged so no missing values enter the column
COVID_df['modified_tweet'] = np.where(starts_with_tag_colon & text_after_first_word.notna(),
                                      text_after_first_word,
                                      COVID_df['modified_tweet'])

In [12]:
# 522 entries still have '#' in the middle of the sentence -> remove the '#'
# character itself and keep the word it was attached to
tweets_with_hashes = COVID_df['modified_tweet']
COVID_df['modified_tweet'] = tweets_with_hashes.str.replace('#', '')

### 2.2 Removing Duplicates

In [13]:
# Remove duplicated claims (around 120), keeping the first occurrence of each
COVID_df = COVID_df.drop_duplicates(
    subset='modified_tweet',
    keep='first',
)

### 2.3 Other Special Characters

#### &amp

In [14]:
# Entries containing the HTML entity prefix '&amp' (300) -> drop
has_amp_entity = COVID_df['modified_tweet'].str.contains('&amp')
COVID_df = COVID_df.drop(index=COVID_df.index[has_amp_entity])

#### \n

In [15]:
# Tweets containing a line break (180) -> drop
has_newline = COVID_df['modified_tweet'].str.contains('\n')
COVID_df = COVID_df.drop(index=COVID_df.index[has_newline])

#### &

In [16]:
# Tweets containing ' & ' (25) -> spell it out as ' and '
spelled_out = COVID_df['modified_tweet'].str.replace(' & ', ' and ')
COVID_df['modified_tweet'] = spelled_out

# Any '&' still left has no spacing around it -> drop those tweets
has_bare_ampersand = COVID_df['modified_tweet'].str.contains('&')
COVID_df = COVID_df.drop(index=COVID_df.index[has_bare_ampersand])

#### ;

In [17]:
# Semicolons are rewritten one occurrence at a time, based on what follows them:
#   '; ' + lowercase letter (42 tweets) -> ', '  (the sentence continues)
#   '; ' + uppercase letter (17 tweets) -> '. '  (a new sentence starts)
# The lookahead restricts each rule to its own semicolons, so a tweet that mixes
# both forms gets every semicolon handled by the matching rule.
COVID_df['modified_tweet'] = COVID_df['modified_tweet'].str.replace(r'; (?=[a-z])', ', ', regex=True)
COVID_df['modified_tweet'] = COVID_df['modified_tweet'].str.replace(r'; (?=[A-Z])', '. ', regex=True)

# Rest of the tweets containing ';' (mostly gibberish) -> Drop
COVID_df = COVID_df.drop(COVID_df[COVID_df['modified_tweet'].str.contains(';')].index)

#### Last sentence check

In [18]:
# After our previous formatting we need to look at the end of the tweets.

# Tweets that end with ':' (500) -> delete the last sentence.
# The pattern starts right after a '.', '!' or '?' and runs to a final ':'
# (optionally followed by whitespace) at the end of the text, so a tweet with
# no earlier sentence terminator is left unchanged.
trailing_colon_sentence = r"(?<=[.!?])[^.!?]*?:\s*$"
COVID_df['modified_tweet'] = COVID_df['modified_tweet'].str.replace(
    trailing_colon_sentence, "", regex=True
)

#### Non-ascii characters

In [19]:
def _ascii_only(text):
    """Return `text` with every non-ASCII character removed."""
    return text.encode('ascii', 'ignore').decode('ascii')


# Cast every entry to str, then delete the non-ASCII characters from it
COVID_df['modified_tweet'] = COVID_df['modified_tweet'].astype(str).map(_ascii_only)

#### First character check

In [20]:
# Sentences with an unusual starting character (340):
# - multiple starting with _
# - multiple starting with ?
# The count below is not the cell's last expression, so the notebook does not display it.
COVID_df.loc[COVID_df['modified_tweet'].str.match(r'[^a-zA-Z0-9"$]')].shape[0]

# Tweets that start with '_': remove every underscore in the tweet
starts_with_underscore = COVID_df['modified_tweet'].str.startswith('_')
without_underscores = COVID_df['modified_tweet'].str.replace('_','')
COVID_df['modified_tweet'] = np.where(starts_with_underscore,
                                      without_underscores,
                                      COVID_df['modified_tweet'])

# Tweets that start with '?': remove every question mark in the tweet
starts_with_question_mark = COVID_df['modified_tweet'].str.startswith('?')
without_question_marks = COVID_df['modified_tweet'].str.replace(r'[?]+','', regex=True)
COVID_df['modified_tweet'] = np.where(starts_with_question_mark,
                                      without_question_marks,
                                      COVID_df['modified_tweet'])

#### ???

In [21]:
# 166 entries with '???' -> delete the '???'
# Preview of rows containing '??' (note: two marks, not three); it is not the
# cell's last expression, so the notebook does not display it.
COVID_df[COVID_df['modified_tweet'].str.lower().str.contains('??', regex=False)].head(100)

# A literal replace leaves tweets without '???' untouched, so no row selection is needed
COVID_df['modified_tweet'] = COVID_df['modified_tweet'].str.replace('???', '', regex=False)

### 2.4 Sentence length
There are two main reasons to use logarithmic scales in charts and graphs. The first is to respond to skewness towards large values; i.e., cases in which one or a few points are much larger than the bulk of the data. The second is to show percent change or multiplicative factors. 

In [None]:
# Character count of each cleaned tweet
tweet_lengths = COVID_df['modified_tweet'].str.len()
COVID_df['tweet_count'] = tweet_lengths

# Distribution of the character counts.
# NOTE(review): this section talks about a log scale, but no log axis is requested here - confirm the intent
fig = px.histogram(COVID_df, x='tweet_count')
fig.update_layout(bargap=0.2)
fig.show()

# 6000 entries with modified_tweet character count less than 200
# COVID_df = COVID_df[COVID_df['modified_tweet'].str.len() < 200]

### 2.5 True/Fake distribution

In [None]:
# Label distribution after cleaning, with 'real' listed before 'fake'
fig = px.histogram(COVID_df, x='label')
fig.update_xaxes(categoryarray=['real', 'fake'])
fig.update_layout(bargap=0.2)
fig.show()

# Encode the label as an integer flag: 1 for 'real', 0 for anything else.
# Re-running this cell after the conversion would turn every label into 0.
is_real = COVID_df['label'] == 'real'
COVID_df['label'] = is_real.astype(int)

## 3. Saving the dataset

In [24]:
# Keep only the cleaned text and its label, under their final column names.
# rename() returns a new frame, so nothing is modified in place on a slice of
# COVID_df (which would raise SettingWithCopyWarning).
COVID_df_final = (
    COVID_df[['modified_tweet', 'label']]
    .rename(columns={'modified_tweet': 'claim', 'label': 'claim_veracity'})
)

In [25]:
# Write the final dataset to disk as UTF-8 encoded CSV
COVID_df_final.to_csv(
    'CovidFakeNews_Final.csv',
    encoding='utf-8',
)