# Johdanto Datatieteeseen 2021 -practical work
### *author: Ilpo Viertola*

In [2]:
# Normal stuff
import pandas as pd
import numpy as np

# For Twitter API
import tweepy
import ast

# Tweet preprocessing
import nltk
nltk.download('stopwords')  # Download stopwords (not downloaded if up to date)
nltk.download('wordnet')    # Download wordnet for lemmatizer (not downloaded if up to date)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import html
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ilpoviertola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ilpoviertola/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Reading the data
**Read the data in from [Kaggle csv-files](https://www.kaggle.com/elvinagammed/covid19-fake-news-dataset-nlp) and with Twitter API.**

In [3]:
# Folder that holds the Kaggle csv-files.
# NOTE: absolute, machine-specific path -- adjust it before running the notebook elsewhere.
csv_path = '/Users/ilpoviertola/OneDrive - TUNI.fi/Kurssimateriaaleja/JODA/datasets/covid19_fake_news'
# Data to train the model with
train_df = pd.read_csv(csv_path + '/Constraint_Train.csv')
# Data to test the model with
test_df = pd.read_csv(csv_path + '/Constraint_Test.csv')
# Data to validate the model with
val_df = pd.read_csv(csv_path + '/Constraint_Val.csv')
# Only the last expression of a cell is rendered, so the earlier head() calls
# never produced any output; preview the one frame that is actually shown.
val_df.head()

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real


**Next, the Twitter data. First, let's authenticate ourselves so that we can use the Twitter API.**

In [4]:
# Fetch Twitter API-keys from a local file.
# The file content is parsed as a Python literal and must evaluate to a dict
# holding the four entries read below.
# The context manager closes the file even if reading or parsing raises.
with open('twitter.key', 'r') as key_file:
    keys = ast.literal_eval(key_file.read())

# Authenticate with the API key pair and attach the access token pair.
auth = tweepy.OAuthHandler(keys['API'], keys['API_secret'])
auth.set_access_token(keys['Access_token'], keys['Access_token_secret'])
api = tweepy.API(auth)

**Create the DataFrame in which the tweets will be stored, and fetch the tweets.**

In [5]:
# Empty DataFrame that will receive one row per stored tweet
tweet_df = pd.DataFrame(columns=[
    'username', 'description', 'location',
    'following', 'followers', 'totaltweets',
    'retweetcount', 'text', 'hashtags',
])

# Search parameters
hashtag = '#covid19'
d_since = '2021-04-07'
limit = 250

# Build the search cursor and materialise its items into a list
tweets = tweepy.Cursor(
    api.search,
    q=hashtag,
    lang='en',
    since=d_since,
    tweet_mode='extended',
).items(limit)
tweets_list = list(tweets)

**Process tweets and add them to DataFrame. We'll exclude retweets.**

In [6]:
for tweet in tweets_list:
    # Let's ignore all retweets
    if tweet.retweeted or 'RT @' in tweet.full_text:
        continue

    author = tweet.user
    # Keep only the text of each hashtag entity
    hashtag_names = [tag['text'] for tag in tweet.entities['hashtags']]

    # Append the data as a new row at the end of the DataFrame.
    tweet_df.loc[len(tweet_df)] = [
        author.screen_name, author.description, author.location,
        author.friends_count, author.followers_count, author.statuses_count,
        tweet.retweet_count, tweet.full_text, hashtag_names,
    ]

tweet_df.head()

Unnamed: 0,username,description,location,following,followers,totaltweets,retweetcount,text,hashtags
0,RWT_patientexp,Patient Experience Team @RWT_NHS. We provide a...,,322,191,199,0,Last but not least we share the last of our @R...,"[volunteers, COVID19, compassion]"
1,UMRio_ONERio,Through rugby and education UMRio [ONERio] pro...,"São Gonçalo, Rio de Janeiro",382,227,283,0,#Brazil recorded its 2nd daily high in #COVID1...,"[Brazil, COVID19]"
2,MonicaPicc2021,"Passionate about environmental, health and hum...",United Kingdom,261,136,1915,0,"New record: Brazil confirms 4,249 deaths by co...","[Brazil, pandemic, Brasil, COVID19]"
3,NST_Online,"News, views and up-to-date reports from Malays...",Malaysia,428,742757,244729,0,"#NSTnation Sarawak logged 555 new cases, the s...","[NSTnation, Covid19, Infection, Rise, Pandemic..."
4,AlasdairSampso1,Democrat: Scottish; cyclist: fly fisher: Covid...,"Scotland, EU",281,524,14664,1,#Sturgeon said there’ll be #COVID19 inquiry \n...,"[Sturgeon, COVID19]"


## Check for Null-values 

In [7]:
# Report, per dataset, whether any cell holds a null value
for dataset_name, frame in (('training', train_df), ('testing', test_df),
                            ('validation', val_df), ('Twitter', tweet_df)):
    print(f'Null values in {dataset_name} data? {frame.isnull().values.any()!s}')

Null values in training data? False
Null values in testing data? False
Null values in validation data? False
Null values in Twitter data? False


## Data exploration
**Check column names**

In [8]:
# Training and validation data share one heading; each frame's columns go on their own line
print("Training & validation data's columns:")
for frame in (train_df, val_df):
    print(frame.columns.values)
print("Test data's columns:")
print(test_df.columns.values)
print("Twitter data's columns:")
print(tweet_df.columns.values)

Training & validation data's columns:
['id' 'tweet' 'label']
['id' 'tweet' 'label']
Test data's columns:
['id' 'tweet']
Twitter data's columns:
['username' 'description' 'location' 'following' 'followers' 'totaltweets'
 'retweetcount' 'text' 'hashtags']


**Columns are ok. Next check datatypes**

In [9]:
# Print the column datatypes of every dataset, one block per dataset
for dataset_name, frame in (('training', train_df), ('validation', val_df),
                            ('testing', test_df), ('Twitter', tweet_df)):
    print(f'Datatypes for {dataset_name} data: \n{frame.dtypes!s}\n')

Datatypes for training data: 
id        int64
tweet    object
label    object
dtype: object

Datatypes for validation data: 
id        int64
tweet    object
label    object
dtype: object

Datatypes for testing data: 
id        int64
tweet    object
dtype: object

Datatypes for Twitter data: 
username        object
description     object
location        object
following       object
followers       object
totaltweets     object
retweetcount    object
text            object
hashtags        object
dtype: object



**Twitter dataset needs some datatype modifications.**

In [10]:
# Cast the four count columns to 32-bit integers
count_columns = ['following', 'followers', 'totaltweets', 'retweetcount']
tweet_df = tweet_df.astype(dict.fromkeys(count_columns, 'int32'))
print(f'New datatypes for Twitter data: \n{tweet_df.dtypes!s}\n')

New datatypes for Twitter data: 
username        object
description     object
location        object
following        int32
followers        int32
totaltweets      int32
retweetcount     int32
text            object
hashtags        object
dtype: object



In [11]:
# Show the row labelled 5 from both tweet sources; heading and tweet go on separate lines
print('\nExample tweet from training data: ', train_df.loc[5, 'tweet'], sep='\n')
print('\nExample tweet from Twitter data: ', tweet_df.loc[5, 'text'], sep='\n')


Example tweet from training data: 
Covid Act Now found "on average each person in Illinois with COVID-19 is infecting 1.11 other people. Data shows that the infection growth rate has declined over time this factors in the stay-at-home order and other restrictions put in place." https://t.co/hhigDd24fE

Example tweet from Twitter data: 
#Handwashing is one of the most essential ways to fight #COVID19 but 295 million children in sub-Saharan Africa do not have access to basic handwashing facilities in their schools.

Read @UNICEF’s new report to explores how governments can address this.
https://t.co/1haCuta7pV


**Tweets typically contain links, other people's usernames, hashtags and emojis. These must be cleaned before training the model...**

In [12]:
# Count how many tweets carry each label in the labelled datasets
train_label_counts = train_df['label'].value_counts()
val_label_counts = val_df['label'].value_counts()
print(f'Training data labels: \n {train_label_counts!s}')
print(f'\nValidation data labels: \n {val_label_counts!s}')

Training data labels: 
 real    3360
fake    3060
Name: label, dtype: int64

Validation data labels: 
 real    1120
fake    1020
Name: label, dtype: int64


**The datasets are balanced, meaning they contain approximately as many fake as real news items. The labels must be binary-encoded later.**  
  
## Tweet preprocessing, a.k.a. feature extraction

In [16]:
# Punctuation characters and English stopwords that tweet_cleaner strips from the tweets
puncs = string.punctuation
stopws = stopwords.words('english')
# Print the names defined above; the previous `punc` / `stopw` are not defined
# anywhere in the notebook and raise NameError on a fresh kernel.
print(puncs)
print(stopws)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only

In [28]:
def tweet_cleaner(tweets):
    """Clean raw tweet texts for feature extraction.

    Each tweet is HTML-unescaped, stripped of mentions, links, emojis and
    punctuation, lowercased, lemmatized word by word (as verbs) and filtered
    for stopwords. Every kept word is followed by a single space, so a
    non-empty result ends with a trailing space and a tweet without any kept
    word becomes ''.

    Relies on the notebook-level names `puncs` (punctuation characters) and
    `stopws` (stopword list).

    Parameters
    ----------
    tweets : pandas.Series or list of str
        Raw tweet texts. The collection is overwritten in place, so pass a
        copy if the raw texts are still needed. A Series may carry any index;
        elements are processed by position.

    Returns
    -------
    The same `tweets` object, now holding the cleaned texts.
    """
    # Loop-invariant helpers are built once instead of once per tweet.
    emoji_pattern = re.compile(pattern = '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
                       ']+', flags = re.UNICODE)
    drop_punctuation = str.maketrans('', '', puncs)
    stopword_set = set(stopws)  # constant-time membership tests
    lemmatiser = WordNetLemmatizer()

    cleaned = []
    for tweet in tweets:
        tweet = html.unescape(tweet)    # Decode HTML entities (e.g. &amp;)
        tweet = re.sub(r'@\w+', ' ', tweet) # Remove mentions to other people
        tweet = re.sub(r'http\S+', ' ', tweet)  # Remove links
        tweet = emoji_pattern.sub(r'', tweet)   # Remove emojis

        tweet = tweet.translate(drop_punctuation)   # Remove punctuation
        tweet = tweet.lower()   # Lowercase text

        # Split to words and lemmatize them
        words = [lemmatiser.lemmatize(word, pos='v') for word in tweet.split()]

        # Exclude stopwords
        cleaned.append(''.join([word + ' ' for word in words if word not in stopword_set]))

    # Write the results back by position: plain `tweets[i]` is label-based on a
    # Series and fails when the index is not 0..n-1 (e.g. after dropping rows).
    if cleaned:
        if isinstance(tweets, pd.Series):
            tweets.iloc[:] = cleaned
        else:
            tweets[:] = cleaned

    return tweets

In [20]:
# tweet_cleaner overwrites the collection it is given, so hand it a copy of the raw column
raw_train_tweets = train_df['tweet'].copy()
train_df['clean_tweet'] = tweet_cleaner(raw_train_tweets)
train_df.head()

Unnamed: 0,id,tweet,label,clean_tweet
0,1,The CDC currently reports 99031 deaths. In gen...,real,cdc currently report 99031 deaths general disc...
1,2,States reported 1121 deaths a small rise from ...,real,state report 1121 deaths small rise last tuesd...
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake,politically correct woman almost use pandemic ...
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real,indiafightscorona 1524 covid test laboratories...
4,5,Populous states can generate large case counts...,real,populous state generate large case count look ...


In [21]:
# tweet_cleaner overwrites the collection it is given, so hand it a copy of the raw column
raw_test_tweets = test_df['tweet'].copy()
test_df['clean_tweet'] = tweet_cleaner(raw_test_tweets)
test_df.head()

Unnamed: 0,id,tweet,clean_tweet
0,1,Our daily update is published. States reported...,daily update publish state report 734k test 39...
1,2,Alfalfa is the only cure for COVID-19.,alfalfa cure covid19
2,3,President Trump Asked What He Would Do If He W...,president trump ask would catch coronavirus do...
3,4,States reported 630 deaths. We are still seein...,state report 630 deaths still see solid nation...
4,5,This is the sixth time a global health emergen...,sixth time global health emergency declare int...


In [22]:
# tweet_cleaner overwrites the collection it is given, so hand it a copy of the raw column
raw_val_tweets = val_df['tweet'].copy()
val_df['clean_tweet'] = tweet_cleaner(raw_val_tweets)
val_df.head()

Unnamed: 0,id,tweet,label,clean_tweet
0,1,Chinese converting to Islam after realising th...,fake,chinese convert islam realise muslim affect co...
1,2,11 out of 13 people (from the Diamond Princess...,fake,11 13 people diamond princess cruise ship inti...
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake,covid19 cause bacterium virus treat aspirin
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake,mike pence rnc speech praise donald trump’s co...
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real,610 sky explain latest covid19 data government...


In [25]:
# tweet_cleaner overwrites the collection it is given, so hand it a copy of the raw column
raw_twitter_texts = tweet_df['text'].copy()
tweet_df['clean_tweet'] = tweet_cleaner(raw_twitter_texts)
tweet_df.head()

Unnamed: 0,username,description,location,following,followers,totaltweets,retweetcount,text,hashtags,clean_tweet
0,RWT_patientexp,Patient Experience Team @RWT_NHS. We provide a...,,322,191,199,0,Last but not least we share the last of our @R...,"[volunteers, COVID19, compassion]",last least share last volunteer stories featur...
1,UMRio_ONERio,Through rugby and education UMRio [ONERio] pro...,"São Gonçalo, Rio de Janeiro",382,227,283,0,#Brazil recorded its 2nd daily high in #COVID1...,"[Brazil, COVID19]",brazil record 2nd daily high covid19 deaths 3 ...
2,MonicaPicc2021,"Passionate about environmental, health and hum...",United Kingdom,261,136,1915,0,"New record: Brazil confirms 4,249 deaths by co...","[Brazil, pandemic, Brasil, COVID19]",new record brazil confirm 4249 deaths covid19 ...
3,NST_Online,"News, views and up-to-date reports from Malays...",Malaysia,428,742757,244729,0,"#NSTnation Sarawak logged 555 new cases, the s...","[NSTnation, Covid19, Infection, Rise, Pandemic...",nstnation sarawak log 555 new case state highe...
4,AlasdairSampso1,Democrat: Scottish; cyclist: fly fisher: Covid...,"Scotland, EU",281,524,14664,1,#Sturgeon said there’ll be #COVID19 inquiry \n...,"[Sturgeon, COVID19]",sturgeon say there’ll covid19 inquiry manipula...


**Remove rows that have blank clean_tweets. (This could be the case if the tweet only contained e.g. a link)**

In [29]:
# Turn blank clean_tweets into NaN and drop those rows from every dataset.
# The replaced column is assigned back explicitly: calling
# `df[col].replace(..., inplace=True)` acts on a temporary Series, which
# triggers a chained-assignment warning in recent pandas and leaves the
# DataFrame unchanged once Copy-on-Write is enabled.
for frame in (train_df, test_df, val_df, tweet_df):
    frame['clean_tweet'] = frame['clean_tweet'].replace('', np.nan)
    # Dropping in place keeps the notebook-level names bound to the same frames;
    # the original row labels are kept (the index is not reset).
    frame.dropna(subset=['clean_tweet'], inplace=True)

**Compare "dirty" and "clean" tweet**

In [33]:
# Fetch the raw and the cleaned text of the row labelled 150
dirty_tweet = train_df.loc[150, 'tweet']
clean_tweet = train_df.loc[150, 'clean_tweet']
print('Some dirty tweet:\n', dirty_tweet)
print('\nClean version:\n', clean_tweet)

Some dirty tweet:
 Thirty-nine GPs and specialists have written to the BMJ calling for action on long COVID. https://t.co/4Y5kGv3pF3 https://t.co/jTc1OucOmw

Clean version:
 thirtynine gps specialists write bmj call action long covid 


**Binary-encode the values of the label column into an is_real column in train_df and val_df. 0 = fake, 1 = real**

In [32]:
# 1 where the label is 'real', 0 otherwise.
# A direct comparison yields 0/1 integers on every pandas version, whereas
# pd.get_dummies returns boolean columns since pandas 2.0 and raises KeyError
# when no row is labelled 'real'.
train_df['is_real'] = train_df['label'].eq('real').astype('uint8')
val_df['is_real'] = val_df['label'].eq('real').astype('uint8')
val_df.head()

Unnamed: 0,id,tweet,label,clean_tweet,is_real
0,1,Chinese converting to Islam after realising th...,fake,chinese convert islam realise muslim affect co...,0
1,2,11 out of 13 people (from the Diamond Princess...,fake,11 13 people diamond princess cruise ship inti...,0
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake,covid19 cause bacterium virus treat aspirin,0
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake,mike pence rnc speech praise donald trump’s co...,0
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real,610 sky explain latest covid19 data government...,1
