In [13]:
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
import tweepy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix

# Download nltk-packages (not downloaded if up to date)
nltk.download('stopwords')
nltk.download('wordnet')


# Load the training dataset to dataframe.
# The dataset directory can be overridden with the COVID19_FAKE_NEWS_DIR environment
# variable so the notebook is not tied to a single machine; the original location is
# kept as the fallback. dataset_path stays a plain string because later cells
# concatenate file names onto it.
dataset_path = os.environ.get(
    'COVID19_FAKE_NEWS_DIR',
    '/Users/ilpoviertola/OneDrive - TUNI.fi/Kurssimateriaaleja/JODA/datasets/covid19_fake_news',
)
train_df = pd.read_csv(os.path.join(dataset_path, 'Constraint_train.csv'))
train_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ilpoviertola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ilpoviertola/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [14]:
# Examine training dataset.

# Report the column dtypes, the total number of missing cells over all columns,
# and whether the 'id' column contains any duplicates.
nan_total = train_df.isna().sum().sum()
id_is_unique = train_df['id'].is_unique

print('Datatypes: \n', train_df.dtypes, '\n', sep='')
print('Amount of NaN-values = ', nan_total, '\n', sep='')
print('Is id unique? = ', id_is_unique, sep='')

Datatypes: 
id        int64
tweet    object
label    object
dtype: object

Amount of NaN-values = 0

Is id unique? = True


Everything seems to be OK: there are no NaN values and every row has a unique id. Let's alter the training dataset a bit now!

In [15]:
# Alter training dataset a bit.

# Replace the 'label' column with a binary 'real' column where 1 indicates real news
# and 0 fake news. The label is compared against 'real' directly instead of going
# through pd.get_dummies: get_dummies returns booleans on pandas >= 2.0, and it only
# creates a 'fake' column when that label occurs, so dropping 'fake' could raise.
# .assign() returns a new frame, so train_df itself is left untouched.
train_df_copy = (
    train_df
    .assign(real=(train_df['label'] == 'real').astype(int))
    .drop(['label', 'id'], axis=1)
)
print("Real (1) and fake (0) news amounts in training dataset:")
print(train_df_copy['real'].value_counts())

# Split dataframe into two lists containing tweets (x_train) and the real value (y_train).
x_train = train_df_copy['tweet'].tolist()
y_train = train_df_copy['real'].tolist()

train_df_copy.head()

Real (1) and fake (0) news amounts in training dataset:
1    3360
0    3060
Name: real, dtype: int64


Unnamed: 0,tweet,real
0,The CDC currently reports 99031 deaths. In gen...,1
1,States reported 1121 deaths a small rise from ...,1
2,Politically Correct Woman (Almost) Uses Pandem...,0
3,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,Populous states can generate large case counts...,1


Now we have replaced the old 'label' column with a 'real' column and binary-coded its values. This means that if the given tweet is real news, the value in the 'real' column is 1. If the tweet is fake news, this value is 0. We also split the dataframe into two lists: x_train and y_train. x_train contains the tweets in the same order in which y_train contains the real/fake values.

In [16]:
# Preprocess text

# Built once and reused by every preprocess_text call. Previously the tokeniser and
# lemmatiser were re-created per tweet, and stopwords.words('english') was evaluated
# for every single token and searched as a list. A frozenset gives the same
# membership answers with constant-time lookups.
_TOKENISER = RegexpTokenizer(r'\w+')
_LEMMATISER = WordNetLemmatizer()
_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Turn a raw tweet into a list of lowercased, lemmatised, non-stopword tokens.

    Steps:
      1. Tokenise on runs of word characters (regex ``\\w+``).
      2. Lowercase each token and lemmatise it as a verb (e.g. Driving -> drive).
      3. Drop every lemma that is an English stopword.

    Parameters
    ----------
    text : str
        The tweet text to process.

    Returns
    -------
    list of str
        The remaining keywords, in their original order (duplicates are kept).
    """
    lemmas = (
        _LEMMATISER.lemmatize(token.lower(), pos='v')
        for token in _TOKENISER.tokenize(text)
    )
    return [lemma for lemma in lemmas if lemma not in _STOPWORDS]

# preprocess_text is used as the analyzer, so it receives each raw tweet and the
# returned keywords become the TF-IDF features.
vectorizer = TfidfVectorizer(analyzer=preprocess_text)
train_df_tfidf = vectorizer.fit_transform(x_train)
train_df_tfidf.shape

(6420, 16267)

We use TfidfVectorizer here to convert our tweets into a Term Frequency - Inverse Document Frequency (TF-IDF) feature matrix. A short explanation of TF-IDF:   
Words that are common in every tweet (such as what, when, but,...) rank low even though they may appear many times, since they don’t mean much to that document in particular.  
However, if the word 'hoax' appears many times in a document, while not appearing many times in others, it probably means that it’s very relevant in our case.  

preprocess_text(text) function takes a tweet and preprocesses it:   
1. Tokenization: Tokens (in our case unique words) are smaller pieces of text, which are used as features of a tweet.  
2. Lowercasing and lemmatization: Every word will be lowercased, so You and you will not be two features in our feature matrix. Lemmatization aims to return the base or dictionary form of a word, which is known as the lemma.  
3. Stopwords: A stopword is a commonly used word like 'a', 'an' or 'the'. These words are not so important to our classification process so they can be removed.
   
train_df_tfidf is now a feature matrix where:
- amount of rows = amount of tweets in the dataset
- amount of columns = amount of features aka. unique words in the dataset

In [17]:
# Train the model
# Baseline: a linear classifier trained with SGD (fixed seed), scored with
# 5-fold cross-validated accuracy on the TF-IDF matrix.
sgd_clf = SGDClassifier(random_state=123)
sgf_clf_scores = cross_val_score(
    estimator=sgd_clf,
    X=train_df_tfidf,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(sgf_clf_scores)

# Summarise the folds as the mean and a +/- two-standard-deviation spread.
baseline_mean = sgf_clf_scores.mean()
baseline_spread = sgf_clf_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (baseline_mean, baseline_spread))

[0.94781931 0.93302181 0.94548287 0.9470405  0.93613707]
Accuracy: 0.94 (+/- 0.01)


The model used is SGDClassifier, which is a linear classifier trained with stochastic gradient descent. It works well with our sparse feature matrix.

In [18]:
# Improve the model & run it with new params
# The logistic loss is spelled 'log_loss' from scikit-learn 1.1 onwards (the old name
# 'log' was removed in 1.3), so choose the name the installed version understands.
sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
logistic_loss = 'log_loss' if sklearn_major_minor >= (1, 1) else 'log'

grid = {'fit_intercept': [True, False],
        'early_stopping': [True, False],
        'loss': ['hinge', logistic_loss, 'squared_hinge'],
        # None is the documented value for "no penalty"; it replaces the string 'none'.
        'penalty': ['l2', 'l1', None]}
search = GridSearchCV(estimator=sgd_clf, param_grid=grid, cv=5)
search.fit(train_df_tfidf, y_train)
print(search.best_params_)

# Re-score the winning configuration with the same 5-fold cross-validation that was
# used for the baseline so the two sets of numbers can be compared directly.
grid_sgd_clf_scores = cross_val_score(search.best_estimator_, train_df_tfidf, y_train, cv=5)
print(grid_sgd_clf_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (grid_sgd_clf_scores.mean(), grid_sgd_clf_scores.std() * 2))


{'early_stopping': False, 'fit_intercept': True, 'loss': 'hinge', 'penalty': 'l2'}
[0.94781931 0.93302181 0.94548287 0.9470405  0.93613707]
Accuracy: 0.94 (+/- 0.01)


After computing the baseline model, we can fine-tune it with GridSearchCV. GridSearchCV performs an exhaustive search over the specified parameter values for an estimator. E.g. we give GridSearchCV a set of different loss functions and it returns the one that produces the best results.  

In [19]:
# Chain the TF-IDF vectoriser and the best estimator found by the grid search, so
# that raw tweet strings can be passed straight to fit/predict.
pipeline_steps = [
    ('vectoriser', vectorizer),
    ('classifier', search.best_estimator_),
]
pipe = Pipeline(steps=pipeline_steps)
pipe.fit(x_train, y_train)

Pipeline(steps=[('vectoriser',
                 TfidfVectorizer(analyzer=<function preprocess_text at 0x7fc857d9dcb0>)),
                ('classifier', SGDClassifier(random_state=123))])

After fine-tuning the model, we can create a pipeline which integrates the creation of the feature matrix (data transformation) and the model into a single object.  
The pipeline first transforms the unstructured data to a feature matrix and then fits the preprocessed data to the model.

In [20]:
# Predictions
test_df = pd.read_csv(os.path.join(dataset_path, 'Constraint_Val.csv'))
x_test = test_df['tweet'].tolist()

# Encode the label as a binary 'real' column (1 = real news, 0 = fake news).
# A direct comparison is used instead of pd.get_dummies, which returns booleans on
# pandas >= 2.0 and only creates a 'fake' column when that label is present.
test_df = (
    test_df
    .assign(real=(test_df['label'] == 'real').astype(int))
    .drop(['label', 'id'], axis=1)
)
y_test = test_df['real'].tolist()

# Score the fitted pipeline on the validation tweets. In the confusion matrix the
# rows are the actual classes and the columns the predicted ones (order: 0, 1).
y_test_pred = pipe.predict(x_test)
print("Accuracy: %0.2f" % (accuracy_score(y_test, y_test_pred)))
print(confusion_matrix(y_test, y_test_pred))

Accuracy: 0.94
[[ 944   76]
 [  60 1060]]


Now we have introduced our model to a dataset that it has not seen before! We can use the accuracy_score-function to determine how correct our model was. We predict the result (fake/real) for the input (tweet) and then compare these results to the real classes of those tweets in order to obtain the model's accuracy.  
The confusion matrix below the accuracy value tells us how many ACTUAL reals our model PREDICTED as fakes and the other way around. Below is an illustration:  
  
|               | PRED. FAKE    | PRED. REAL  |
| ------------- |:-------------:| :----------:|
| ACTUAL FAKE   | \#            | \#          |
| ACTUAL REAL   | \#            | \#          |

## Now, let's get data from Twitter!

In [25]:
# Twitter API credentials are read from environment variables so that secrets never
# have to be typed into (and saved with) the notebook. Each one falls back to an
# empty string when the variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_key = os.environ.get('TWITTER_ACCESS_KEY', '')
access_key_secret = os.environ.get('TWITTER_ACCESS_KEY_SECRET', '')

# Build the OAuth handler from the consumer pair, attach the access token pair and
# create the API client used by the following cells.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_key_secret)
api = tweepy.API(auth)

In [26]:
# Column layout of the collected tweets.
tweet_columns = ['username', 'description', 'location', 'following',
                 'followers', 'totaltweets', 'retweetcount', 'text', 'hashtags']

# Fetch up to 10 English '#covid19' tweets; tweet_mode='extended' is what makes the
# full_text attribute read below available.
tweets = tweepy.Cursor(api.search, q='#covid19', lang='en', since='2021-03-01', tweet_mode='extended').items(10)
tweets_list = [tweet for tweet in tweets]

# Collect one record per tweet and build the DataFrame in a single step afterwards,
# rather than enlarging it with .loc one row at a time.
tweet_records = []
for tweet in tweets_list:
    # Retweets can be distinguished by a retweeted_status attribute,
    # in case it is an invalid reference, except block will be executed
    try:
        text = tweet.retweeted_status.full_text
    except AttributeError:
        text = tweet.full_text

    tweet_records.append({
        'username': tweet.user.screen_name,
        'description': tweet.user.description,
        'location': tweet.user.location,
        'following': tweet.user.friends_count,
        'followers': tweet.user.followers_count,
        'totaltweets': tweet.user.statuses_count,
        'retweetcount': tweet.retweet_count,
        'text': text,
        # Keep only the hashtag strings from the entity dictionaries.
        'hashtags': [hashtag['text'] for hashtag in tweet.entities['hashtags']],
    })

# Passing columns= keeps the expected layout even when no tweets were returned.
tweet_df = pd.DataFrame(tweet_records, columns=tweet_columns)


In [28]:
# Classify the collected tweet texts with the fitted pipeline.
tweet_text_list = tweet_df['text'].tolist()
tweet_pred = pipe.predict(tweet_text_list)

# Put each tweet next to its predicted class (1 = real, 0 = fake) and print the table.
tweet_pred_df = pd.DataFrame(
    data={
        'tweet': tweet_text_list,
        'is real?': tweet_pred,
    }
)

print(tweet_pred_df)

                                                                                                                                                                                                                                                                                                                 tweet  \
0  The evolution of Yan Limeng was carefully designed by Guo Wengui and Stephen K. Bannon. They fueled her deep-rooted belief that the virus is a product of genetic engineering, and she accepts the evidence she provides regardless of right or wrong.\n#StevenBannon\n#YanLiMeng\n#COVID19 https://t.co/htj7jWczCw   
1  The evolution of Yan Limeng was carefully designed by Guo Wengui and Stephen K. Bannon. They fueled her deep-rooted belief that the virus is a product of genetic engineering, and she accepts the evidence she provides regardless of right or wrong.\n#StevenBannon\n#YanLiMeng\n#COVID19 https://t.co/htj7jWczCw   
2  A quick remember that the #COVID19 comes from #China no