# reCOVery analysis

Here we do some analysis and modelling with the reCOVery dataset https://github.com/apurvamulay/ReCOVery, used under Attribution-NonCommercial-ShareAlike 4.0 International licence.

## Import dependencies:
 - pandas
 - numpy
 - url library
 - string io
 - re (regular expression)
 - spacy
 - matplotlib
 - seaborn

We also initialise the spacy nlp object in English here

In [None]:
import pandas as pd
import numpy as np
import re

import urllib.request
from io import StringIO

from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import spacy

# Load spaCy's small English pipeline once; the `nlp` object is reused by the
# text-processing and topic-modelling cells further down.
nlp = spacy.load('en_core_web_sm')

## Get data

Read data from URL, load into dataframe and preview

In [None]:
URL = 'https://raw.githubusercontent.com/apurvamulay/ReCOVery/master/dataset/recovery-news-data.csv'

# Download the raw CSV. The context manager closes the HTTP response as soon
# as the payload has been read, instead of leaving the connection open.
with urllib.request.urlopen(URL) as response:
    data = response.read()
text = data.decode('utf-8')

# Create dataframe
df = pd.read_csv(StringIO(text), sep=',')

# Uncomment this line to read from a local source for offline work
#df = pd.read_csv('FakeCovid_July2020.csv')

# Preview the first rows (displayed as the cell output)
df.head()

## Data cleaning

Set all body text to lower case, remove troublesome characters and remove repeated characters.

In [None]:
def text_clean(x):
    """
    Normalise a piece of article text before NLP processing.

    The value is converted to ``str`` and lower-cased, every backslash that
    is immediately followed by a space is removed together with that space,
    underscores become spaces, and any run of four or more identical
    letters (a-z) is collapsed to two.

    :param x: value to clean; it is passed through ``str()`` first, so
        non-string values are cleaned in their string form
    :return: the cleaned, lower-case string
    """
    # Escaped backslash: the previous literal '\ ' relied on an invalid escape
    # sequence, which newer Python versions warn about. The characters that
    # are matched (backslash + space) are unchanged.
    # NOTE(review): the earlier comment said "remove slashes" - confirm that
    # backslash-followed-by-space is really the intended target.
    x = str(x).lower().replace('\\ ', '').replace('_', ' ')
    # \1{3,} needs at least three repeats after the first letter, so only
    # runs of 4+ are shortened (e.g. 'sooooo' -> 'soo', while 'sooo' is kept).
    x = re.sub(r'([a-z])\1{3,}', r'\1\1', x)
    return x

# Overwrite the body_text column with its cleaned version.
df['body_text'] = df['body_text'].apply(text_clean)

## Text processing and initial analysis

Create a corpus of covid news articles from the body text of each entry in the data frame

In [None]:
# Parse every article body with spaCy. nlp.pipe streams the texts through the
# pipeline in batches; it yields one Doc per text, in order, like calling
# nlp() on each text, but avoids the per-call overhead on a large corpus.
covid_arts = list(nlp.pipe(df['body_text']))

Function to locate entities matching a given tag

In [None]:
def find_entity_occurences(doc,tag = 'ORG'):
    """
    Count how often each named entity labelled `tag` occurs across `doc`.

    Entities are keyed by their lemma, so mentions sharing a lemma are
    counted together.

    :param doc: iterable of spaCy-parsed articles (each exposing `.ents`)
    :param tag: entity label to keep, e.g. 'ORG', 'PERSON' or 'GPE'
    :return: list of (lemma, count) tuples, most frequent first, in the form
        [('cdc', 622), ('who', 312)]
    """
    tally = Counter(
        entity.lemma_
        for article in doc
        for entity in article.ents
        if entity.label_ == tag
    )
    return tally.most_common()

Print lists of the top 20 most mentioned organisations, people and locations

In [None]:
# One line of output per entity type: organisations, people, then
# geopolitical entities - each limited to the 20 most frequent.
for label in ('ORG', 'PERSON', 'GPE'):
    print(find_entity_occurences(covid_arts, label)[:20])

### Create list of common entities
We can either grab the most common entities as identified by spaCy, or we can define for ourselves a list of entities that we think are important. These lists act as filters that identify the articles/documents relevant to our investigation.

In [None]:
# Use the 20 most frequent ORG and GPE lemmas found by spaCy as filter lists.
top_orgs = find_entity_occurences(covid_arts, 'ORG')[:20]
top_places = find_entity_occurences(covid_arts, 'GPE')[:20]
common_groups = [name for name, _count in top_orgs]
common_locations = [name for name, _count in top_places]

In [None]:
# Hand-picked organisations to track. Entries are lower case because they are
# compared against entity lemmas taken from the lower-cased article text.
common_groups = [
    'afp', 'cdc', 'world health organisation', 'who',
    'cnn', 'fox news', 'new york times',
    'trump administration', 'the white house', 'congress', 'senate',
]

# Hand-picked countries and regions to track (same lower-case convention).
common_locations = [
    'india', 'england', 'united states', 'us', 'uk', 'china',
    'italy', 'spain', 'canada', 'europe', 'asia', 'america',
]

Count the co-occurrence of the selected groups and locations within the corpus (each group/location pair is counted once per article)

In [None]:
# group -> Counter(location -> number of articles mentioning both)
group_location_dict = defaultdict(Counter)

for art in covid_arts:
    # Tracked organisations and places named in this article, de-duplicated
    # in order of first mention so each pair counts once per article.
    groups = list(dict.fromkeys(
        ent.lemma_ for ent in art.ents
        if ent.label_ == 'ORG' and ent.lemma_ in common_groups
    ))
    locations = list(dict.fromkeys(
        ent.lemma_ for ent in art.ents
        if ent.label_ == 'GPE' and ent.lemma_ in common_locations
    ))

    # Every (group, location) pair found in the same article scores one hit.
    for found_entity in groups:
        for found_location in locations:
            group_location_dict[found_entity][found_location] += 1

In [None]:
# Transform the nested counters into a DataFrame: one column per group, one
# row per location. Pairs that never co-occur come out as NaN, so no integer
# dtype is requested at construction time (NaN cannot be stored as int);
# the cast to int happens only after the gaps are filled with zeroes.
group_location_df = pd.DataFrame.from_dict(dict(group_location_dict))
group_location_full_df = group_location_df.fillna(value=0).astype(int)
# Show DF to console
group_location_full_df

In [None]:
# Seaborn renders the co-occurrence DataFrame directly as an annotated heatmap
fig, ax = plt.subplots(figsize=(14, 7))
hmap = sns.heatmap(
    group_location_full_df,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    cbar=False,
    ax=ax,
)

# Title and tick-label rotation are applied on the Axes the heatmap was drawn on
ax.set_title('Global distribution of groups appearing in fake news')
plt.setp(ax.get_xticklabels(), rotation=30)
plt.show()

In [None]:
# Save the heatmap figure as a PNG; the relative path resolves against the
# current working directory.
fig.savefig('fake_news.png')

## Article Classification models

Here we try some typical classification models and use the 'reliability' column to do supervised learning. 

### Support-Vector Machine (SVM)

Import sklearn tools for SVM

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

Set up the Tf-Idf vectoriser.

Our input vector will be the vectorised body-text of each article and the output vector is the 'reliability' (0 for unreliable, 1 for reliable).

In [None]:
# Vectoriser with the vocabulary capped at 5000 features
tfidf = TfidfVectorizer(max_features=5000)

# Target: the 'reliability' column. Features: TF-IDF matrix of the article bodies.
y = df['reliability']
X = tfidf.fit_transform(df['body_text'])
# NOTE(review): the vectoriser is fitted on all articles before the train/test
# split, so test-set vocabulary and IDF weights feed into training - consider
# fitting on the training split only.

Create training and test sets

In [None]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Create model, train then test

In [None]:
# Fit a linear SVM on the training split, then label the held-out articles.
clf = LinearSVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)

Report performance

In [None]:
# Compare the held-out labels with the SVM predictions and print the report.
print(classification_report(y_test, y_pred))

### Logistic regression
TBC!

### k means / knn clustering
TBC!

## Topic modelling

Here we use the gensim library to perform topic modelling. We can save and load previously trained models using the following code snippet:

    import os
    import tempfile

    with tempfile.NamedTemporaryFile(prefix='model-', suffix='.lsi', delete=False) as tmp:
        lsi_model.save(tmp.name)  # same for tfidf, lda, ...

    loaded_lsi_model = models.LsiModel.load(tmp.name)

    os.unlink(tmp.name)

Import gensim NLP tools

In [None]:
import gensim

from gensim import models
#from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

Dump all text into a single string and separate out articles with new lines (we just demo with titles for now...)

In [None]:
# Join all titles into one newline-separated string (rebinds `text`, which
# previously held the raw CSV payload).
text = df['title'].str.cat(sep='\n')

Process text using SpaCy library

In [None]:
# Parse all titles in a single pass. The cell that later splits the token
# stream back into articles relies on the '\n' separators appearing as tokens.
doc = nlp(text)

Filter out stop-words that are not necessarily instructive for gaining deeper insight.
That is, it's clear that news articles about coronavirus will mention coronavirus a lot, so we wish to ignore this for now so it does not dominate the vector space.

In [None]:
# Domain-specific words that would otherwise dominate every topic.
my_stop_words = [" ", 'pandemic', 'coronavirus', 'novel_coronavirus', 'novel', 'covid-19']
for stopword in my_stop_words:
    # The stop flag lives on each individual lexeme, so flagging only the
    # lower-case spelling would leave "Coronavirus" or "COVID-19" untouched.
    # The titles are not lower-cased, so flag the common casings as well.
    for variant in {stopword, stopword.capitalize(), stopword.upper()}:
        lexeme = nlp.vocab[variant]
        lexeme.is_stop = True

In [None]:
# Split the token stream back into one token list per article.
texts, article = [], []
for w in doc:
    if w.text == '\n':
        # a new line marks the end of the current article
        texts.append(article)
        article = []
    elif not w.is_stop and not w.is_punct and not w.like_num:
        # keep the lemmatised form of every token that is not a stop word,
        # punctuation mark or number
        article.append(w.lemma_)

# The joined text has no trailing newline, so the last article is still
# pending when the loop ends - store it too instead of dropping it.
if article:
    texts.append(article)

Some bigrams might occur like "New York" so let's handle those automatically here...

In [None]:
# Learn frequent word pairs from the tokenised articles, rewrite each article
# with the detected pairs applied, then build the token <-> id dictionary.
bigram = models.Phrases(texts)
texts = [bigram[tokens] for tokens in texts]
dictionary = Dictionary(texts)

Create bag-of-words corpus

In [None]:
# One bag-of-words vector per article
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

Vectorise the corpus

In [None]:
# Fit a gensim TF-IDF model on the bag-of-words corpus and apply it to that
# same corpus. Note: this rebinds `tfidf`, which earlier in the notebook
# referred to the sklearn TfidfVectorizer.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

### LSI Model

In [None]:
# Train a 10-topic LSI model on the TF-IDF weighted corpus. LsiModel is reached
# through the imported `models` namespace (as TfidfModel and HdpModel are),
# because the direct import of the bare name is commented out.
lsimodel = models.LsiModel(corpus=corpus_tfidf, num_topics=10, id2word=dictionary)

In [None]:
# Display a subset of the 10 trained LSI topics as the cell output
lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics

### LDA Model

N.B. According to Gensim documentation, LDA works with BOW vectorised corpus, but it will accept a Tf-Idf vectorised corpus anyway.

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus. LdaModel is reached
# through the imported `models` namespace (as TfidfModel and HdpModel are),
# because the direct import of the bare name is commented out.
ldamodel = models.LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [None]:
# Display the LDA topics as the cell output (library-default arguments)
ldamodel.show_topics()

#### LDA Visualisation

In [None]:
# Switch pyLDAvis to notebook mode, then build the visualisation data from the
# LDA model, the bag-of-words corpus and the dictionary; the result of the
# last expression is what the notebook displays.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

### HDP model

In [None]:
# Train an HDP topic model on the TF-IDF weighted corpus; no topic count is passed.
# NOTE(review): the LDA cell uses the bag-of-words corpus - confirm that TF-IDF
# weights are the intended input here.
hdpmodel = models.HdpModel(corpus=corpus_tfidf, id2word=dictionary)

In [None]:
# Display the HDP topics as the cell output (library-default arguments)
hdpmodel.show_topics()