# Topic modelling on fake_covid data

We use Gensim to perform some basic topic modelling on text from the fake_covid data set from https://github.com/Gautamshahi/FakeCovid

## Import dependencies:
 - pandas
 - numpy
 - urllib (to download the data)
 - io.StringIO (to read the downloaded text as a file)
 - re (regular expressions)
 - matplotlib
 - spaCy
 - gensim
 - pyLDAvis

In [None]:
import pandas as pd
import numpy as np

import urllib.request
from io import StringIO

import matplotlib.pyplot as plt
import gensim
import spacy

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now

## Get Data

Load and preview data from CSV file

In [None]:
# Location of the FakeCovid CSV file on GitHub
URL = 'https://raw.githubusercontent.com/Gautamshahi/FakeCovid/master/data/FakeCovid_July2020.csv'

# Download the raw bytes. The context manager closes the HTTP response even
# if reading fails, instead of leaving the connection open.
with urllib.request.urlopen(URL) as response:
    data = response.read()
text = data.decode('utf-8')

# Create dataframe
df = pd.read_csv(StringIO(text), sep=',')

# Uncomment this line to read from a local copy for offline work
#df = pd.read_csv('FakeCovid_July2020.csv')

df.head()

## Data cleaning

Clean up some of the abbreviations in the dataset

In [None]:
# Language codes found in the "lang" column, mapped to readable names.
LANGUAGE_NAMES = {
    'en': "English",
    'es': "Spanish",
    'fr': "French",
    'pt': "Portuguese",
    'tr': "Turkish",
    'hi': "Hindi",
    'zh-tw': "Chinese",
    'hr': "Croatian",
    'te': "Telugu",
    'it': "Italian",
    'mk': "Macedonian",
    'de': "German",
    'ar': "Arabic",
    'id': "Indonesian",
    'ml': "Malayalam",
    'ja': "Japanese",
    'ta': "Tamil",
    'ko': "Korean",
    'lt': "Lithuanian",
    'pl': "Polish",
    'da': "Danish",
    'mr': "Marathi",
    'tl': "Tagalog",
    'ru': "Russian",
    'nl': "Dutch",
    'fa': "Persian",
    'bn': "Bengali",
    'el': "Greek",
    'lv': "Latvian",
    'gu': "Gujarati",
    'et': "Estonian",
    'uk': "Ukrainian",
    'ur': "Urdu",
    'th': "Thai",
    'ca': "Catalan",
    'vi': "Vietnamese",
    'fi': "Finnish",
}

# Swap every code for its name in one pass over the column. Only cells whose
# whole value equals a key are changed; anything not listed is left as it is.
# None of the names is itself a code, so this gives the same result as
# replacing the codes one after another.
df["lang"] = df["lang"].replace(LANGUAGE_NAMES)

Just focusing on explicitly fake news in English for now...

In [None]:
# Keep only the rows whose language is English (work on a copy so the
# original dataframe is not modified).
english_rows = df['lang'] == 'English'
df2 = df.loc[english_rows].copy()

# The label is spelled 'FALSE', 'false' or 'False'; settle on 'False'.
df2['class'] = df2['class'].replace({'FALSE': 'False', 'false': 'False'})

# Of the English rows, keep only those labelled 'False'.
labelled_false = df2['class'] == 'False'
df3 = df2.loc[labelled_false].copy()

## Text processing

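Clean the text in the `title` column.
Make it all lower case, strip some special characters (backslashes, underscores and `/ `) and shorten letters repeated four or more times in a row to two. Numbers are not removed here; number-like tokens are filtered out later, during tokenisation.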

In [None]:
def text_clean(x):
    """Normalise one raw text value before tokenisation.

    The value is converted to a string, lower-cased, stripped of
    backslashes and of the sequence "/ ", has underscores turned into
    spaces, and has any letter repeated four or more times in a row
    shortened to two occurrences.

    Args:
        x: The raw value, normally a string. ``None`` and float NaN are
            treated as missing.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # Missing values must not go through str(): that would produce the
    # literal words "none" / "nan", which would then be counted as tokens.
    # (x != x is true only for NaN.)
    if x is None or (isinstance(x, float) and x != x):
        return ''
    # all lower case; drop backslashes and "/ ", underscores become spaces
    x = str(x).lower().replace('\\', '').replace('_', ' ').replace('/ ', '')
    # shorten runs of four or more identical letters to two
    x = re.sub(r'([a-z])\1{3,}', r'\1\1', x)
    return x

# Run text_clean over every title and store the cleaned text back in place.
df3['title'] = df3['title'].apply(text_clean)

Dump all text into a single object for analysis (just titles for now)

In [None]:
# Join all titles into one string, one title per line. The separator matters:
# the tokenisation loop further down starts a new document each time it meets
# a '\n' token, so joining with ' ' would leave no document boundaries at all.
text = df3['title'].str.cat(sep='\n')

Process text using SpaCy library

In [None]:
# Name of the spaCy pipeline used for tokenising and lemmatising
SPACY_MODEL = 'en_core_web_sm'

# Load the pipeline, then run it once over the whole joined text
nlp = spacy.load(SPACY_MODEL)
doc = nlp(text)

Filter out stop-words that are not necessarily instructive for gaining deeper insight.
That is, it's clear that fake news articles about coronavirus will mention coronavirus a lot, so we wish to ignore this for now so it does not dominate the BOW vector space.

In [None]:
# Extra words to ignore on top of the pipeline's built-in stop words
my_stop_words = [
    " ",
    'claim',
    'people',
    'show',
    'kill',
    'pandemic',
    'coronavirus',
    'novel_coronavirus',
    'novel',
    'covid-19',
]

# Flag the vocabulary entry of each word as a stop word
for word in my_stop_words:
    nlp.vocab[word].is_stop = True

In [None]:
# Split the parsed text into documents (one list of lemmas per document).
texts, article = [], []
for w in doc:
    if w.is_space:
        # Whitespace tokens are never words. One that contains a line break
        # (it can be '\n', but also e.g. '\n\n') ends the current document.
        if '\n' in w.text:
            texts.append(article)
            article = []
        continue
    # skip stop words, punctuation marks and number-like tokens
    if w.is_stop or w.is_punct or w.like_num:
        continue
    # we add the lemmatized version of the word
    lemma = w.lemma_
    # The stop-word flag above is looked up on the word as written, so an
    # inflected form such as "claims" is not caught by the stop word "claim".
    # Check the lemma as well, since the lemma is what gets stored.
    if nlp.vocab[lemma].is_stop:
        continue
    article.append(lemma)

# A document is only closed when a line break follows it. If the text does
# not end with one, the last document is still pending here and would
# otherwise be dropped.
if article:
    texts.append(article)

Some bigrams might occur like "New York" so let's handle those automatically here...

In [None]:
# Learn which word pairs occur together often enough to count as a phrase
bigram = gensim.models.Phrases(texts)

# Rewrite every document with the detected pairs merged into single tokens
texts = [bigram[tokens] for tokens in texts]

# Assign an integer id to every distinct token
dictionary = Dictionary(texts)

# Bag-of-words form of every document, built with the dictionary above
corpus = list(map(dictionary.doc2bow, texts))

## Modelling

### LSI Model

In [None]:
# Fit an LSI model with 10 topics on the bag-of-words corpus
lsimodel = LsiModel(
    corpus,
    id2word=dictionary,
    num_topics=10,
)

In [None]:
# Showing only the top 5 topics
lsimodel.show_topics(5)

### LDA Model

In [None]:
# Fit an LDA model with 10 topics on the same corpus and dictionary
ldamodel = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=10,
)

In [None]:
# Display the topics of the LDA model, using the library's default settings
ldamodel.show_topics()

### LDA Visualisation

In [None]:
# Switch pyLDAvis to notebook output
pyLDAvis.enable_notebook()

# Prepare the visualisation from the LDA model, its corpus and dictionary;
# leaving it as the last expression makes the notebook display it.
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
vis

In [None]:
# Counter was used in this cell without being imported anywhere.
from collections import Counter

# Count how often each noun chunk occurs, keyed by the chunk's lemma. Only
# chunks whose vocabulary entry has a probability value below -8 are
# counted (probability value -8 is an arbitrarily selected threshold).
# NOTE(review): the chunk lemma can be a multi-word string and the loaded
# model may not carry real probability data, so this filter may let every
# chunk through - confirm.
keywords = Counter(
    chunk.lemma_
    for chunk in doc.noun_chunks
    if nlp.vocab[chunk.lemma_].prob < -8
)

keywords.most_common(20)