Import all dependencies

These include:
 - pandas
 - numpy
 - url library
 - string io
 - re (regular expression)
 - sklearn
     - tfidf vectoriser
     - train test split
     - linear Support Vector Machine
     - classification report

In [5]:
import pandas as pd
import numpy as np

import urllib.request
from io import StringIO

import matplotlib.pyplot as plt
import gensim
import spacy

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now

Load data from CSV file

In [6]:
URL = 'https://raw.githubusercontent.com/Gautamshahi/FakeCovid/master/data/FakeCovid_July2020.csv'

response = urllib.request.urlopen(URL)
data = response.read()
text = data.decode('utf-8')

# Create dataframe
df = pd.read_csv(StringIO(text), sep=',') # index_col=[0, 1, 2, 3

#uncomment this lines to read from local source for offline work
#df = pd.read_csv('FakeCovid_July2020.csv')

df.head()

Unnamed: 0,ID,ref_category_title,ref_url,pageid,verifiedby,country,class,title,published_date,country1,country2,country3,country4,article_source,ref_source,source_title,content_text,category,lang
0,FC1,FALSE: The coronavirus is an amplified bacteri...,https://www.poynter.org/?ifcn_misinformation=t...,https://www.poynter.org/ifcn-covid-19-misinfor...,La Silla Vacía,Colombia,False,The coronavirus is an amplified bacteria rela...,2020/06/17,Colombia,,,,https://lasillavacia.com/detector-video-falso-...,poynter,Detector a video falso que dice que el Covid e...,La Silla Vacía usa Cookies para mejorar la exp...,,es
1,FC2,FALSE: A law allows people to go for a run dur...,https://www.poynter.org/?ifcn_misinformation=a...,https://www.poynter.org/ifcn-covid-19-misinfor...,Newtral.es,Spain,False,A law allows people to go for a run during th...,2020/04/09,Spain,,,,https://www.newtral.es/la-broma-de-que-a-los-r...,poynter,La broma de que a los “runners” se les permite...,En los últimos días nos ha llegado una consult...,,es
2,FC3,False: Chinese converting to Islam after reali...,https://www.poynter.org/?ifcn_misinformation=c...,https://www.poynter.org/ifcn-covid-19-misinfor...,FactCrescendo,India,False,Chinese converting to Islam after realizing t...,2020/02/20,India,,,,https://english.factcrescendo.com/2020/02/20/c...,poynter,Are Chinese people converting to Islam in fear...,"The fact behind every news!, Ever since the Wo...",,en
3,FC4,False: Bat market and bat meat are being sold ...,https://www.poynter.org/?ifcn_misinformation=b...,https://www.poynter.org/ifcn-covid-19-misinfor...,France 24 Observers,France,False,Bat market and bat meat are being sold in Wuhan.,2020/01/27,France,,,,https://observers.france24.com/fr/20200130-int...,poynter,"La soupe à la chauve-souris, un plat prisé en ...","عربي, English, Français, Contribuer, فارسی, عر...",,fr
4,FC5,False: You can self-diagnose COVID-19 by holdi...,https://www.poynter.org/?ifcn_misinformation=y...,https://www.poynter.org/ifcn-covid-19-misinfor...,Agência Lupa,Brazil,False,You can self-diagnose COVID-19 by holding you...,2020/03/16,Brazil,,,,https://piaui.folha.uol.com.br/lupa/2020/03/16...,poynter,#Verificamos: É falso que quem consegue prende...,", “O novo CORONA VÍRUS pode não mostrar sinais...",,pt


Get a list of the column names

In [7]:
list(df.columns.values)

['ID',
 'ref_category_title',
 'ref_url',
 'pageid',
 'verifiedby',
 'country',
 'class',
 'title',
 'published_date',
 'country1',
 'country2',
 'country3',
 'country4',
 'article_source',
 'ref_source',
 'source_title',
 'content_text',
 'category',
 'lang']

Clean up some of the abbreviations in the dataset

In [8]:
df["lang"]= df["lang"].replace('en', "English")
df["lang"]= df["lang"].replace('es', "Spanish")
df["lang"]= df["lang"].replace('fr', "French")
df["lang"]= df["lang"].replace('pt', "Portuguese")
df["lang"]= df["lang"].replace('tr', "Turkish")
df["lang"]= df["lang"].replace('hi', "Hindi")
df["lang"]= df["lang"].replace('zh-tw', "Chinese")
df["lang"]= df["lang"].replace('hr', "Croatian")
df["lang"]= df["lang"].replace('te', "Telugu")
df["lang"]= df["lang"].replace('it', "Italian")
df["lang"]= df["lang"].replace('mk', "Macedonian")
df["lang"]= df["lang"].replace('de', "German")
df["lang"]= df["lang"].replace('ar', "Arabic")
df["lang"]= df["lang"].replace('id', "Indonesian")
df["lang"]= df["lang"].replace('ml', "Malayalam")
df["lang"]= df["lang"].replace('ja', "Japanese")
df["lang"]= df["lang"].replace('ta', "Tamil")
df["lang"]= df["lang"].replace('ko', "Korean")
df["lang"]= df["lang"].replace('lt', "Lithuanian")
df["lang"]= df["lang"].replace('pl', "Polish")
df["lang"]= df["lang"].replace('da', "Danish")
df["lang"]= df["lang"].replace('mr', "Marathi")
df["lang"]= df["lang"].replace('tl', "Tagalog")
df["lang"]= df["lang"].replace('ru', "Russian")
df["lang"]= df["lang"].replace('nl', "Dutch")
df["lang"]= df["lang"].replace('fa', "Persian")
df["lang"]= df["lang"].replace('bn', "Bengali")
df["lang"]= df["lang"].replace('el', "Greek")
df["lang"]= df["lang"].replace('lv', "Latvian")
df["lang"]= df["lang"].replace('gu', "Gujarati")
df["lang"]= df["lang"].replace('et', "Estonian")
df["lang"]= df["lang"].replace('uk', "Ukrainian")
df["lang"]= df["lang"].replace('ur', "Urdu")
df["lang"]= df["lang"].replace('th', "Thai")
df["lang"]= df["lang"].replace('ca', "Catalan")
df["lang"]= df["lang"].replace('vi', "Vietnamese")
df["lang"]= df["lang"].replace('fi', "Finnish")
df.head()

Unnamed: 0,ID,ref_category_title,ref_url,pageid,verifiedby,country,class,title,published_date,country1,country2,country3,country4,article_source,ref_source,source_title,content_text,category,lang
0,FC1,FALSE: The coronavirus is an amplified bacteri...,https://www.poynter.org/?ifcn_misinformation=t...,https://www.poynter.org/ifcn-covid-19-misinfor...,La Silla Vacía,Colombia,False,The coronavirus is an amplified bacteria rela...,2020/06/17,Colombia,,,,https://lasillavacia.com/detector-video-falso-...,poynter,Detector a video falso que dice que el Covid e...,La Silla Vacía usa Cookies para mejorar la exp...,,Spanish
1,FC2,FALSE: A law allows people to go for a run dur...,https://www.poynter.org/?ifcn_misinformation=a...,https://www.poynter.org/ifcn-covid-19-misinfor...,Newtral.es,Spain,False,A law allows people to go for a run during th...,2020/04/09,Spain,,,,https://www.newtral.es/la-broma-de-que-a-los-r...,poynter,La broma de que a los “runners” se les permite...,En los últimos días nos ha llegado una consult...,,Spanish
2,FC3,False: Chinese converting to Islam after reali...,https://www.poynter.org/?ifcn_misinformation=c...,https://www.poynter.org/ifcn-covid-19-misinfor...,FactCrescendo,India,False,Chinese converting to Islam after realizing t...,2020/02/20,India,,,,https://english.factcrescendo.com/2020/02/20/c...,poynter,Are Chinese people converting to Islam in fear...,"The fact behind every news!, Ever since the Wo...",,English
3,FC4,False: Bat market and bat meat are being sold ...,https://www.poynter.org/?ifcn_misinformation=b...,https://www.poynter.org/ifcn-covid-19-misinfor...,France 24 Observers,France,False,Bat market and bat meat are being sold in Wuhan.,2020/01/27,France,,,,https://observers.france24.com/fr/20200130-int...,poynter,"La soupe à la chauve-souris, un plat prisé en ...","عربي, English, Français, Contribuer, فارسی, عر...",,French
4,FC5,False: You can self-diagnose COVID-19 by holdi...,https://www.poynter.org/?ifcn_misinformation=y...,https://www.poynter.org/ifcn-covid-19-misinfor...,Agência Lupa,Brazil,False,You can self-diagnose COVID-19 by holding you...,2020/03/16,Brazil,,,,https://piaui.folha.uol.com.br/lupa/2020/03/16...,poynter,#Verificamos: É falso que quem consegue prende...,", “O novo CORONA VÍRUS pode não mostrar sinais...",,Portuguese


Just focusing on explicitly fake news in English for now...

In [56]:
df2 = df.loc[df['lang'] == 'English'].copy()
df2['class']= df2['class'].replace('FALSE', 'False')
df2['class']= df2['class'].replace('false', 'False')
df3 = df2.loc[df2['class'] == 'False'].copy()
df3.head()

Unnamed: 0,ID,ref_category_title,ref_url,pageid,verifiedby,country,class,title,published_date,country1,country2,country3,country4,article_source,ref_source,source_title,content_text,category,lang
2,FC3,False: Chinese converting to Islam after reali...,https://www.poynter.org/?ifcn_misinformation=c...,https://www.poynter.org/ifcn-covid-19-misinfor...,FactCrescendo,India,False,Chinese converting to Islam after realizing t...,2020/02/20,India,,,,https://english.factcrescendo.com/2020/02/20/c...,poynter,Are Chinese people converting to Islam in fear...,"The fact behind every news!, Ever since the Wo...",,English
9,FC10,FALSE: “Governor Andy Beshear has authorized K...,https://www.poynter.org/?ifcn_misinformation=g...,https://www.poynter.org/ifcn-covid-19-misinfor...,PolitiFact,United States,False,“Governor Andy Beshear has authorized Kentuck...,2020/04/29,United States,,,,https://www.politifact.com/factchecks/2020/may...,poynter,"PolitiFact | No, Kentucky teachers won’t be co...","More Info, Trying to focus on school work at h...",,English
10,FC11,False: Photo shows food being distributed to R...,https://www.poynter.org/?ifcn_misinformation=p...,https://www.poynter.org/ifcn-covid-19-misinfor...,AfricaCheck,Kenya,False,Photo shows food being distributed to Rwandan...,2020/03/30,Kenya,,,,https://africacheck.org/fbcheck/food-distribut...,poynter,Food distribution during Rwanda’s coronavirus ...,A photo of hundreds of neat piles of bedding a...,,English
21,FC22,FALSE: Chewing raw onions can cure coronavirus.,https://www.poynter.org/?ifcn_misinformation=c...,https://www.poynter.org/ifcn-covid-19-misinfor...,GhanaFact,Ghana,False,Chewing raw onions can cure coronavirus.,2020/05/29,Ghana,,,,https://ghanafact.com/false-chewing-raw-onions...,poynter,FALSE: Chewing raw onions can cure coronavirus...,"Source: Unknown, Verdict: False, Researched by...",,English
26,FC27,"FALSE: Komal Mishra, a staff nurse at a hospit...",https://www.poynter.org/?ifcn_misinformation=k...,https://www.poynter.org/ifcn-covid-19-misinfor...,FactCrescendo,India,False,"Komal Mishra, a staff nurse at a hospital in ...",2020/04/22,India,,,,https://english.factcrescendo.com/2020/04/22/f...,poynter,Viral Posts about the Death of Nurse Komal Mis...,"The fact behind every news!, A photo of a youn...",,English


Clean the text in the content_text column
Make it all lower case, remove numbers and remove some special characters

In [57]:
def text_clean(x):
    #all lower case and remove slashes and underscores
    x = str(x).lower().replace('\\', '').replace('_', ' ').replace('/ ','')
    #use a magic regular expression to do more cleaning
    x = re.sub("(.)\1{2,}", "\1", x)
    return x

df3['title'] = df3['title'].apply(lambda x: text_clean(x))

In [75]:
text=df3.title.str.cat(sep=' ')

In [76]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

In [90]:
my_stop_words = [u" ",u'claim',u'people',u'show',u'kill',u'pandemic',u'coronavirus',u'novel_coronavirus',u'novel',u'covid-19']
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [91]:
texts, article = [], []
for w in doc:
    # if it's not a stop word or punctuation mark, add it to our article!
    if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lematized version of the word
        article.append(w.lemma_)
    # if it's a new line, it means we're onto our next document
    if w.text == '\n':
        texts.append(article)
        article = []

In [92]:
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [93]:
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [94]:
lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics

[(0,
  '0.299*"claim" + 0.237*"video" + 0.201*"india" + 0.197*"lockdown" + 0.168*"post" + 0.154*"china" + 0.146*"die" + 0.135*"vaccine" + 0.134*"country" + 0.130*"cure"'),
 (1,
  '0.190*"health" + 0.134*"claim" + 0.118*"patient" + -0.112*"trump" + -0.112*"china" + 0.103*"virus" + -0.101*"need" + -0.100*"thousand_time" + -0.099*"video_show" + -0.097*"infect"'),
 (2,
  '0.132*"quarantine" + -0.112*"video" + -0.107*"image" + 0.105*"india" + -0.094*"health" + -0.092*"share_thousand" + -0.090*"white" + 0.090*"day" + 0.088*"$" + -0.084*"bill_gates"'),
 (3,
  '-0.162*"virus" + 0.156*"spread" + 0.155*"india" + -0.153*"die" + -0.132*"share" + -0.128*"china" + -0.125*"italy" + -0.114*"cure" + -0.109*"case" + -0.105*"test"'),
 (4,
  '-0.254*"claim" + -0.147*"death" + 0.130*"virus" + 0.128*"lockdown" + 0.110*"kill" + 0.107*"italy" + -0.106*"china" + -0.105*"state" + 0.103*"spread" + 0.103*"indian"')]

In [95]:
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [96]:
ldamodel.show_topics()

[(0,
  '0.006*"claim" + 0.005*"post" + 0.005*"video" + 0.005*"china" + 0.005*"india" + 0.004*"show" + 0.004*"lockdown" + 0.004*"country" + 0.004*"infect" + 0.004*"vaccine"'),
 (1,
  '0.006*"india" + 0.006*"claim" + 0.005*"video" + 0.005*"lockdown" + 0.005*"china" + 0.004*"die" + 0.004*"post" + 0.004*"vaccine" + 0.004*"patient" + 0.004*"italy"'),
 (2,
  '0.007*"claim" + 0.005*"india" + 0.005*"\n\n\n\n\n\t\t\t\t\t\t\t_false" + 0.005*"video" + 0.005*"lockdown" + 0.004*"country" + 0.004*"cure" + 0.003*"spread" + 0.003*"trump" + 0.003*"die"'),
 (3,
  '0.011*"claim" + 0.009*"video" + 0.008*"lockdown" + 0.007*"india" + 0.006*"china" + 0.006*"vaccine" + 0.006*"virus" + 0.005*"cure" + 0.005*"show" + 0.004*"country"'),
 (4,
  '0.008*"claim" + 0.006*"india" + 0.005*"lockdown" + 0.005*"post" + 0.005*"video" + 0.005*"china" + 0.004*"cure" + 0.004*"die" + 0.004*"show" + 0.004*"country"'),
 (5,
  '0.010*"claim" + 0.010*"video" + 0.008*"lockdown" + 0.007*"india" + 0.006*"post" + 0.006*"photo" + 0.006*

In [97]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)