# SpaCy analysis on fake_covid data

We use spaCy to do some rudimentary analysis of text from the fake_covid data set from https://github.com/Gautamshahi/FakeCovid

## Import dependencies:
 - pandas
 - numpy
 - urllib (URL handling)
 - io (StringIO)
 - re (regular expressions)
 - collections (defaultdict, Counter)
 - matplotlib and seaborn (plotting)
 - spacy

In [None]:
import pandas as pd
import numpy as np
import re

import urllib.request
from io import StringIO

from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import spacy

# Load the 'en_core_web_sm' spaCy pipeline once; `nlp` is reused below to parse every article.
nlp = spacy.load('en_core_web_sm')

## Get Data

Get data from URL, load into dataframe and preview

In [None]:
# Raw-file address: the github.com/.../blob/... form serves an HTML page, not the CSV itself.
# NOTE(review): the introduction cites the FakeCovid data set, but this URL points at the
# ReCOVery repository -- confirm the file has the 'lang' and 'content_text' columns
# that the cleaning step relies on.
URL = 'https://raw.githubusercontent.com/apurvamulay/ReCOVery/master/dataset/recovery-news-data.csv'

# Download the file; the context manager closes the connection once the body is read
with urllib.request.urlopen(URL) as response:
    data = response.read()
text = data.decode('utf-8')

# Create dataframe. The first row of the file is used as the header so that
# columns can be addressed by name (e.g. df['lang']).
df = pd.read_csv(StringIO(text), sep=',')

# Uncomment this line to read from a local source for offline work
#df = pd.read_csv('FakeCovid_July2020.csv')

df.head()

## Data cleaning

Keep only the English-language documents, then clean the content text

In [None]:
# Keep only the rows whose 'lang' value is 'en'; copy so later edits leave df untouched
df2 = df[df['lang'].eq('en')].copy()
def text_clean(x):
    """
    Normalise one piece of text before it is parsed.

    The value is converted with ``str()`` first, so non-string input is
    accepted (``None`` becomes ``'none'``).

    :param x: text to clean (any object; it is stringified)
    :return: lower-cased string in which every backslash-plus-space pair has
        been removed, underscores have been turned into spaces, and any run
        of four or more identical letters a-z has been shortened to two
    """
    # Lower-case, drop "backslash + space" pairs, turn underscores into spaces.
    # The backslash is written as '\\' so the literal is a valid escape sequence.
    x = str(x).lower().replace('\\ ', '').replace('_', ' ')
    # Collapse runs of 4+ identical letters down to two (e.g. 'sooooo' -> 'soo')
    x = re.sub(r'([a-z])\1{3,}', r'\1\1', x)
    return x

# Clean every article body in place; the function is passed directly, no wrapper needed
df2['content_text'] = df2['content_text'].apply(text_clean)

## Text processing

Process texts with Spacy

In [None]:
# Parse every cleaned article. nlp.pipe streams the texts through the pipeline
# in batches instead of making one separate nlp(...) call per article.
covid_arts = list(nlp.pipe(df2['content_text']))

Function to locate entities matching a given tag

In [None]:
def find_entity_occurences(doc,tag = 'ORG'):
    """
    Count how often each entity with label `tag` appears across `doc`.

    :param doc: iterable of parsed articles; each must expose `.ents`, whose
        items carry `.label_` and `.lemma_`
    :param tag: entity label to keep (default 'ORG')
    :return: list of (lemma, count) tuples, most frequent first, e.g.
        [('who', 12), ('cdc', 7)]
    """
    # Feed the lemma of every matching entity straight into a Counter
    tally = Counter(
        entity.lemma_
        for article in doc
        for entity in article.ents
        if entity.label_ == tag
    )
    return tally.most_common()

# Show the 20 most frequent entities for the ORG label, then for the GPE label
for label in ('ORG', 'GPE'):
    print(find_entity_occurences(covid_arts, label)[:20])

### Create list of common entities
We can either grab the most common entities as identified by spaCy, or we can define for ourselves a list of entities that we think are important. These operate essentially like filters to identify articles/documents that are relevant or otherwise germane to our investigation.

In [None]:
# Keep only the names (dropping the counts) of the 20 most frequent ORG and GPE entities
common_groups = [name for name, _count in find_entity_occurences(covid_arts, 'ORG')[:20]]
common_locations = [name for name, _count in find_entity_occurences(covid_arts, 'GPE')[:20]]

In [None]:
# Hand-picked organisations to track. Assigning here replaces any earlier value
# of common_groups. Entries are compared for exact equality with entity lemmas
# in the analysis below, so e.g. 'who' and 'world health organisation' are
# tallied as two separate groups.
common_groups = [
    'afp',
    'cdc',
    'world health organisation',
    'who',
    'cnn',
    'fox news',
    'new york times',
    'trump administration',
    'the white house',
    'congress',
    'senate'
]

# Hand-picked places to track; same exact-match rule as for common_groups
# (so 'us', 'united states' and 'america' are separate entries).
common_locations = [
    'india',
    'england', 
    'united states', 
    'us', 
    'uk', 
    'china',
    'italy',
    'spain',
    'canada',
    'europe',
    'asia',
    'america'
]

## Data Analysis

Count the co-incidence of various entities within the corpus

In [None]:
# Maps each group of interest to a Counter of the locations it shares an article with
group_location_dict = defaultdict(Counter)

# go through each article in the corpus
for art in covid_arts:

    # Entities of interest found in this article. dict.fromkeys drops repeats
    # while keeping first-seen order, so each entity counts once per article.
    groups = list(dict.fromkeys(
        ent.lemma_ for ent in art.ents
        if ent.label_ == 'ORG' and ent.lemma_ in common_groups
    ))
    locations = list(dict.fromkeys(
        ent.lemma_ for ent in art.ents
        if ent.label_ == 'GPE' and ent.lemma_ in common_locations
    ))

    # every (group, location) pair present in this article adds one to the tally
    for found_entity in groups:
        for found_location in locations:
            group_location_dict[found_entity][found_location] += 1

In [None]:
# Transform the dictionary into a pandas DataFrame: one column per group, one
# row per location. Pairs that never occurred together are NaN at this point,
# so no integer dtype is requested here (an integer column cannot hold NaN).
group_location_df = pd.DataFrame.from_dict(dict(group_location_dict))
# Fill the NaN values with zeroes, then convert the counts to integers
group_location_full_df = group_location_df.fillna(value=0).astype(int)
# Show DF to console
group_location_full_df

## Data visualisation

Using the seaborn library, we can make a slightly more elegant (and exportable) figure than merely displaying the dataframe we created.

In [None]:
# Draw the co-occurrence counts as an annotated heatmap straight from the DataFrame
fig, ax = plt.subplots(figsize=(14, 7))
hmap = sns.heatmap(
    group_location_full_df,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    cbar=False,
    ax=ax,
)

# Title and slanted x tick labels, set through the Axes object
ax.set_title('Global distribution of groups appearing in fake news')
plt.setp(ax.get_xticklabels(), rotation=30)
plt.show()

Even more fancy...

In [None]:
# Hide the cells that were NaN before zero-filling, i.e. the pairs never seen together
heat_mask = group_location_df.isnull()

# Called on its own, axes_style only returns the style settings; used as a
# context manager it applies them to the axes created inside the block.
with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(14,7))

hmap = sns.heatmap(group_location_full_df, ax=ax, annot=True, fmt='d', cmap='YlGnBu', cbar=False, mask=heat_mask)

# Add features using the under the hood plt interface
plt.title('Global distribution of groups appearing in fake news')
plt.xticks(rotation=30)
plt.show()

In [None]:
# Write the figure currently bound to `fig` to 'fake_news.png'
fig.savefig('fake_news.png')