In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import Phrases
#https://www.datascience.com/resources/notebooks/word-embeddings-in-python
from collections import defaultdict
import nltk
from nltk.corpus import wordnet
import re
import numpy as np
from nltk.corpus import stopwords
import ast

In [2]:
# Read the raw snippet/claim dataset and preview its first two rows.
dataset_path = 'data/00_dataset.csv'
df_snippets = pd.read_csv(dataset_path)
df_snippets.head(2)

Unnamed: 0,claim_id,snippet_content,snippet_date,snippet_id,snippet_pagenum,snippet_title,snippet_url,claim_content,claim_date,claim_label,claim_tag,claim_url,date_number
0,3,"News on Japan, Business News, Opinion, Sports,...",17636,0,0,Article expired | The Japan Times,https://www.japantimes.co.jp/article-expired/,Black and Latino people in NYC are arrested at...,17646,True,— PolitiFact New York,/new-york/statements/2018/apr/25/kirsten-gilli...,0
1,3,Get the latest breaking news across the U.S. o...,17636,1,0,"U.S. News | Latest National News, Videos …",https://abcnews.go.com/US/,Black and Latino people in NYC are arrested at...,17646,True,— PolitiFact New York,/new-york/statements/2018/apr/25/kirsten-gilli...,0


In [3]:
## We preprocess quotes and double quotes in the content of our snippets and claims

# Typographic variants that are normalised to the ASCII apostrophe (').
s_quotes = ['`','´','ʹ','ʻ','ʼ','ʽ','ˊ','ˋ','˴','ʹ','΄','ՙ','՚','՛','՜','՝','‘','’','‛','′','‵', 'ߴ','י' ,'׳', 'ߵ']
# Typographic variants that are normalised to the ASCII double quote (").
d_quotes = ['ʺ','˝','ˮ','˵','˶','ײ','״', '“', '”', '‟', '″', '‴', '‶', '‷','``']

def replace_quotes(st):
    """Normalise typographic quote characters in ``st`` to ASCII quotes.

    Every entry of ``d_quotes`` becomes ``"`` and every entry of ``s_quotes``
    becomes ``'``.

    Double-quote variants are processed first on purpose: ``d_quotes``
    contains the two-backtick sequence, while ``s_quotes`` contains the single
    backtick. Replacing single quotes first would turn the two backticks into
    two apostrophes, so the ``d_quotes`` entry could never match.

    Parameters
    ----------
    st : str
        Text to normalise.

    Returns
    -------
    str
        The text with quote variants replaced.
    """
    for c in d_quotes:
        st = st.replace(c, '"')
    for c in s_quotes:
        st = st.replace(c, "'")
    return st

# Normalise quote characters in both text columns.
df_snippets['snippet_content'] = df_snippets['snippet_content'].apply(replace_quotes)
df_snippets['claim_content'] = df_snippets['claim_content'].apply(replace_quotes)

In [4]:
## We remove garbage characters from the content of our snippets and claims

# Individual characters that are dropped wherever they occur.
garbage = "…[]•™💕🙂🇮🇩→♦☺~∼^★≈≥⌂℠„♫⊕†☆®©¬〇"
# Multi-character sequences that are dropped as a unit. The ASCII ellipsis
# lives here rather than in `garbage`: `garbage` is scanned one character at a
# time, so listing "..." there deletes every single period in the text.
garbage_seqs = ["..."]

def rm_garbage(st):
    """Strip garbage sequences and characters from ``st``.

    Sequences in ``garbage_seqs`` are removed first, then every individual
    character of ``garbage``. Single periods are left in place.

    Parameters
    ----------
    st : str
        Text to clean.

    Returns
    -------
    str
        The text with garbage removed; nothing is inserted in its place.
    """
    for seq in garbage_seqs:
        st = st.replace(seq, '')
    for c in garbage:
        st = st.replace(c, '')
    return st

# Strip garbage characters from both text columns.
for text_col in ('snippet_content', 'claim_content'):
    df_snippets[text_col] = df_snippets[text_col].apply(rm_garbage)

In [5]:
## We lower the content of our snippets and claims

# Lowercase both text columns so that later matching is case-insensitive.
for text_col in ('snippet_content', 'claim_content'):
    df_snippets[text_col] = df_snippets[text_col].apply(lambda text: text.lower())

In [6]:
## We preprocess format errors on the content of our snippets and claims 

# Ordered (pattern, replacement) pairs for encoding debris and escaped
# whitespace. Order matters: longer mojibake sequences must be handled before
# their prefixes ('â€™s' before 'â€' before 'â'), and the escaped whitespace
# sequences before the lone backslash.
# NOTE(review): '™' is already stripped by rm_garbage in an earlier cell, so
# 'â€™s' may never match by the time this runs — confirm the cell order.
format_fixes = [
    ('â€™s', ''),
    ('€€€', ''),
    ('â€œ', ''),
    ('â€', ''),
    ('â', ''),
    # Escaped whitespace (backslash followed by n/r/t) separates words, so it
    # becomes a space; replacing it with '' would glue neighbouring words.
    ('\\n', ' '),
    ('\\r', ' '),
    ('\\t', ' '),
    ('\\', ''),
]

def fix_format_errors(st):
    """Apply every replacement in ``format_fixes`` to ``st``, in order.

    Parameters
    ----------
    st : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    for pattern, replacement in format_fixes:
        st = st.replace(pattern, replacement)
    return st

# The same clean-up is applied to both text columns.
for text_col in ('snippet_content', 'claim_content'):
    df_snippets[text_col] = df_snippets[text_col].apply(fix_format_errors)

In [7]:
## We preprocess abbreviations on the content of our snippets and claims

# Literal abbreviation spellings, grouped by the phrase they expand to.
usa_abr = ['u.s.','u.s','u.s…','u.s.-','u.s.—','-u.s.',
 'u.s..', '\'u.s']
unitednations_abr = ['u.n.']
losangeles_abr = ['l.a.', 'l.a','l.a.','l.a']
unitedkingdom_abr = ['u.k']
newyork_abr = ['n.y.','n.y']
europeanunion_abr = ['e.u']

def _compile_abr(abrs):
    """Compile literal abbreviations into one boundary-aware regex.

    Duplicates are dropped and longer spellings are tried first, so that
    'u.s.' wins over 'u.s'. The lookarounds reject a match that is glued to
    surrounding word characters, or that is one segment of a longer dotted
    token (preceded by ``<word char>.`` or followed by ``.<word char>`` /
    ``<word char>``). This stops 'l.a' from firing inside 'final.and' and
    'u.s' inside 'menu.so'.
    """
    alternatives = sorted(set(abrs), key=lambda a: (-len(a), a))
    return re.compile(
        r'(?<!\w)(?<!\w\.)(?:'
        + '|'.join(re.escape(a) for a in alternatives)
        + r')(?!\.?\w)'
    )

# (compiled pattern, expansion) pairs, applied in this order by replace_abr.
abr_expansions = [
    (_compile_abr(usa_abr), 'united states'),
    (_compile_abr(unitednations_abr), 'united nations'),
    (_compile_abr(losangeles_abr), 'los angeles'),
    (_compile_abr(unitedkingdom_abr), 'united kingdom'),
    (_compile_abr(newyork_abr), 'new york'),
    (_compile_abr(europeanunion_abr), 'european union'),
]

def replace_abr(st):
    """Expand the known dotted abbreviations in ``st`` to their full names.

    Matching is case-sensitive and the abbreviation lists are lowercase.
    Only stand-alone abbreviations are expanded; see ``_compile_abr`` for the
    boundary rules.

    Parameters
    ----------
    st : str
        Text to process.

    Returns
    -------
    str
        The text with abbreviations expanded.
    """
    for pattern, expansion in abr_expansions:
        st = pattern.sub(expansion, st)
    return st

# Expand abbreviations in both text columns.
df_snippets['snippet_content'] = df_snippets['snippet_content'].apply(replace_abr)
df_snippets['claim_content'] = df_snippets['claim_content'].apply(replace_abr)

In [8]:
## We preprocess separating characters (e.g. and/or -> and or)

# Every character of this string is treated as a word separator.
str_sep = '*:•/_–—\\-‐,.|‑…→]―=[~+'

def rm_sep(st):
    """Return ``st`` with each separator character replaced by one space.

    Each character of ``str_sep`` is handled independently, so a run of
    separators yields a run of spaces.
    """
    for sep_char in str_sep:
        st = st.replace(sep_char, ' ')
    return st

# Replace separator characters with spaces in both text columns.
for text_col in ('snippet_content', 'claim_content'):
    df_snippets[text_col] = df_snippets[text_col].apply(rm_sep)

In [9]:
## We tokenize the contents of our snippets and claims

# Turn each text into a list of tokens with NLTK's word tokenizer.
df_snippets['snippet_content'] = df_snippets['snippet_content'].apply(nltk.word_tokenize)
df_snippets['claim_content'] = df_snippets['claim_content'].apply(nltk.word_tokenize)

In [10]:
## We apply pos-tagging on our snippets and claims

# Tag each token list with NLTK's part-of-speech tagger.
for text_col in ('snippet_content', 'claim_content'):
    df_snippets[text_col] = df_snippets[text_col].apply(lambda tokens: nltk.pos_tag(tokens))

In [11]:
## Here is a list of all the pos-tags

# Collect the distinct POS tags that occur in the claims and print them.
claim_tags = {tagged_token[1]
              for tagged_claim in df_snippets['claim_content']
              for tagged_token in tagged_claim}
print(list(claim_tags))

['PRP$', 'IN', ')', 'PDT', 'RBR', 'NNPS', '.', "''", 'VBD', 'MD', 'UH', '(', 'FW', 'RB', 'TO', '``', '#', 'JJ', 'VB', 'WDT', 'RBS', ':', 'VBG', 'PRP', 'WRB', 'DT', 'NNS', 'WP', 'EX', 'NNP', 'CC', 'RP', 'JJR', 'VBP', 'VBN', 'NN', 'POS', 'CD', 'WP$', '$', 'JJS', 'VBZ']


In [12]:
## We remove pos-tags that we consider irrelevant

# POS tags whose tokens are discarded.
worthless_pos_tags = [',', 'PRP$', 'WRB', 'RP', 'TO','(','SYM','WDT', '``', 'WP$', ')', 'EX',
                     'LS', ':', 'WP', 'MD', 'CD', '$', 'IN', '#', "''", 'FW', 'POS', 'DT',
                     '.', 'PDT', 'CC', 'UH', 'PRP']

# For each column, drop tokens with a discarded tag and keep only the word
# (first element) of each remaining (word, tag) pair.
for text_col in ('snippet_content', 'claim_content'):
    df_snippets[text_col] = df_snippets[text_col].apply(
        lambda tagged: [pair[0] for pair in tagged if pair[1] not in worthless_pos_tags]
    )

In [13]:
## We keep only words that match a pattern of interest

# A token is kept only if it is made up of lowercase ASCII letters and apostrophes.
pattern_interest = re.compile("^[a-z']+$")

for text_col in ('snippet_content', 'claim_content'):
    df_snippets[text_col] = df_snippets[text_col].apply(
        lambda tokens: [tok for tok in tokens if pattern_interest.match(tok)]
    )

In [14]:
## We identify phrases (pairs of words that frequently appear together)

# Train the phrase detector on the tokenised snippets only; the fitted model
# is then applied to both snippets and claims.
# `Series.as_matrix()` was removed in pandas 1.0, so the column is turned into
# a plain list of token lists with `.tolist()` instead.
bigrams = Phrases(df_snippets['snippet_content'].tolist())

df_snippets['snippet_content'] = df_snippets['snippet_content'].apply(lambda tb: bigrams[tb])
df_snippets['claim_content'] = df_snippets['claim_content'].apply(lambda tb: bigrams[tb])



In [15]:
## We remove stopwords

# English stopwords, held in a set for fast membership tests.
stopWords = set(stopwords.words('english'))

# Drop every stopword token from both columns.
df_snippets['snippet_content'] = df_snippets['snippet_content'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopWords]
)
df_snippets['claim_content'] = df_snippets['claim_content'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopWords]
)

NameError: name 'df_claims' is not defined

In [18]:
## We export our new data set

# Write the processed frame to disk without the index column.
export_path = 'data/02_bow_snippets_claims.csv'
df_snippets.to_csv(export_path, index=False)

In [13]:
## We create an index of all the terms in the contents of our snippets and claims

# Build the term index from the tokenised snippets.
# `Series.as_matrix()` was removed in pandas 1.0; `.tolist()` yields the same
# list of token lists.
dictionary = corpora.Dictionary(df_snippets['snippet_content'].tolist())
# Filter out terms by document frequency using the thresholds given here.
dictionary.filter_extremes(no_below=5, no_above=0.5)

194658


In [22]:
## We create lexicons of different sizes by keeping only the most frequent terms

# Target lexicon sizes, visited from largest to smallest: the same dictionary
# object is filtered again on every pass, so each save is a subset of the last.
lexicon_sizes = np.arange(10000, 40000, 5000)[::-1]
for nb_word in lexicon_sizes:
    dictionary.filter_extremes(keep_n=nb_word)
    # NOTE(review): with Python 3 true division the size renders as e.g.
    # '35.0' in the file name — confirm this is what downstream loaders expect.
    dict_path = 'dictionaries/02_' + str(nb_word/1000) + '.dict'
    dictionary.save(dict_path)

35000
30000
25000
20000
15000
10000
