## Imports

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jabel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocesamiento

In [2]:
# Recogemos	los datos de la base de datos
# y los guardamos en un dataframe
df = pd.read_csv('../data/train.csv')

In [3]:
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [4]:
df['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [5]:
# Limpiar texto
def clean_text(text):
    # Eliminar urls
	text = re.sub(r'http\S+', ' ', str(text))
 
 	# Eliminar signos de puntuación
	text = re.sub(r'[^\w\s]', ' ', str(text))
 
	# Eliminar emojis
	emoji_pattern = re.compile("["
			u"\U0001F600-\U0001F64F" # emoticons
			u"\U0001F300-\U0001F5FF" # symbols & pictographs
			u"\U0001F680-\U0001F6FF" # transport & map symbols
			u"\U0001F1E0-\U0001F1FF" # flags (iOS)
			u"\U00002702-\U000027B0"
			u"\U000024C2-\U0001F251"
			"]+", flags=re.UNICODE)
	text = emoji_pattern.sub(r'', text)
 
	# Eliminar contracciones
	text = re.sub(r"won\'t", " will not", text)
	text = re.sub(r"won\'t've", " will not have", text)
	text = re.sub(r"can\'t", " can not", text)
	text = re.sub(r"don\'t", " do not", text)
	text = re.sub(r"can\'t've", " can not have", text)
	text = re.sub(r"ma\'am", " madam", text)
	text = re.sub(r"let\'s", " let us", text)
	text = re.sub(r"ain\'t", " am not", text)
	text = re.sub(r"shan\'t", " shall not", text)
	text = re.sub(r"sha\n't", " shall not", text)
	text = re.sub(r"o\'clock", " of the clock", text)
	text = re.sub(r"y\'all", " you all", text)
	text = re.sub(r"n\'t", " not", text)
	text = re.sub(r"n\'t've", " not have", text)
	text = re.sub(r"\'re", " are", text)
	text = re.sub(r"\'s", " is", text)
	text = re.sub(r"\'d", " would", text)
	text = re.sub(r"\'d've", " would have", text)
	text = re.sub(r"\'ll", " will", text)
	text = re.sub(r"\'ll've", " will have", text)
	text = re.sub(r"\'t", " not", text)
	text = re.sub(r"\'ve", " have", text)
	text = re.sub(r"\'m", " am", text)
	text = re.sub(r"\'re", " are", text)
 
	# Eliminar caracteres especiales
	text = re.sub(r'\W', ' ', str(text))

	# Eliminar palabras con números
	text = re.sub(r'\w*\d\w*', ' ', text)

	# Eliminar espacios en blanco
	text = re.sub(r'\s{2,}', ' ', text)

	# Eliminar stopwords
	stop_words = set(stopwords.words('english'))
	text = text.split()
	text = [w for w in text if not w in stop_words]
	text = " ".join(text)

	# Stemming
	text = text.split()
	stemmer = SnowballStemmer('english')
	stemmed_words = [stemmer.stem(word) for word in text]
	text = " ".join(stemmed_words)
 
	return text

In [6]:
for i in range(0,10):
    print(clean_text(df['title'][i]))

hous dem aid we didn even see comey letter until jason chaffetz tweet it
flynn hillari clinton big woman campus breitbart
whi truth might get you fire
civilian kill in singl us airstrik have been identifi
iranian woman jail fiction unpublish stori woman stone death adulteri
jacki mason hollywood would love trump he bomb north korea lack tran bathroom exclus video breitbart
life life of luxuri elton john favorit shark pictur to stare at dure long transcontinent flight
benoît hamon win french socialist parti presidenti nomin the new york time
excerpt from draft script donald trump q ampa with black church pastor the new york time
a back channel plan ukrain russia courtesi trump associ the new york time


In [7]:
# texto limpio en un nuevo dataframe
df2 = pd.DataFrame()
df2['title'] = df['title'].apply(clean_text)
df2['text'] = df['text'].apply(clean_text)

# tiempo de ejecución => 4'5 minutos


In [8]:
df2['label'] = df['label']

In [10]:
# Calculo de palabras a procesar
#num_palabras_titulos = 0
#num_palabras_textos = 0
#for i in range(0,len(df)):
#	if(type(df2['text'][i]) == str):
#		num_palabras_textos = num_palabras_textos + len(df['text'][i])
#	if(type(df2['title'][i]) == str):
#		num_palabras_titulos = num_palabras_titulos + len(df['title'][i])
#print('Palabras totales titulos: ' + str(num_palabras_titulos))
#print('Palabras por titulo: ' + str(num_palabras_titulos/len(df)))
#print('Palabras totales noticias: ' + str(num_palabras_textos))
#print('Palabras por noticia: ' + str(num_palabras_textos/len(df)))

# Tiempo aproximado de procesamiento de todas las noticias y titulos de entrenamiento => 25 minutos

## Ponderación con Naive Bayes

In [11]:
true_text = {}
fake_text = {}

true = df2[df2['label'] == 0]
fake = df2[df2['label'] == 1]

In [12]:
def crear_diccionario(clase, diccionario):
	for titulo in clase['title']:
		if(type(titulo) == str):
			palabras = titulo.split()
			for palabra in palabras:
				if palabra in diccionario:
					diccionario[palabra] += 1
				else:
					diccionario[palabra] = 1
    
	for titulo in clase['text']:
		if(type(titulo) == str):
			palabras = titulo.split()
			for palabra in palabras:
				if palabra in diccionario:
					diccionario[palabra] += 1
				else:
					diccionario[palabra] = 1
	return diccionario

In [13]:
def conteo(diccionario):
	length = 0
	for key in diccionario:
		length += diccionario[key]
	return length

In [14]:
def conteo_palabras(diccionario, length):
    for term in diccionario:
        diccionario[term] = diccionario[term]/length
    return diccionario

In [15]:
def probabilidades(diccionario, X, initial):
    X = clean_text(X)
    split = X.split()
    probability = initial
    for term in split:
        if term in diccionario:
            probability *= diccionario[term]
            #print(term,diccionario[term])
    return probability

In [16]:
true_text = crear_diccionario(true,true_text)
fake_text = crear_diccionario(fake,fake_text)
true_count = conteo(true_text)
fake_count = conteo(fake_text)
true_text = conteo_palabras(true_text,true_count)
fake_text = conteo_palabras(fake_text,fake_count)
total_count = true_count + fake_count
fake_initial = fake_count/total_count
true_initial = true_count/total_count

In [17]:
def prediccion(X):
	if probabilidades(true_text, X, 1) > probabilidades(fake_text, X, 1):
		return True
	else:
		return False

In [32]:
prediccion('There is a president better than Trump')

True

In [33]:
true_text

{'flynn': 0.00012360697231222133,
 'hillari': 0.0004242923775517286,
 'clinton': 0.0015726469306923804,
 'big': 0.000612174975466305,
 'woman': 0.0004541411723471243,
 'campus': 0.0001692042109873963,
 'breitbart': 0.0011745409191267963,
 'jacki': 1.794590116532991e-05,
 'mason': 2.0875844212730713e-05,
 'hollywood': 0.0001651755392972202,
 'would': 0.004187987343378521,
 'love': 0.0005026683540697,
 'trump': 0.007043583085951527,
 'he': 0.0032112175799512786,
 'bomb': 0.00026222990274237174,
 'north': 0.0006416575273807756,
 'korea': 0.0002862188114429658,
 'lack': 0.00021681578550765933,
 'tran': 3.0031916235858218e-05,
 'bathroom': 8.826453430294915e-05,
 'exclus': 8.62501984578611e-05,
 'video': 0.0005420394637691483,
 'benoît': 1.2818500832378507e-06,
 'hamon': 3.113064487863352e-06,
 'win': 0.0005696908012789934,
 'french': 0.00022359127880477368,
 'socialist': 3.9371109699448276e-05,
 'parti': 0.0012042065924817295,
 'presidenti': 0.0005841573950755349,
 'nomin': 0.0002649767243