## Imports

In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jabel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocesamiento

In [3]:
# Recogemos	los datos de la base de datos
# y los guardamos en un dataframe
df = pd.read_csv('../data/train.csv')

In [4]:
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [5]:
df['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [6]:
# Limpiar texto
def clean_text(text):
	# Eliminar caracteres especiales
	text = re.sub(r'\W', ' ', str(text))

	# Eliminar palabras con números
	text = re.sub(r'\w*\d\w*', ' ', text)

	# Eliminar espacios en blanco
	text = re.sub(r'\s{2,}', ' ', text)

	# Eliminar stopwords
	stop_words = set(stopwords.words('english'))
	text = text.split()
	text = [w for w in text if not w in stop_words]
	text = " ".join(text)

	# Stemming
	text = text.split()
	stemmer = SnowballStemmer('english')
	stemmed_words = [stemmer.stem(word) for word in text]
	text = " ".join(stemmed_words)
 
	return text

In [7]:
for i in range(0,10):
    print(clean_text(df['title'][i]))

hous dem aid we didn even see comey letter until jason chaffetz tweet it
flynn hillari clinton big woman campus breitbart
whi truth might get you fire
civilian kill in singl us airstrik have been identifi
iranian woman jail fiction unpublish stori woman stone death adulteri
jacki mason hollywood would love trump he bomb north korea lack tran bathroom exclus video breitbart
life life of luxuri elton john favorit shark pictur to stare at dure long transcontinent flight
benoît hamon win french socialist parti presidenti nomin the new york time
excerpt from draft script donald trump q ampa with black church pastor the new york time
a back channel plan ukrain russia courtesi trump associ the new york time


In [8]:
len(df)

20800

In [15]:
# texto limpio en un nuevo dataframe
df2 = pd.DataFrame()
df2['title'] = df['title'].apply(clean_text)
df2['text'] = df['text'].apply(clean_text)

# tiempo de ejecución => 3 minutos


In [16]:
df2['label'] = df['label']

In [52]:
# Calculo de palabras a procesar
num_palabras_titulos = 0
num_palabras_textos = 0
for i in range(0,len(df)):
	if(type(df2['text'][i]) == str):
		num_palabras_textos = num_palabras_textos + len(df['text'][i])
	if(type(df2['title'][i]) == str):
		num_palabras_titulos = num_palabras_titulos + len(df['title'][i])
print('Palabras totales titulos: ' + str(num_palabras_titulos))
print('Palabras por titulo: ' + str(num_palabras_titulos/len(df)))
print('Palabras totales noticias: ' + str(num_palabras_textos))
print('Palabras por noticia: ' + str(num_palabras_textos/len(df)))

# Tiempo aproximado de procesamiento de todas las noticias y titulos de entrenamiento => 25 minutos

Palabras totales titulos: 1504372
Palabras por titulo: 72.32557692307692
Palabras totales noticias: 94518924
Palabras por noticia: 4544.179038461539


## Ponderación

In [17]:
true_text = {}
fake_text = {}

true = df2[df2['label'] == 0]
fake = df2[df2['label'] == 1]

In [18]:
def crear_diccionario(clase, diccionario):
	for titulo in clase['title']:
		if(type(titulo) == str):
			palabras = titulo.split()
			for palabra in palabras:
				if palabra in diccionario:
					diccionario[palabra] += 1
				else:
					diccionario[palabra] = 1
	return diccionario

In [19]:
def conteo(diccionario):
	length = 0
	for key in diccionario:
		length += diccionario[key]
	return length

In [20]:
def conteo_palabras(diccionario, length):
    for term in diccionario:
        diccionario[term] = diccionario[term]/length
    return diccionario

In [21]:
def probabilidades(diccionario, X, initial):
    X = clean_text(X)
    split = X.split()
    probability = initial
    for term in split:
        if term in diccionario:
            probability *= diccionario[term]
            #print(term,diccionario[term])
    return probability

In [22]:
true_text = crear_diccionario(true,true_text)
fake_text = crear_diccionario(fake,fake_text)
true_count = conteo(true_text)
fake_count = conteo(fake_text)
true_text = conteo_palabras(true_text,true_count)
fake_text = conteo_palabras(fake_text,fake_count)
total_count = true_count + fake_count
fake_initial = fake_count/total_count
true_initial = true_count/total_count

In [23]:
def prediccion(X):
	if probabilidades(true_text, X, 1) > probabilidades(fake_text, X, 1):
		return True
	else:
		return False

In [27]:
prediccion('Trump is potato')

False