# Imports

In [1]:
# NLTK supplies the English stop-word list and the Snowball stemmer used during cleaning.
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Fetch the NLTK data packages; the download is skipped when they are already up to date.
nltk.download('stopwords')
nltk.download('punkt')

import numpy as np
# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=np.inf)

import pandas as pd
import re
from wordcloud import STOPWORDS
# langdetect is used to find the dominant language of each news item.
from langdetect import detect_langs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jabel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jabel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Lectura de Datos

In [2]:
# Load the raw training set; empty fields are registered as missing values.
df = pd.read_csv(
    '../data/train.csv',
    na_values=[''],
)

In [6]:
# Merge body and title into `text`; either of them may be null, so nulls become ''.
# The untouched body is kept in `text_raw` and the merge always starts from it, which
# makes this cell idempotent: re-running it no longer appends the title a second time.
if 'text_raw' not in df.columns:
    df['text_raw'] = df['text']
df['text'] = df['text_raw'].fillna('') + ' ' + df['title'].fillna('')

### Detección de Lenguajes

In [8]:
# Detect the dominant language of a news item with langdetect, so that
# non-English items can be discarded later on.
def detectar_idioma_principal(text, max_words=50):
    """Return the language code langdetect ranks first for ``text``.

    Only the first ``max_words`` whitespace-separated words are analysed, which
    keeps detection fast on long articles.

    Parameters
    ----------
    text : str
        Text to analyse.
    max_words : int, optional
        Number of leading words handed to langdetect (default 50).

    Returns
    -------
    str or bool
        The first language code reported by ``detect_langs`` (e.g. ``'en'``), or
        ``False`` when detection fails (empty text, non-string input, ...).

    Notes
    -----
    langdetect is non-deterministic unless ``langdetect.DetectorFactory.seed``
    is set before the first detection; set it if reproducible counts matter.
    """
    try:
        sample = " ".join(text.split()[:max_words])
        return detect_langs(sample)[0].lang
    except Exception:
        # Deliberate best effort: any detection failure marks the row as False.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate
        # while the function is applied to the whole dataset.
        return False

In [10]:
# Tag every news item with its dominant language (False when detection failed)
detected_language = df['text'].map(detectar_idioma_principal)
df['language'] = detected_language

In [23]:
# Show how many news items there are per language, first for real news
# (label 0) and then for fake news (label 1)
for heading, label_value in (("Noticias verdaderas:", 0), ("Noticias falsas:", 1)):
    print(heading)
    print(df.loc[df['label'] == label_value, 'language'].value_counts())

Noticias verdaderas:
en    10386
fr        1
Name: language, dtype: int64
Noticias falsas:
en       9866
ru        156
es        142
de         96
fr         71
ar         19
pt          9
tr          7
it          6
so          5
ro          4
hr          4
nl          4
no          4
cy          3
da          2
el          2
hu          1
ca          1
False       1
id          1
zh-cn       1
sw          1
fi          1
lt          1
tl          1
et          1
pl          1
vi          1
sv          1
Name: language, dtype: int64


In [25]:
# Keep only the news items whose detected language is English
df = df.loc[df['language'].eq('en')]

### Limpieza de texto

In [26]:
# Text-cleaning helpers and the `clean_text` entry point.

# Typographic punctuation mapped to its plain-ASCII equivalent.
_TYPOGRAPHIC_TABLE = str.maketrans({
    '“': '"', '”': '"',
    '’': "'", '‘': "'", '`': "'", '´': "'",
    '–': '-', '−': '-', '—': '-', '•': '-', '·': '-',
    '…': '...',
})

_URL_PATTERN = re.compile(r'http\S+')
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_WORD_WITH_DIGIT_PATTERN = re.compile(r'\w*\d\w*')
_NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7F]+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')

# Lower-case English contractions and their expansions.
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# One alternation, longest contraction first so "can't've" wins over "can't";
# the look-arounds stop "he's" from matching inside "she's".
_CONTRACTION_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(c) for c in sorted(_CONTRACTIONS, key=len, reverse=True))
    + r")(?!\w)"
)

# Lazily filled cache so the NLTK resources are built once, not once per row.
_ENGLISH_DEFAULTS = {}


def _default_stop_words():
    """Return the cached NLTK English stop-word set."""
    if 'stop_words' not in _ENGLISH_DEFAULTS:
        _ENGLISH_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
    return _ENGLISH_DEFAULTS['stop_words']


def _default_stemmer():
    """Return the cached English Snowball stemmer."""
    if 'stemmer' not in _ENGLISH_DEFAULTS:
        _ENGLISH_DEFAULTS['stemmer'] = SnowballStemmer('english')
    return _ENGLISH_DEFAULTS['stemmer']


def _expand_contractions(text):
    """Replace every known lower-case contraction in ``text`` by its expansion."""
    return _CONTRACTION_PATTERN.sub(lambda m: _CONTRACTIONS[m.group(0)], text)


def clean_text(text, stop_words=None, stemmer=None):
    """Normalise a news text into a space-separated string of stemmed tokens.

    Steps, in order: map typographic quotes/dashes/bullets to ASCII, drop URLs,
    emojis, tokens containing digits and non-ASCII characters, lower-case,
    expand contractions, strip punctuation and underscores, remove stop words
    and stem what remains.

    Contractions are expanded *before* punctuation is stripped; otherwise the
    apostrophes they rely on are already gone and no expansion can ever match.

    Parameters
    ----------
    text : object
        Text to clean. Non-string values are converted with ``str()``.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every kept token. Defaults to NLTK's English
        ``SnowballStemmer``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # Standardise typographic characters (only meaningful for real strings)
    if isinstance(text, str):
        text = text.translate(_TYPOGRAPHIC_TABLE)
    text = str(text)

    text = _URL_PATTERN.sub(' ', text)              # remove URLs
    text = _EMOJI_PATTERN.sub('', text)             # remove emojis
    text = _WORD_WITH_DIGIT_PATTERN.sub(' ', text)  # remove tokens containing digits
    text = _NON_ASCII_PATTERN.sub(' ', text)        # remove non-ASCII characters

    # Lower-case first so the (lower-case) contraction table matches, and expand
    # while the apostrophes are still present.
    text = text.lower()
    text = _expand_contractions(text)

    text = _PUNCTUATION_PATTERN.sub(' ', text)      # remove punctuation
    text = text.replace('_', ' ')                   # remove underscores

    if stop_words is None:
        stop_words = _default_stop_words()
    if stemmer is None:
        stemmer = _default_stemmer()

    # split() already collapses any run of whitespace left by the substitutions
    return " ".join(
        stemmer.stem(token) for token in text.split() if token not in stop_words
    )

In [27]:
# Build a new dataframe holding the cleaned text next to its label
df_clean = pd.DataFrame({
    'text': df['text'].apply(clean_text),
    'label': df['label'],
})

### Almacenamos las noticias limpias en un nuevo csv

In [29]:
# Persist the cleaned news items, leaving the dataframe index out of the file
df_clean.to_csv(
    '../data/train_clean.csv',
    index=False,
)