# Laboratorio #2 – Detección de SPAM
### Jose Hernandez 20053
### Javier Mombiela 20067

In [10]:
# importando librerias
import re
import nltk
import pandas as pd
import contractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## Parte 1 – Ingeniería de características 
### 1.1 - Exploración de datos y Pre-procesamiento

In [11]:
# Cargando datos y eliminando columnas vacias
data = pd.read_csv("spam.csv", encoding='latin-1')
data = data.iloc[:, :2]

data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Descargar recursos de NLTK (stopwords y lemmatizer)
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rjmom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rjmom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Aplicando tecnicas de pre-procesamiento de lenguaje natural

In [13]:
# Convertir texto a minúsculas en la columna 'v2'
data['v2'] = data['v2'].str.lower()

data.head()

Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [14]:
# Remover caracteres especiales, números y símbolos
data['v2'] = data['v2'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

data.head()

Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [15]:
# Expandir contracciones
data['v2'] = data['v2'].apply(lambda x: contractions.fix(x))

data.head()

In [16]:
# Eliminando stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
data['v2'] = data['v2'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rjmom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,v1,v2
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [19]:
# Stemming
porter_stemmer = PorterStemmer()

def stem_text(text):
    return ' '.join([porter_stemmer.stem(word) for word in text.split()])

data['v2'] = data['v2'].apply(stem_text)

data.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though
5,spam,freemsg hey darl week word back id like fun st...
6,ham,even brother like speak treat like aid patent
7,ham,per request mell mell oru minnaminungint nurun...
8,spam,winner valu network custom select receivea pri...
9,spam,mobil month u r entitl updat latest colour mob...
