# Laboratorio #2 – Detección de SPAM
### Jose Hernandez 20053
### Javier Mombiela 20067

In [22]:
# importando librerias
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Parte 1 – Ingeniería de características 
### 1.1 - Exploración de datos y Pre-procesamiento

In [23]:
# Cargando datos y eliminando columnas vacias
data = pd.read_csv("spam.csv", encoding='latin-1')
data = data.iloc[:, :2]

data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
# Descargar recursos de NLTK (stopwords y lemmatizer)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rjmom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rjmom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rjmom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Aplicando tecnicas de pre-procesamiento de lenguaje natural

In [25]:
# Funcion para procesar el texto
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Tokenización
    tokens = word_tokenize(text)
    # Eliminar stopwords y caracteres especiales
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word.isalpha()]
    # Expansión de contracciones (por ejemplo, "don't" a "do not")
    tokens = [re.sub(r"n't", " not", word) for word in tokens]
    # Lemmatización
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Unir los tokens preprocesados en un texto nuevamente
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [27]:
# Aplicar la función de preprocesamiento a la columna 'v2' (mensajes)
data['v2_preprocessed'] = data['v2'].apply(preprocess_text)

# Mostrar el resultado del preprocesamiento
data[['v2', 'v2_preprocessed']].head()

Unnamed: 0,v2,v2_preprocessed
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts may...
3,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though


Agregando caracteristicas relevantes

In [28]:
data['Word_Count'] = data['v2_preprocessed'].apply(lambda x: len(str(x).split()))
data['Stopword_Count'] = data['v2_preprocessed'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords.words('english')]))
data['Stopword_Ratio'] = data['Stopword_Count'] / data['Word_Count']
data['Unique_Stopword_Count'] = data['v2_preprocessed'].apply(lambda x: len(set([word for word in str(x).lower().split() if word in stopwords.words('english')])))

data = data[['v1', 'v2_preprocessed', 'Word_Count', 'Stopword_Count', 'Stopword_Ratio', 'Unique_Stopword_Count']]
data.head()

Unnamed: 0,v1,v2_preprocessed,Word_Count,Stopword_Count,Stopword_Ratio,Unique_Stopword_Count
0,ham,go jurong point crazy available bugis n great ...,16,0,0.0,0
1,ham,ok lar joking wif u oni,6,0,0.0,0
2,spam,free entry wkly comp win fa cup final tkts may...,20,0,0.0,0
3,ham,u dun say early hor u c already say,9,0,0.0,0
4,ham,nah think go usf life around though,7,0,0.0,0
