# Laboratorio #2 – Detección de SPAM 
## Ingeniería de características 

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

### Exploración de datos

#### Dataset 1

In [2]:
# Read the Enron spam subset CSV into the first working DataFrame.
df1 = pd.read_csv(filepath_or_buffer="datasets/enronSpamSubset.csv")

In [3]:
# Preview the first five rows to inspect the columns as loaded.
df1.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Body,Label
0,2469,2469,Subject: stock promo mover : cwtd\n * * * urge...,1
1,5063,5063,Subject: are you listed in major search engine...,1
2,12564,12564,"Subject: important information thu , 30 jun 20...",1
3,2796,2796,Subject: = ? utf - 8 ? q ? bask your life with...,1
4,1468,1468,"Subject: "" bidstogo "" is places to go , things...",1


In [4]:
# Discard the two "Unnamed" columns (they appear to be row indices persisted
# by earlier CSV exports), then preview what is left.
df1 = df1.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df1.head()

Unnamed: 0,Body,Label
0,Subject: stock promo mover : cwtd\n * * * urge...,1
1,Subject: are you listed in major search engine...,1
2,"Subject: important information thu , 30 jun 20...",1
3,Subject: = ? utf - 8 ? q ? bask your life with...,1
4,"Subject: "" bidstogo "" is places to go , things...",1


#### Dataset 2

In [5]:
# Read the SpamAssassin CSV into the second working DataFrame.
df2 = pd.read_csv(filepath_or_buffer="datasets/completeSpamAssassin.csv")

In [6]:
# Preview the first five rows to inspect the columns as loaded.
df2.head(n=5)

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [7]:
# Discard the leftover "Unnamed: 0" column (it appears to be a row index
# persisted by an earlier CSV export), then preview what is left.
df2 = df2.drop(columns=['Unnamed: 0'])
df2.head()

Unnamed: 0,Body,Label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1


Verificamos que la columna `Label` contenga solo valores válidos en ambos datasets

In [8]:
# Count every distinct label value. dropna=False also reports missing labels,
# which value_counts() leaves out by default and would therefore hide from
# this validity check.
df1['Label'].value_counts(dropna=False)

1    5000
0    5000
Name: Label, dtype: int64

In [9]:
# Count every distinct label value. dropna=False also reports missing labels,
# which value_counts() leaves out by default and would therefore hide from
# this validity check.
df2['Label'].value_counts(dropna=False)

0    4150
1    1896
Name: Label, dtype: int64

Verificamos que ambos datasets tengan las mismas columnas, en el mismo orden

In [10]:
# Both frames must expose the same column names in the same order before
# they can be stacked.
df1.columns.tolist() == df2.columns.tolist()

True

#### Dataset unificado 

In [11]:
# Stack both corpora into a single frame. ignore_index=True rebuilds a unique
# 0..n-1 index; without it each source keeps its own row labels, so every
# label present in both frames ends up duplicated in the result.
df = pd.concat([df1, df2], ignore_index=True)

# Release the per-source frames. No intermediate list is kept around, because
# a lingering reference to the frames would stop `del` from freeing anything.
del df1, df2
df.head()

Unnamed: 0,Body,Label
0,Subject: stock promo mover : cwtd\n * * * urge...,1
1,Subject: are you listed in major search engine...,1
2,"Subject: important information thu , 30 jun 20...",1
3,Subject: = ? utf - 8 ? q ? bask your life with...,1
4,"Subject: "" bidstogo "" is places to go , things...",1


### Preprocesamiento

In [12]:
# Keep only the first occurrence of each fully identical row.
df = df.loc[~df.duplicated()]

In [13]:
# Discard rows whose body is the empty string, then rows with any missing value.
df = df.loc[df['Body'].ne('')].dropna()

In [14]:
# Quitar acentos
import unicodedata
remove_accents = lambda s: s
df['Body'] = df['Body'].apply(remove_accents)

In [15]:
# Quitar caracteres especiales
import re
pattern_to_remove = r'[^a-zA-Z\s]'
remove_special_chars = lambda txt: re.sub(pattern_to_remove,'', txt)
df['Body'] = df['Body'].apply(remove_special_chars)

In [16]:
import nltk

# Download the NLTK resources one by one. The final download stays a bare
# expression so that its return value remains the cell's displayed output.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/josuevalenzuela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/josuevalenzuela/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josuevalenzuela/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/josuevalenzuela/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
stop_words = nltk.corpus.stopwords.words('english')
# Frozen copy for constant-time membership tests; `stop_words` stays a list.
_stop_word_set = frozenset(stop_words)


def normalize_document(doc):
    """Normalize one document for vectorization.

    Steps: drop every character that is not a letter or whitespace, lowercase,
    trim surrounding whitespace, tokenize with NLTK and remove English stop
    words.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so `re.I|re.A` (== 258) used to mean "replace at most
    # 258 characters" and the rest of a long document was left unfiltered.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = nltk.word_tokenize(doc)
    return ' '.join(token for token in tokens if token not in _stop_word_set)

In [18]:
# Run the normalization over every document body, element by element.
df['Body'] = df['Body'].map(normalize_document)

Guardamos el dataset unificado y preprocesado

In [19]:
# Persist the cleaned corpus. index=False keeps the row labels out of the
# file; writing them would add an unnamed first column that pandas reads back
# as "Unnamed: 0", the same artifact that had to be dropped from the inputs.
df.to_csv('datasets/output.csv', index=False)
df.head(5)

Unnamed: 0,Body,Label
0,subject stock promo mover cwtd urgent investor...,1
1,subject listed major search engines submitting...,1
2,subject important information thu jun subject ...,1
3,subject utf q bask life utf q individual incre...,1
4,subject bidstogo places go things hello privac...,1


### Representación de texto

#### Bag of N-Grams

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# N = 1: count single tokens (unigrams) per document.
bv = CountVectorizer(ngram_range=(1, 1))
bv_matrix = bv.fit_transform(raw_documents=df['Body'])

In [22]:
# Densify the count matrix (one column per vocabulary term).
bv_matrix = bv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
# The result is kept as a plain list either way.
if hasattr(bv, 'get_feature_names_out'):
    vocabulario = bv.get_feature_names_out().tolist()
else:
    vocabulario = bv.get_feature_names()

# Only the five preview rows are wrapped in a DataFrame.
pd.DataFrame(bv_matrix[:5], columns=vocabulario)



Unnamed: 0,aa,aaa,aaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacuxrmplffhxl,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacmmvzcjzld,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacuwlcuwmdlo,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabcfudhgkxt,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacuxrmplffhxlmh,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaafcvwyfklcuxrmplf,...,zzzryanairspamassassintaintorg,zzzz,zzzzasonorgmy,zzzzcchackwatchcom,zzzzjmasonorg,zzzzneilgarnercomas,zzzzspamassassintaintorg,zzzzspamassassintaintorgoldhtlheuhcclco,zzzzteana,zzzzteanayahoogroupscom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# N = 2: count pairs of consecutive tokens (bigrams) per document.
bv = CountVectorizer(ngram_range=(2, 2))
bv_matrix = bv.fit_transform(raw_documents=df['Body'])

In [24]:
# Densify the count matrix (one column per vocabulary bigram).
bv_matrix = bv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
# The result is kept as a plain list either way.
if hasattr(bv, 'get_feature_names_out'):
    vocabulario = bv.get_feature_names_out().tolist()
else:
    vocabulario = bv.get_feature_names()

# Only the five preview rows are wrapped in a DataFrame.
pd.DataFrame(bv_matrix[:5], columns=vocabulario)



Unnamed: 0,aa ab,aa adobe,aa allocations,aa also,aa amy,aa ask,aa bank,aa battery,aa bcdf,aa benifits,...,zzzzspamassassintaintorg password,zzzzspamassassintaintorg today,zzzzteana announcement,zzzzteana fwd,zzzzteana megalithomania,zzzzteana muppet,zzzzteana sent,zzzzteana van,zzzzteanayahoogroupscom sent,zzzzteanayahoogroupscom subject
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### TF-IDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# TF-IDF weighting. min_df / max_df are given as floats (document-frequency
# proportions) covering the full 0..1 range.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
# Fit on the corpus and densify the result in one step.
tv_matrix = tv.fit_transform(df['Body']).toarray()

In [27]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
# The result is kept as a plain list either way.
if hasattr(tv, 'get_feature_names_out'):
    vocabulario = tv.get_feature_names_out().tolist()
else:
    vocabulario = tv.get_feature_names()

# Round only the five preview rows; rounding the whole dense matrix would
# allocate a second full-size copy just to display its head.
pd.DataFrame(np.round(tv_matrix[:5], 2), columns=vocabulario)



Unnamed: 0,aa,aaa,aaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacuxrmplffhxl,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacmmvzcjzld,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacuwlcuwmdlo,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabcfudhgkxt,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaacuxrmplffhxlmh,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaafcvwyfklcuxrmplf,...,zzzryanairspamassassintaintorg,zzzz,zzzzasonorgmy,zzzzcchackwatchcom,zzzzjmasonorg,zzzzneilgarnercomas,zzzzspamassassintaintorg,zzzzspamassassintaintorgoldhtlheuhcclco,zzzzteana,zzzzteanayahoogroupscom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Implementación del modelo

KeyboardInterrupt: 