# Clasificación de documentos (email spam o no spam)

In [1]:
!git clone https://github.com/pachocamacho1990/datasets

fatal: la ruta de destino 'datasets' ya existe y no es un directorio vacío.


In [2]:
import nltk, random
import pandas as pd
import numpy as np
# Fetch the NLTK resources used by this notebook.
# 'punkt': tokenizer models — presumably what word_tokenize relies on.
nltk.download('punkt')
# 'averaged_perceptron_tagger': POS-tagger model; no cell visible in this
# notebook uses it — TODO confirm it is still needed.
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/oem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/oem/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Read the labelled e-mails, supplying the column names explicitly:
# 'clase' is the label column and 'contenido' the raw message text.
df = pd.read_csv(
    'datasets/email/csv/spam-apache.csv',
    names=['clase', 'contenido'],
)
# Tokenize every message once and keep the token list next to the raw text.
df['tokens'] = df['contenido'].apply(word_tokenize)
print(df.shape[0])
df.head()

250


Unnamed: 0,clase,contenido,tokens
0,-1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","[<, !, DOCTYPE, HTML, PUBLIC, ``, -//W3C//DTD,..."
1,1,> Russell Turpin:\n> > That depends on how the...,"[>, Russell, Turpin, :, >, >, That, depends, o..."
2,-1,Help wanted. We are a 14 year old fortune 500...,"[Help, wanted, ., We, are, a, 14, year, old, f..."
3,-1,Request A Free No Obligation Consultation!\nAc...,"[Request, A, Free, No, Obligation, Consultatio..."
4,1,Is there a way to look for a particular file o...,"[Is, there, a, way, to, look, for, a, particul..."


In [4]:
# Inspect the token list of the first e-mail (positional access).
df['tokens'].iloc[0]

['<',
 '!',
 'DOCTYPE',
 'HTML',
 'PUBLIC',
 '``',
 '-//W3C//DTD',
 'HTML',
 '4.0',
 'Transitional//EN',
 "''",
 '>',
 '<',
 'HTML',
 '>',
 '<',
 'HEAD',
 '>',
 '<',
 'META',
 'http-equiv=Content-Type',
 'content=',
 "''",
 'text/html',
 ';',
 'charset=iso-8859-1',
 "''",
 '>',
 '<',
 'META',
 'content=',
 "''",
 'MSHTML',
 '6.00.2600.0',
 "''",
 'name=GENERATOR',
 '>',
 '<',
 'STYLE',
 '>',
 '<',
 '/STYLE',
 '>',
 '<',
 '/HEAD',
 '>',
 '<',
 'BODY',
 'bgColor=',
 '#',
 'ffffff',
 '>',
 '<',
 'DIV',
 '>',
 '<',
 'FONT',
 'face=Arial',
 'size=2',
 '>',
 '<',
 'FONT',
 'face=',
 "''",
 'Times',
 'New',
 'Roman',
 "''",
 'size=3',
 '>',
 'Dear',
 'Friend',
 ',',
 '<',
 'BR',
 '>',
 '<',
 'BR',
 '>',
 'A',
 'recent',
 'survey',
 'by',
 'Nielsen/Netratings',
 'says',
 'that',
 '``',
 'The',
 'Internet',
 '<',
 'BR',
 '>',
 'population',
 'is',
 'rapidly',
 'approaching',
 'a',
 "'Half",
 'a',
 'Billion',
 "'",
 'people',
 '!',
 '``',
 '<',
 'BR',
 '>',
 '<',
 'BR',
 '>',
 'SO',
 'WHAT',
 'D

In [5]:
# Count every token across the whole corpus, one document at a time.
all_words = nltk.FreqDist()
for doc_tokens in df['tokens'].values:
    all_words.update(doc_tokens)

# Keep the 200 most frequent tokens as (token, count) pairs; they form the
# vocabulary used to build the features.
top_words = all_words.most_common(200)
top_words

[('.', 2200),
 (',', 2173),
 ('the', 1963),
 ('>', 1787),
 ('--', 1611),
 ('to', 1435),
 (':', 1220),
 ('and', 1064),
 ('of', 958),
 ('a', 879),
 ('you', 743),
 ('in', 742),
 ('I', 741),
 ('<', 718),
 ('!', 698),
 ('%', 677),
 ('for', 609),
 ('is', 577),
 ('#', 521),
 ('BR', 494),
 ('that', 477),
 (')', 463),
 ('it', 458),
 ("''", 434),
 ('$', 413),
 ('this', 384),
 ('(', 380),
 ('on', 378),
 ('http', 362),
 ('?', 360),
 ('your', 359),
 ('have', 351),
 ('with', 334),
 ('...', 327),
 ('``', 307),
 ('be', 299),
 ('-', 289),
 ('from', 271),
 ("'s", 263),
 ('are', 257),
 ('31', 255),
 ('or', 252),
 ('as', 251),
 ('will', 243),
 ('not', 224),
 ('30', 220),
 ('my', 206),
 ('at', 199),
 ('The', 196),
 ('has', 195),
 ('can', 194),
 ('&', 181),
 ('all', 176),
 ("n't", 175),
 ('do', 167),
 ('out', 166),
 ('but', 164),
 ('our', 160),
 ('by', 156),
 ('if', 152),
 ('was', 149),
 ('one', 129),
 ('an', 129),
 ('just', 128),
 ('@', 128),
 ('This', 125),
 ('1', 123),
 ('more', 118),
 ('You', 117),
 ('5

In [6]:
def document_features(document, top_words=top_words):
    """Build the binary bag-of-words features for one tokenized document.

    Args:
        document: iterable of tokens belonging to a single document.
        top_words: iterable of ``(word, frequency)`` pairs defining the
            vocabulary. Defaults to the module-level ``top_words`` as it
            exists when this cell is executed (the default is bound at
            definition time).

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True``/``False`` depending
        on whether ``<word>`` occurs in ``document``. Only the word of each
        pair is used; the frequency is ignored.
    """
    # A set gives constant-time membership checks for each vocabulary word.
    vocabulary = set(document)
    return {
        'contains({})'.format(token): token in vocabulary
        for token, _count in top_words
    }

In [7]:
# Sanity check: feature dictionary produced for the first e-mail.
document_features(df['tokens'].values[0])

{'contains(.)': True,
 'contains(,)': True,
 'contains(the)': True,
 'contains(>)': True,
 'contains(--)': True,
 'contains(to)': True,
 'contains(:)': True,
 'contains(and)': True,
 'contains(of)': True,
 'contains(a)': True,
 'contains(you)': True,
 'contains(in)': True,
 'contains(I)': True,
 'contains(<)': True,
 'contains(!)': True,
 'contains(%)': True,
 'contains(for)': True,
 'contains(is)': True,
 'contains(#)': True,
 'contains(BR)': True,
 'contains(that)': True,
 'contains())': True,
 'contains(it)': True,
 "contains('')": True,
 'contains($)': True,
 'contains(this)': True,
 'contains(()': True,
 'contains(on)': True,
 'contains(http)': False,
 'contains(?)': True,
 'contains(your)': True,
 'contains(have)': True,
 'contains(with)': True,
 'contains(...)': True,
 'contains(``)': True,
 'contains(be)': True,
 'contains(-)': True,
 'contains(from)': True,
 "contains('s)": True,
 'contains(are)': True,
 'contains(31)': False,
 'contains(or)': True,
 'contains(as)': True,
 'co

Lo primero que hacemos es construir el conjunto de atributos como una lista de **tuplas** `(atributos, clase)`, una por documento, a partir de los **textos** y las **clases**. De esta forma estamos recorriendo dos listas de forma simultánea.

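La función `zip` se utiliza porque se están recorriendo dos listas de forma simultánea: en cada iteración empareja el elemento i-ésimo de la lista de textos con el elemento i-ésimo de la lista de clases.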

In [8]:
# One (feature-dict, label) pair per e-mail; zip walks the token lists and the
# labels in lockstep.
fset = [(document_features(texto), clase) for texto, clase in zip(df['tokens'].values, df['clase'].values)]

# Shuffle with a dedicated, seeded generator so the train/test split (and the
# accuracies reported below) are reproducible on every run, without touching
# the global `random` state.
random.Random(42).shuffle(fset)

# 80/20 split computed from the data size (200/50 for the 250-row dataset)
# instead of a hard-coded index.
n_train = len(fset) * 4 // 5
train, test = fset[:n_train], fset[n_train:]

In [9]:
# Fit a Naive Bayes classifier on the (feature-dict, label) training pairs.
classifier = nltk.NaiveBayesClassifier.train(train)

In [10]:
# Accuracy on the data the model was trained on (an optimistic estimate).
print(nltk.classify.accuracy(classifier, train))

0.975


In [11]:
# Accuracy on the held-out examples that were not used for training.
print(nltk.classify.accuracy(classifier, test))

0.92


In [12]:
# Raw text of every e-mail labelled -1, selected in a single .loc lookup.
df.loc[df['clase'] == -1, 'contenido']

0      <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...
2      Help wanted.  We are a 14 year old fortune 500...
3      Request A Free No Obligation Consultation!\nAc...
10     >\n>“µ×è¹µÑÇ ¡ÑºâÅ¡¸ØÃ¡Ô¨º¹ÍÔ¹àµÍÃìà¹çµ” \n>àµ...
                             ...                        
243    ##############################################...
244    Wanna see sexually curious teens playing with ...
246    REQUEST FOR URGENT BUSINESS ASSISTANCE\n------...
248    Email marketing works!  There's no way around ...
249    Email marketing works!  There's no way around ...
Name: contenido, Length: 125, dtype: object

In [13]:
# Print the 5 features the trained classifier reports as most informative.
classifier.show_most_informative_features(5)

Most Informative Features
            contains(We) = True               -1 : 1      =     11.7 : 1.0
          contains(YOUR) = True               -1 : 1      =     11.0 : 1.0
        contains(Please) = True               -1 : 1      =      9.6 : 1.0
         contains(below) = True               -1 : 1      =      9.2 : 1.0
           contains(FOR) = True               -1 : 1      =      9.1 : 1.0


In [14]:
# Export this notebook as a plain Python script (3_clasificacion.py).
!jupyter nbconvert --to=python 3_clasificacion.ipynb

[NbConvertApp] Converting notebook 3_clasificacion.ipynb to python
[NbConvertApp] Writing 1874 bytes to 3_clasificacion.py
