In [2]:
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Phase 0 : exploring the data

In [3]:
df=pd.read_csv("/Users/simone/Downloads/spam.csv",encoding = "ISO-8859-1")

In [4]:
# We drop the redundent looking columns
unuseful = ["Unnamed: 2","Unnamed: 3","Unnamed: 4"]
df = df.drop(df[unuseful], axis=1)

# We rename the columns in order to make them more understandable
df.rename(columns = {"v1":"Target", "v2":"Text"}, inplace = True)
df.head()

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Phase 1: Data Preprocessing

In order to further process the data, we need to make the data cleaner.

In the first step we extract only the alphabetic characters, so we remove punctuation and numbers. Then we convert all the characters into lowercase.

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/simone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# pre-process a text : clean, tokenize and stem each word in text
def pre_processing(text):
    lemmatizer=WordNetLemmatizer()
    # removing punctuation, lowercase the text, removing stopwords, map punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    p_text = text.translate(translator).lower()
    ppt = ""
    for word in p_text.split():
        if word not in stopwords.words('english'):
            ppt += word + " "
    text = ppt.strip(" ")
    token_words = word_tokenize(text)
    lem_sentence = []
    for word in token_words:
        lem_sentence.append(lemmatizer.lemmatize(word, pos ='v'))
    return ' '.join(lem_sentence)

df["Pre_processed_text"] = df["Text"].apply(pre_processing)

In [6]:
#it creates a list of all the words
bag_words = set()
for sms in df["Pre_processed_text"]:
    #print(sms)
    for w in sms.split(" "):
        if w != "":
            bag_words = bag_words.union({w})
bag = list(bag_words)

# Phase 2: extracting the features

In [7]:
#it returns a list of words for each sms
def split_words(text, bag_words):
    return text.split(" ")

In [8]:
df["Words"] = df["Pre_processed_text"].apply(split_words, args = (bag_words,))

In [9]:
bag_len = len(bag)
def vectorize_sms(words):
    vector = np.zeros(bag_len,dtype="int64")
    for i in range(bag_len):
        if bag[i] in words:
            vector[i] += 1
    return vector
df["Vector"] = df["Words"].apply(vectorize_sms)
df 

Unnamed: 0,Target,Text,Pre_processed_text,Words,Vector
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, n...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,"[ok, lar, joke, wif, u, oni]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,"[u, dun, say, early, hor, u, c, already, say]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf live around though,"[nah, think, go, usf, live, around, though]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time try 2 contact u u å£750 pound prize 2...,"[2nd, time, try, 2, contact, u, u, å£750, poun...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5568,ham,Will Ì_ b going to esplanade fr home?,ì b go esplanade fr home,"[ì, b, go, esplanade, fr, home]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestions,"[pity, mood, suggestions]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy something else...,"[guy, bitch, act, like, interest, buy, somethi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
X=np.zeros((len(df),bag_len),dtype="int64")
for i in range(len(df)):
    X[i]+=df.iloc[i,4]
pd.DataFrame(X,columns=bag)

Unnamed: 0,holiday,p,freedom,academic,stereo,bathroom,harri,describe,care,cdgt,...,clas,custcare,arent,buck,sayin,november,callon,ö´ó,truble,costå£3
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
y=np.zeros(len(df),dtype="int64")
for i in range(len(df["Target"])):
    if df.iloc[i,0]=="ham":
        y[i]=0
    else:
        y[i]=1

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape

((4457, 7644), (4457,))

In [16]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
N_yi = np.zeros((2, bag_len)) # feature count
N_y = np.zeros((2)) # total count 
for i in range(len(y_train)):
    # Compute N_y counting the features for each specific class
    N_y[y_train[i]] += np.sum(X_train[i])
    # Compute N_yi adding counting the specific words in each class
    N_yi[y_train[i]] += (X_train[i])
print(N_yi)

[[13.  6.  2. ...  0.  1.  0.]
 [27.  3.  0. ...  0.  0.  1.]]


# Prior 

In [14]:
P = np.zeros(2)
classes=np.unique(y_train)
# Implement Prior Probability P(A)
for j in classes:
    P[j] = np.count_nonzero(y_train == j)/(len(y_train))
print(P)

[0.8660534 0.1339466]


# Likelihood

In [54]:
import math

In [87]:
likelihood_matrix_words=np.zeros((2,bag_len))
for i in range(bag_len):
    for j in range(2):
        likelihood_matrix_words[j][i]=float((N_yi[j][i] + 1)/(N_y[j] + bag_len))
likelihood_matrix_words=pd.DataFrame(likelihood_matrix_words,columns=bag)

In [88]:
likelihood_matrix_words

Unnamed: 0,holiday,p,freedom,academic,stereo,bathroom,harri,describe,care,cdgt,...,clas,custcare,arent,buck,sayin,november,callon,ö´ó,truble,costå£3
0,0.000375,0.000187,8e-05,5.4e-05,5.4e-05,5.4e-05,5.4e-05,5.4e-05,0.001258,2.7e-05,...,5.4e-05,2.7e-05,5.4e-05,0.000161,5.4e-05,2.7e-05,5.4e-05,2.7e-05,5.4e-05,2.7e-05
1,0.001561,0.000223,5.6e-05,5.6e-05,5.6e-05,5.6e-05,5.6e-05,5.6e-05,0.000334,0.000167,...,5.6e-05,0.000502,5.6e-05,5.6e-05,5.6e-05,0.000111,5.6e-05,5.6e-05,5.6e-05,0.000111


In [96]:
likelihood_matrix=np.zeros((len(X_train),2))
for i in range(len(X_train)):
    tmp_spam=[]
    tmp_ham=[]
    sms=X_train[i] #list of words
    for index in range(len(sms)): #for each word in sms
        if sms[index]!=0:
            weight_ham,weight_spam=likelihood_matrix_words.iloc[:,index]
            tmp_spam.append(math.log(weight_spam))
            tmp_ham.append(math.log(weight_ham))
    likelihood_matrix[i][0]=float(np.sum(tmp_ham))
    likelihood_matrix[i][1]=float(np.sum(tmp_spam))

In [97]:
likelihood_matrix=pd.DataFrame(likelihood_matrix,columns=["ham","spam"])

In [98]:
likelihood_matrix

Unnamed: 0,ham,spam
0,-54.434155,-62.207409
1,-137.468846,-123.164313
2,-47.935712,-50.348928
3,-46.803153,-56.129671
4,-180.748513,-142.225845
...,...,...
4452,-53.888084,-66.716916
4453,-16.839779,-22.429726
4454,-50.434432,-57.382434
4455,-29.822265,-35.846948


In [99]:
likelihood_matrix["ham"] = likelihood_matrix["ham"]+math.log(P[0])

In [100]:
likelihood_matrix["spam"] = likelihood_matrix["spam"]+math.log(P[1])

In [101]:
likelihood_matrix.iloc[0:10,:]

Unnamed: 0,ham,spam
0,-54.577964,-64.217723
1,-137.612654,-125.174627
2,-48.07952,-52.359242
3,-46.946962,-58.139986
4,-180.892322,-144.236159
5,-53.85011,-62.8596
6,-35.791139,-35.178199
7,-49.437304,-59.392749
8,-11.986493,-18.267686
9,-47.172168,-54.710617



ham
ham
spam
ham
ham
spam
ham
ham
spam
spam
ham
spam
spam
ham
ham
spam

## Non funziona nulla!!

Letizia propone di usare il pacchetto, ma io sono stronzo e non voglio

Abbiamo provato anche come diceva Franceseca, ci metteva troppo e sembrava non funzionare

Se avete idee, ditelo che qua siamo disperati