## Importation des packages

In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords

## Lecture des données

### 1ére méthode avec manipulation des string et listes puis transformer en df

In [2]:
my_data = open('./Data/SMSSpamCollection.txt').read()

In [3]:
print(f"type des données : {type(my_data)} et taille : {len(my_data)}")


type des données : <class 'str'> et taille : 477907


In [4]:
my_data = my_data.replace("\t","\n")
my_data = my_data.split("\n")

In [5]:
# on a une liste de str qui contient: 'label','mail'
print(len(my_data))

11149


In [6]:
# séparé les labels et les mails en deux listes 
list_label = my_data[0::2]
list_mail = my_data[1::2]

In [7]:
print(len(list_mail))
print(len(list_label))

5574
5575


In [8]:
del list_label[-1]

In [9]:
print(len(list_mail))
print(len(list_label))

print(list_mail[0:5])
print(list_label[0:5])

5574
5574
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]
['ham', 'ham', 'spam', 'ham', 'ham']


In [10]:
pd.set_option('display.max_colwidth', 100)

df = pd.DataFrame(
    {
        'label' : list_label,
        'mail': list_mail
    }
)

In [11]:
df.head()

Unnamed: 0,label,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 2éme méthode avec df directement

In [12]:
df = pd.read_csv('./Data/SMSSpamCollection.txt', sep='\t', header=None)
df.columns = ['label', 'mail']
df.head()

Unnamed: 0,label,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
print("Nombre de lignes totales : {}\nNombre de mail spam : {}\nNombre de mail ham : {}".format(len(df),
                                                                                                len(df[df['label']=='spam']),
                                                                                                len(df[df['label']=='ham'])))

Nombre de lignes totales : 5572
Nombre de mail spam : 747
Nombre de mail ham : 4825


In [14]:
# vérification à faire 
print("Nombre de label vide (avec une valeur nulle) : {}".format(df['label'].isnull().sum()))
print("Nombre de mail vide (avec une valeur nulle) : {}".format(
    df['mail'].isnull().sum()))


Nombre de label vide (avec une valeur nulle) : 0
Nombre de mail vide (avec une valeur nulle) : 0


## Nettoyage de données

### 1.remove punctuation

In [15]:
def remove_punctuation(texte):
    data_clean = "".join([ch for ch in texte if ch not in string.punctuation])
    return data_clean


df['mail_without_punct'] = df['mail'].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,label,mail,mail_without_punct
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


### 2.tokenization

In [16]:
# tokenizer les mail
def tokeninze(texte):
    data_tokenized = re.split("\W+",texte)
    return data_tokenized

df['mail_tokenized'] = df['mail_without_punct'].apply(lambda x: tokeninze(x))
df.head()

Unnamed: 0,label,mail,mail_without_punct,mail_tokenized
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l..."


### 3.remove stop words

In [17]:
#nltk.download('stopwords')

In [18]:
en_stop_words = set(stopwords.words('english'))
print(sorted(list(en_stop_words)))


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

In [19]:
def remove_stop_word(texte):
    data_clean = [word for word in texte if word not in en_stop_words]
    return data_clean


df['mail_without_stops'] = df['mail_tokenized'].apply(lambda x: remove_stop_word(x))
df.head()

Unnamed: 0,label,mail,mail_without_punct,mail_tokenized,mail_without_stops
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o...","[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, goes, usf, lives, around..."


In [49]:
# Regrouppant le tout dans une seule fonction
def clean_mail(texte):
    data_without_punct = "".join([ch for ch in texte if ch not in string.punctuation])
    data_tokenized = re.split("\W+", data_without_punct)
    data_without_stops = [word for word in data_tokenized if word not in en_stop_words]
    result = data_without_stops
    return result

df['mail_clean'] = df['mail'].apply(lambda x: clean_mail(x.lower()))
df.head()

Unnamed: 0,label,mail,mail_clean,mail_stemed,mail_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, i, dont, think, goe, usf, live, around, though]","[Nah, I, dont, think, go, usf, life, around, though]"


In [33]:
res = df[['label','mail_clean']];
print(res)

     label                                         mail_clean
0      ham  [Go, jurong, point, crazy, Available, bugis, n...
1      ham                     [Ok, lar, Joking, wif, u, oni]
2     spam  [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3      ham      [U, dun, say, early, hor, U, c, already, say]
4      ham  [Nah, I, dont, think, goes, usf, lives, around...
...    ...                                                ...
5567  spam  [This, 2nd, time, tried, 2, contact, u, U, 750...
5568   ham           [Will, ü, b, going, esplanade, fr, home]
5569   ham                   [Pity, mood, Soany, suggestions]
5570   ham  [The, guy, bitching, I, acted, like, id, inter...
5571   ham                            [Rofl, Its, true, name]

[5572 rows x 2 columns]


### 4.stematisation

In [35]:
ps = nltk.PorterStemmer()

def stem_mail(words):
    result = [ps.stem(word) for word in words]
    return result

df['mail_stemmed'] = df['mail_clean'].apply(lambda x: stem_mail(x))
df.head()

Unnamed: 0,label,mail,mail_without_punct,mail_tokenized,mail_without_stops,mail_clean,mail_stemed
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o...","[Go, jurong, point, crazy, Available, bugis, n...","[Go, jurong, point, crazy, Available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, early, hor, U, c, already, say]","[U, dun, say, early, hor, U, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, goes, usf, lives, around...","[Nah, I, dont, think, goes, usf, lives, around...","[nah, i, dont, think, goe, usf, live, around, ..."


In [38]:
df = df.drop('mail_tokenized', axis=1)
df = df.drop('mail_without_stops', axis=1)
df.head()


Unnamed: 0,label,mail,mail_clean,mail_stemed
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, jurong, point, crazy, Available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, dont, think, goes, usf, lives, around...","[nah, i, dont, think, goe, usf, live, around, ..."


### 4.v2 Lemmatisation

In [47]:
# Le défaut du stemming est qu'il renvoie certains mots qui n'apparaissent pas dans le dictionnaire (perte d'informations)
nltk.download('wordnet')
nltk.download('omw-1.4')
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\etudiant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\etudiant\AppData\Roaming\nltk_data...


True

In [50]:
def lemmatise_mail(words):
    result = [wn.lemmatize(word) for word in words]
    return result

df['mail_lemmatized'] = df['mail_clean'].apply(lambda x: lemmatise_mail(x))
df.head()

Unnamed: 0,label,mail,mail_clean,mail_stemed,mail_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, i, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"
