In [1]:
import os
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [4]:
DATA_DIR = '/home/simplon/Documents/aclImdb'
target_names = ['neg', 'pos']
# On crée une liste tout les noms des sous dossier dans le dossier eron
subfolders = ['train','test']

#On crée une liste X qui contiendra tout les email
X = []
#On crée une liste y qui contiendra 1 si spam 0 sinon.
y = []

for subfolder in subfolders:
    
    #On crée une liste de tout les fichier contenue le dossier spam de chaque sous-dossier
    neg_files = os.listdir(os.path.join(DATA_DIR, subfolder, 'neg'))
    for neg_file in neg_files:
        with open(os.path.join(DATA_DIR, subfolder, 'neg', neg_file), encoding='ascii', errors='ignore') as f:
            X.append(f.read())
            y.append(0)
            
    pos_files = os.listdir(os.path.join(DATA_DIR, subfolder, 'pos'))
    for pos_file in pos_files:
        with open(os.path.join(DATA_DIR, subfolder, 'pos', pos_file), encoding='ascii', errors='ignore') as f:
            X.append(f.read())
            y.append(1)

In [5]:
data = pd.DataFrame()
data["text"]=X
data["y"]=y

data["y"].value_counts()

1    25000
0    25000
Name: y, dtype: int64

In [34]:
data["text"].head()

0    Andie McDowell is beautiful as the 40-ish woma...
1    I was interested in the title and description ...
2    This film was bad because there was nothing in...
3    This movie is terrible. TERRIBLE. One of the w...
4    Anyone who actually had the ability to sit thr...
Name: text, dtype: object

In [40]:
import re

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r' ',text)

# retire 
def remove_things(text):
    html=re.compile("[^a-zA-Z\s']")
    return html.sub(' ',text)

#Remove stopwords & Punctuations
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

data["text"] = data["text"].apply(lambda x:remove_html(x))
data["text"] = data["text"].apply(lambda x:remove_things(x))
data["text"] = data["text"].apply(lambda x:remove_stopwords(x))

X_train, X_test, y_train, y_test = train_test_split(data["text"], data["y"], test_size=0.20)

In [42]:
X_train[0]

"Andie McDowell beautiful ish woman whose late start serious relationship leads considerably younger man subsequenet falling long time best girldfriends Seeing gigolo gold digger sincere young man girl friends dead set terminating silly relationship go beyond call duty helping friend obviously blinded gigolo's tricky game A short succession situations absolutely ridiculous Far fetched longer covers Without unbelievable scenes may hope sweet love story Instead viewer left involuntary shaking head things happen Without giving away cliff hanger details I warn viewer high expectations film like disappointed On scale one ranks weak There much better material This one worth time"

In [45]:
text_model = Pipeline([('count_vec', CountVectorizer(stop_words=STOPWORDS,lowercase=True)),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('text_model', BernoulliNB())])

text_model.fit(X_train, y_train)
y_pred = text_model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8507


In [47]:
text_model = Pipeline([('count_vec', CountVectorizer(stop_words=STOPWORDS,lowercase=True)),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('text_model', MultinomialNB())])

text_model.fit(X_train, y_train)
y_pred = text_model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8613


In [46]:
text_model = Pipeline([('count_vec', CountVectorizer(stop_words=STOPWORDS,lowercase=True)), 
                      ('tfidf_transformer', TfidfTransformer()),
                      ('text_model', SGDClassifier(tol=None, loss='log'))])

text_model.fit(X_train, y_train)
y_pred = text_model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8808
