In [24]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
#from dataprep.eda import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Importation du jeu de données

In [25]:
#importation des données
data = pd.read_csv("/content/drive/MyDrive/Test Pratique-Novalitix AI Lab/sentimentAnalysis/IMDB Dataset.csv")

In [26]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Fonction de préparation


In [27]:
REMPLACE_SANS_ESPACE = re.compile("[;:!\'?,\"()\[\]]")
REMPLACE_AVEC_ESPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|[.]")
PUR_NOMBRE = re.compile("[0-9]")

def setClassBin(i):
  if(i == "positive"):
     return 1
  else:
    return 0


def preprocess_reviews(reviews):
    reviews = [PUR_NOMBRE.sub("", line.lower()) for line in reviews] # retire les nomre (comme les années)
    reviews = [line.replace('\n', ' ')  for line in reviews] # Retire les \n (retours chariots)
    reviews = [REMPLACE_SANS_ESPACE.sub("", line.lower()) for line in reviews]
    reviews = [REMPLACE_AVEC_ESPACE.sub(" ", line) for line in reviews]
    return reviews



def prepare_dataset(X):
    X['review'] = pd.DataFrame(preprocess_reviews(X['review']))
    english_stopwords = set(stopwords.words('english'))
    filtre_stopen =  lambda text: [token for token in text if token.lower() not in english_stopwords]
    X['review'] = [' '.join(filtre_stopen(word_tokenize(item))) for item in X['review']]
    stemmer = EnglishStemmer()
    X['review'] = [stemmer.stem(w) for w in X['review']]

    yList = [setClassBin(x) for x in X.sentiment]
    y = pd.DataFrame(yList)

    X = X.drop('sentiment', axis=1)
    return X, y

In [None]:
X, y = prepare_dataset(data.copy())

In [None]:
X.head()

In [None]:
y.head()

## Vectorisation

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)
cv.fit(X["review"])

X_onehot = cv.transform(X["review"])

In [32]:
X_onehot.shape

(50000, 119775)

## Trouve le meilleur hyperparametre c (régularisation)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Précision pour C=%s: %s" % (c, accuracy_score(y_test, lr.predict(X_test))))

  y = column_or_1d(y, warn=True)


Précision pour C=0.01: 0.88432


  y = column_or_1d(y, warn=True)


Précision pour C=0.05: 0.89216


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)


Précision pour C=0.25: 0.89136


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)


Précision pour C=0.5: 0.88984
Précision pour C=1: 0.88784


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Entraînement du modèle

In [None]:
import warnings
warnings.filterwarnings('ignore', '.*do not.*',)

In [None]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X_onehot, y)

In [36]:
print ("Précision: %s" % accuracy_score(y_test, final_model.predict(X_test)))

Précision: 0.94384


In [37]:
y_test.head()

Unnamed: 0,0
25087,0
5578,1
9527,0
48737,0
41961,1


Unnamed: 0,0
25087,0
5578,1
9527,0
48737,0
41961,1


In [40]:
final_model.predict(X_test[9527])

array([0])

## Exportation du modèle pour le déploiement


In [47]:
import pickle

In [50]:
with open("final_model.pkl", "wb") as model_file:
    pickle.dump(final_model, model_file)

with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)