# TP1 MLOPS : 

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv("../data/train.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,film-url,review,polarity
0,0,http://www.allocine.fr/film/fichefilm-135259/c...,Si vous cherchez du cinéma abrutissant à tous ...,0
1,1,http://www.allocine.fr/film/fichefilm-172430/c...,"Trash, re-trash et re-re-trash...! Une horreur...",0
2,2,http://www.allocine.fr/film/fichefilm-15105/cr...,"Et si, dans les 5 premières minutes du film, l...",0
3,3,http://www.allocine.fr/film/fichefilm-188629/c...,Mon dieu ! Quelle métaphore filée ! Je suis ab...,0
4,4,http://www.allocine.fr/film/fichefilm-23514/cr...,"Premier film de la saga Kozure Okami, ""Le Sabr...",1
...,...,...,...,...
159995,159995,http://www.allocine.fr/film/fichefilm-132387/c...,Un rythme bien trop lent et un Ashton Kutcher ...,0
159996,159996,http://www.allocine.fr/film/fichefilm-53313/cr...,Monsieur Duchovny vous êtes aussi piètre acteu...,0
159997,159997,http://www.allocine.fr/film/fichefilm-248258/c...,Complètement différent des films de la série C...,1
159998,159998,http://www.allocine.fr/film/fichefilm-268731/c...,Alors franchement pour le moment c'est le meil...,1


In [4]:
df.isna().sum()

Unnamed: 0    0
film-url      0
review        0
polarity      0
dtype: int64

In [5]:
# drop the table names Unnamed: 0
df = df.drop(columns = ["Unnamed: 0"])

In [6]:
df.head()

Unnamed: 0,film-url,review,polarity
0,http://www.allocine.fr/film/fichefilm-135259/c...,Si vous cherchez du cinéma abrutissant à tous ...,0
1,http://www.allocine.fr/film/fichefilm-172430/c...,"Trash, re-trash et re-re-trash...! Une horreur...",0
2,http://www.allocine.fr/film/fichefilm-15105/cr...,"Et si, dans les 5 premières minutes du film, l...",0
3,http://www.allocine.fr/film/fichefilm-188629/c...,Mon dieu ! Quelle métaphore filée ! Je suis ab...,0
4,http://www.allocine.fr/film/fichefilm-23514/cr...,"Premier film de la saga Kozure Okami, ""Le Sabr...",1


In [7]:
df.shape

(160000, 3)

# Preprocessing : 

In [8]:
# use sklearn feature extraction , preprocess the data 
# use the tf idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as stopwords
stopwords = list(stopwords)

# apply it in the df  
tfidf = TfidfVectorizer(stop_words = stopwords) 

X = tfidf.fit_transform(df["review"])








# Conception : 

In [9]:
# sklearn logistic regression

from sklearn.linear_model import LogisticRegression
y = df["polarity"]


model_log = LogisticRegression(max_iter = 2000)
model_log.fit(X,y)


In [10]:
df_test = pd.read_csv("../data/test.csv")

In [11]:
#load the tfidf vectorizer dumped before , and use it to transform the test data into tfidf matrix that has the same shape as the train data
X_test = tfidf.transform(df_test["review"])
X_test.shape

(20000, 152189)

In [12]:
y_test = df_test["polarity"]


In [13]:
model_log.score(X_test,y_test)

0.9223

In [14]:
# precsion , recall , accuracy  of the model 
from sklearn.metrics import precision_score , recall_score , accuracy_score

In [15]:
y_pred = model_log.predict(X_test)

print( "precision : ",precision_score(y_test,y_pred)) 
print( "recall : ",recall_score(y_test,y_pred))
print( "accuracy : ",accuracy_score(y_test,y_pred))

precision :  0.9097675367047309
recall :  0.9302543786488741
accuracy :  0.9223


**precision is the ratio tp / (tp + fp) which means, the ration of the real positive between of those who were flagged as positive.**

**recall is the ratio tp / (tp + fn) ratio of the real positives that we flagged as positive.**

**accuracy is the fraction of predictions our model got right.**

In [16]:
# create pipeline the gets the data , apply the tfidf of in 
# the data , and then apply the logistic regression model
from sklearn.pipeline import Pipeline

pipe = Pipeline([ ("tfidf",TfidfVectorizer(stop_words = stopwords)),("log",LogisticRegression(max_iter = 2000))])

pipe.fit(df["review"],df["polarity"])

y_pred = pipe.predict(df_test["review"])

print( "precision : ",precision_score(y_test,y_pred))
print( "recall : ",recall_score(y_test,y_pred))
print( "accuracy : ",accuracy_score(y_test,y_pred))




precision :  0.9097675367047309
recall :  0.9302543786488741
accuracy :  0.9223


In [17]:
# do the same with different C and penalty values of logistic regression
# C = 2 , penalty = l2
model_log2 = LogisticRegression(max_iter = 2000,C = 2, penalty = "l2")
model_log2.fit(X,y)
y_pred = model_log2.predict(X_test)
print( "precision : ",precision_score(y_test,y_pred))
print( "recall : ",recall_score(y_test,y_pred))
print( "accuracy : ",accuracy_score(y_test,y_pred))


precision :  0.9110294867870625
recall :  0.9308798999165971
accuracy :  0.92325


**A little augmentation of the scores, we can see some augmentation in all of them.**

# same using svc
from sklearn.svm import SVC
model_svc = SVC()
model_svc.fit(X,y)
y_pred = model_svc.predict(X_test)
print( "precision : ",precision_score(y_test,y_pred))
print( "recall : ",recall_score(y_test,y_pred))
print( "accuracy : ",accuracy_score(y_test,y_pred))

# same using mlp classifier , use many architectures and hyperparameters 
from sklearn.neural_network import MLPClassifier
model_mlp = MLPClassifier()
model_mlp.fit(X,y)
y_pred = model_mlp.predict(X_test)
print( "precision : ",precision_score(y_test,y_pred))
print( "recall : ",recall_score(y_test,y_pred))
print( "accuracy : ",accuracy_score(y_test,y_pred))


# use different architechture and hyperparameters
model_mlp2 = MLPClassifier(hidden_layer_sizes = (100,100,100),max_iter = 1000)
model_mlp2.fit(X,y)
y_pred = model_mlp2.predict(X_test)
print( "precision : ",precision_score(y_test,y_pred))
print( "recall : ",recall_score(y_test,y_pred))
print( "accuracy : ",accuracy_score(y_test,y_pred))

# another architecture
model_mlp3 = MLPClassifier(hidden_layer_sizes = (100,100,100,100),max_iter = 1000)
model_mlp3.fit(X,y)
y_pred = model_mlp3.predict(X_test)
print( "precision : ",precision_score(y_test,y_pred))
print( "recall : ",recall_score(y_test,y_pred))
print( "accuracy : ",accuracy_score(y_test,y_pred))

# Optimization : 


In [18]:
df_valid = pd.read_csv("../data/valid.csv")
X_valid = tfidf.transform(df_valid["review"])
y_valid = df_valid["polarity"]


In [None]:
#import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
# use hyperopt to find the best hyperparameters for logistic regression

# space of hyperparameters
# use solver : 'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga' in the hyperopt space by using hp.pchoice
space = {
    'C': hp.uniform('C', 0.1, 10),
    'penalty': hp.choice('penalty', ['l2']),
    'solver': hp.pchoice('solver',[(1/6,'lbfgs'),(1/6,'liblinear'),(1/6,'newton-cg'),(1/6,'newton-cholesky'),(1/6, 'sag'),(1/6, 'saga')] ) 
}




# define the objective function
def objective(params):
    params = {'C': params['C'], 'penalty': params['penalty'] , 'solver' : params['solver']}
    model = LogisticRegression(**params,max_iter = 100 )
    model.fit(X , y )
    y_pred = model.predict(X_valid)
    score = accuracy_score(y_valid,y_pred)
    loss = 1 - score
    return loss

# start the trials to find the best hyperparameters
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20,
            trials=trials)

# print the best hyperparameters
print(best)

 30%|███       | 6/20 [00:19<00:47,  3.41s/trial, best loss: 0.08040000000000003]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 50%|█████     | 10/20 [00:32<00:32,  3.22s/trial, best loss: 0.08040000000000003]