# Lesueur Philippe

## Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

## Data loading

In [None]:
# Read the review dataset from the CSV file located in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="train_master.csv")

## Data preparation

In [None]:
# First split: keep 60% of the rows for training and hold out 40% (data_tv / y_tv),
# which is divided into validation and test sets afterwards.
# The label is the "Target" column; "ID" and "product" are dropped from the features.
# random_state is pinned so that Restart & Run All reproduces the same partition
# (and therefore the same scores and plots) on every run.
data_train, data_tv, y_train, y_tv = train_test_split(
    data.drop(columns=["Target", "ID", "product"]),
    data.Target,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Second split: cut the held-out rows in half, one half for validation and one for test.
# random_state is pinned so the validation/test membership is identical on every run.
data_validation, data_test, y_validation, y_test = train_test_split(
    data_tv,
    y_tv,
    test_size=0.5,
    random_state=42,
)

In [None]:
# The model input for each split is its "review_content" column only.
X_train, X_validation, X_test = (
    split_frame["review_content"]
    for split_frame in (data_train, data_validation, data_test)
)

## Classifier preparation

In [None]:
# Text classification pipeline, applied in order:
#   vect  - CountVectorizer turns each review into token counts
#   tfidf - TfidfTransformer re-weights those counts
#   clf   - MultinomialNB classifies the weighted vectors
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)
# Fit on the training split only; validation and test data stay unseen.
text_clf.fit(X_train, y_train)

## Scoring

In [None]:
# Score the fitted pipeline on each split, in train / validation / test order.
# (For a scikit-learn classifier, score() is expected to be the mean accuracy.)
train_score, validation_score, test_score = (
    text_clf.score(features, labels)
    for features, labels in (
        (X_train, y_train),
        (X_validation, y_validation),
        (X_test, y_test),
    )
)

## Result visualisation

In [None]:
def show_split_results(clf, split_name, score, X, y):
    """Print the score of one data split and draw its ROC curve and confusion matrix.

    Parameters
    ----------
    clf : fitted estimator passed to the scikit-learn plotting helpers.
    split_name : str
        Label used in the printed line and in both figure titles
        (e.g. "Train", "Validation", "Test").
    score : value printed as the score of this split (computed by the caller).
    X, y : features and labels of the split to plot.
    """
    print("{} score : {}".format(split_name, score))

    # NOTE(review): plot_roc_curve / plot_confusion_matrix were deprecated in
    # scikit-learn 1.0 and removed in 1.2. On newer versions, switch to
    # RocCurveDisplay.from_estimator / ConfusionMatrixDisplay.from_estimator
    # and update the import cell accordingly.
    plot_roc_curve(clf, X, y)
    # Diagonal from (0, 0) to (1, 1) as a visual reference line on the ROC plot.
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.title("{} ROC curve".format(split_name))
    plt.show()

    plot_confusion_matrix(clf, X, y)
    plt.title("{} confusion matrix".format(split_name))
    plt.show()


# One (name, score, features, labels) entry per split, reported in this order.
split_results = [
    ("Train", train_score, X_train, y_train),
    ("Validation", validation_score, X_validation, y_validation),
    ("Test", test_score, X_test, y_test),
]

for position, (split_name, split_score, X_split, y_split) in enumerate(split_results):
    if position:
        # Blank lines between consecutive reports (none before the first one).
        print("\n\n")
    show_split_results(text_clf, split_name, split_score, X_split, y_split)