<a href="https://colab.research.google.com/github/leirbag95/AlphaCalculator/blob/master/ProjetML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Pré-requis**

*   Exécuter toutes les cellules de préparation des données (data preprocessing) **dans l'ordre** avant de lancer un modèle
*   Pour tester un modèle, lui envoyer tout le dataset d'évaluation, sans certaines colonnes (cibles, texte brut et identifiant)
*   Si l'import d'une bibliothèque ne fonctionne pas, exécuter la ligne d'installation située juste au-dessus de cet import



**Liste de tous les imports nécessaires**

In [0]:
# libraries
import math
import pandas as pd
import numpy as np
import nltk
from nltk.tag import UnigramTagger
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import regexp_tokenize
!pip3 install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import re
from nltk.stem import PorterStemmer
!pip3 install textblob
from textblob import TextBlob
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


**Chargement du dataset**

Chargement des datasets d'entraînement et d'évaluation depuis leurs URL.
Le résultat attendu après exécution de la cellule est :

`Dataframe crée `

In [0]:
# Download the training and evaluation review sets (semicolon-separated CSV).
# NOTE(review): on failure the error is only printed, so `global_df` / `eval_data`
# may be left undefined and the following cells will fail.
try: 
  global_df = pd.read_csv("http://christophe-rodrigues.fr/hotel_reviews.csv", sep=';')
  eval_data = pd.read_csv("http://christophe-rodrigues.fr/eval_reviews.csv", sep=';')
  print("Dataframe crée")
except Exception as e:
  print("Une erreur a eu lieu lors du chargement du dataset")
  print(e)


In [0]:
# Give the raw CSV columns the short names used by the rest of the notebook.
column_names = {'Unnamed: 0':'id','reviews.rating':'rate','reviews.text':'review'}
global_df = global_df.rename(columns=column_names)
# The evaluation frame has to be renamed from itself: renaming `global_df` here
# would silently replace the evaluation data with a copy of the training data.
# (Assumes both CSV files share the same column layout.)
eval_data = eval_data.rename(columns=column_names)
global_df.shape

In [0]:
# Preview the first rows of the renamed training frame.
global_df.head()

# Data Preprocessing

In [0]:
# Work on independent copies so the renamed source frames stay untouched.
df0, df_eval = global_df.copy(), eval_data.copy()

### **Data Cleaning**

In [0]:
# Reserve the two derived columns in both frames; the cleaning step fills them in.
for frame in (df0, df_eval):
  frame.insert(3, 'review.cleaned', "")
  frame.insert(2, 'is_good_review', 0)

In [0]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """Map a POS tag string to the matching WordNet POS constant.

    Only the first letter is inspected: J -> adjective, V -> verb, N -> noun,
    R -> adverb. Any other tag falls back to noun.
    """
    by_first_letter = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_first_letter.get(pos_tag[:1], wordnet.NOUN)
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')

def clean_text(text):
    """Normalise a raw review into a space-joined string of lemmas.

    Steps: lower-case, split on single spaces, strip surrounding punctuation,
    drop tokens containing digits, English stop words and empty tokens,
    POS-tag, lemmatize with the WordNet POS from `get_wordnet_pos`, and
    finally drop one-letter lemmas.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review, lemmas separated by single spaces.
    """
    # Built once per call: set membership is O(1), and a single lemmatizer
    # serves every token instead of constructing one per word.
    stop = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # lower text, tokenize on spaces and remove surrounding punctuation
    tokens = [word.strip(string.punctuation) for word in text.lower().split(" ")]
    # remove empty tokens, stop words and words that contain digits
    tokens = [
        t for t in tokens
        if t and t not in stop and not any(c.isdigit() for c in t)
    ]
    # pos tag, then lemmatize each token with its WordNet POS
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tag(tokens)]
    # remove words with only one letter and join all
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

# derive the binary is_good_review target from a rating
def is_good_review(rate):
  """Return 1 when `rate` is strictly above 2.5, otherwise 0."""
  return 1 if rate > 2.5 else 0
# clean the review text and derive the binary target, each frame from its own columns
df0["review.cleaned"] = df0["review"].apply(clean_text)
df0["is_good_review"] = df0["rate"].apply(is_good_review)
# The evaluation frame must read its own `review` / `rate` columns: reading
# `df0` here would copy the training text and labels into it (aligned on index).
df_eval["review.cleaned"] = df_eval["review"].apply(clean_text)
df_eval["is_good_review"] = df_eval["rate"].apply(is_good_review)

In [0]:
# Shuffle both frames. `sample` already returns a new frame, and the fixed seed
# (42, as used for the train/test splits below) keeps the row order reproducible
# across kernel restarts.
dfshuf0 = df0.sample(frac=1, random_state=42)
dfshuf_eval = df_eval.sample(frac=1, random_state=42)

In [0]:
# Remove every review whose whole text is the placeholder "MoreMore" from both frames.
placeholder_rows = dfshuf0.index[dfshuf0["review"] == "MoreMore"]
dfshuf0 = dfshuf0.drop(index=placeholder_rows)
placeholder_rows = dfshuf_eval.index[dfshuf_eval["review"] == "MoreMore"]
dfshuf_eval = dfshuf_eval.drop(index=placeholder_rows)


In [0]:
# Preview the shuffled evaluation frame once the "MoreMore" rows are removed.
dfshuf_eval.head()

### **Feature engineering**

In [0]:
# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()


def _with_sentiment_scores(frame):
  """Return `frame` extended with one column per key of `sid.polarity_scores`."""
  scores = frame["review.cleaned"].apply(sid.polarity_scores).apply(pd.Series)
  return pd.concat([frame, scores], axis=1)


dfshuf0 = _with_sentiment_scores(dfshuf0)
dfshuf_eval = _with_sentiment_scores(dfshuf_eval)

In [0]:
# Add, for every row of both frames, the length of the raw review in characters
# and in space-separated words.
for frame in (dfshuf0, dfshuf_eval):
  frame["nb_chars"] = frame["review"].apply(len)
  frame["nb_words"] = frame["review"].apply(lambda review: len(review.split(" ")))

In [0]:
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(dfshuf0["review.cleaned"].apply(lambda x: x.split(" ")))]

# Train ONE Doc2Vec model, on the training reviews only.
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
# The evaluation reviews are embedded with that same model: vectors inferred by a
# second, independently trained model would live in a different vector space and
# would not be comparable with the features the classifiers are trained on.
# The name is kept so that any code referring to `model_eval` still works.
model_eval = model


def _doc2vec_columns(frame, d2v_model):
  """Infer one vector per cleaned review and return them as `doc2vec_vector_<k>` columns."""
  vectors = frame["review.cleaned"].apply(lambda x: d2v_model.infer_vector(x.split(" "))).apply(pd.Series)
  vectors.columns = ["doc2vec_vector_" + str(x) for x in vectors.columns]
  return vectors


# transform each document into a vector data
doc2vec_df = _doc2vec_columns(dfshuf0, model)
doc2vec_df_eval = _doc2vec_columns(dfshuf_eval, model)
dfshuf0 = pd.concat([dfshuf0, doc2vec_df], axis=1)
dfshuf_eval = pd.concat([dfshuf_eval, doc2vec_df_eval], axis=1)

In [0]:
# add tf-idf columns
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vocabulary and the idf weights on the training reviews only.
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(dfshuf0["review.cleaned"]).toarray()
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; support both.
if hasattr(tfidf, "get_feature_names_out"):
  vocabulary = tfidf.get_feature_names_out()
else:
  vocabulary = tfidf.get_feature_names()
word_columns = ["word_" + str(x) for x in vocabulary]
tfidf_df = pd.DataFrame(tfidf_result, columns = word_columns, index = dfshuf0.index)
dfshuf0 = pd.concat([dfshuf0, tfidf_df], axis=1)

# Only TRANSFORM the evaluation reviews: refitting here would build a different
# vocabulary, so the evaluation frame would not have the same `word_*` columns
# as the frame the models are trained on.
tfidf_result = tfidf.transform(dfshuf_eval["review.cleaned"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = word_columns, index = dfshuf_eval.index)
dfshuf_eval = pd.concat([dfshuf_eval, tfidf_df], axis=1)

In [0]:
# List every column of the training frame after feature engineering.
dfshuf0.columns

### **Data exploration**

In [0]:
# Show the share of each rating in the training frame

balanced0 = dfshuf0["rate"].value_counts(normalize = True)
balanced0.plot.bar(title='Count (target)');

In [0]:
# Balance the rating distribution of both frames
def _downsample_to_reference(frame, reference = 2, ratings = (4, 3, 5), seed = 42):
  """Randomly drop rows so each rating in `ratings` has as many rows as `reference`.

  Parameters
  ----------
  frame : pandas.DataFrame
      Frame with a `rate` column.
  reference : rating whose row count is the target size.
  ratings : ratings to shrink; a rating already at or below the target is left
      untouched (asking `sample` for a negative number of rows would raise).
  seed : random_state passed to `sample`, for reproducible results.

  Returns
  -------
  pandas.DataFrame
      A new frame; the input frame is not modified.

  Raises
  ------
  KeyError
      If no row has the `reference` rating.
  """
  counts = frame["rate"].value_counts()
  # Integer arithmetic: no float round-trip through a ratio, so the target is exact.
  target = int(counts[reference])
  for rating in ratings:
    surplus = int(counts.get(rating, 0)) - target
    if surplus <= 0:
      continue
    rows = frame[frame["rate"] == rating].sample(n = surplus, random_state = seed)
    frame = frame.drop(rows.index, axis=0)
  return frame


dfshuf_eval = _downsample_to_reference(dfshuf_eval)
dfshuf0 = _downsample_to_reference(dfshuf0)

In [0]:
# Show the share of each rating in the training frame at this point of the notebook

balanced1 = dfshuf0["rate"].value_counts(normalize = True)
balanced1.plot.bar(title='Count (target)');

In [0]:
# wordcloud function

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(data, title = None):
    """Draw a word cloud of `data` with matplotlib.

    Parameters
    ----------
    data : str, pandas.Series, list or tuple (anything else is passed through `str`)
        Text to display. A Series / list / tuple is joined item by item:
        `str(series)` would only give the truncated pandas repr (a few rows
        plus index values, `Name:` and `dtype:`), not the actual text.
    title : str, optional
        Figure title; nothing is written when omitted.
    """
    if isinstance(data, str):
        text = data
    elif isinstance(data, (pd.Series, list, tuple)):
        text = " ".join(str(item) for item in data)
    else:
        text = str(data)

    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40, 
        scale = 3,
        random_state = 42
    ).generate(text)

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# render the word cloud for the cleaned review column of the training frame
show_wordcloud(dfshuf0["review.cleaned"])

In [0]:
# Most positive reviews, restricted to reviews of at least 5 words
# (very short reviews otherwise dominate the extreme scores)
dfshuf0[dfshuf0["nb_words"] >= 5].sort_values("pos", ascending = False)[["rate","review", "pos","nb_words"]].head(10)

In [0]:
# Most negative reviews, restricted to reviews of at least 5 words
dfshuf0[dfshuf0["nb_words"] >= 5].sort_values("neg", ascending = False)[["rate","review", "neg", "nb_words"]].head(10)

In [0]:
# Most neutral reviews (highest `neu` score), restricted to reviews of at least 5 words
dfshuf0[dfshuf0["nb_words"] >= 5].sort_values("neu", ascending = False)[["rate","review", "neu", "nb_words"]].head(10)

In [0]:
# Most neutral reviews among those rated 3, restricted to reviews of at least 5 words
dfshuf0[(dfshuf0["rate"] == 3) & (dfshuf0["nb_words"] >= 5)].sort_values("neu", ascending = False)[["rate","review", "neu", "nb_words"]].head(10)

In [0]:
# plot the distribution of the compound sentiment score for each rating

import seaborn as sns

for x in [1,2,3,4,5]:
    subset = dfshuf0[dfshuf0['rate'] == x]
    
    # Draw the density plot; kdeplot is the replacement for the deprecated
    # distplot(..., hist=False)
    label = str(x)
    sns.kdeplot(subset['compound'], label = label)
# recent seaborn versions do not add the legend automatically
plt.legend(title='rate');

In [0]:
# plot the distribution of the compound sentiment score for bad (0) and good (1) reviews

import seaborn as sns

for x in [0,1]:
    subset = dfshuf0[dfshuf0['is_good_review'] == x]
    
    # Draw the density plot; kdeplot is the replacement for the deprecated
    # distplot(..., hist=False)
    label = str(x)
    sns.kdeplot(subset['compound'], label = label)
# recent seaborn versions do not add the legend automatically
plt.legend(title='is_good_review');

# Modeling `is_good_review`




## Prediction based on `compound`

In [0]:
"""
The label is predicted from the sentiment analysis alone.
The compound score lies in [-1, 1]; it is rescaled with 2.5 * compound + 2.5 and compared with 2.5.
"""
def predict_classifier_rate(compound): 
  """Return 1 (good review) or 0 (bad review) from a sentiment compound score.

  The score is rescaled with 2.5 * compound + 2.5; the review counts as good
  when the rescaled value is strictly above 2.5.
  """
  rescaled = 2.5 * compound + 2.5
  return 1 if rescaled > 2.5 else 0

def get_score_classifier(data):
  """Accuracy of the compound-based good/bad prediction on `data`.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a `compound` column and an `is_good_review` column.

  Returns
  -------
  float
      Fraction of rows where `predict_classifier_rate(compound)` equals
      `is_good_review`; 0.0 for an empty frame.
  """
  total = data.shape[0]
  if total == 0:
    return 0.0
  # Walk the raw values by position: the frames are shuffled and filtered, so
  # `data[col][i]` (a LABEL lookup) would raise KeyError or read the wrong row.
  pairs = zip(data['compound'].tolist(), data['is_good_review'].tolist())
  hits = sum(1 for compound, truth in pairs if predict_classifier_rate(compound) == truth)
  return hits / total


# FIXME: index problem
print(get_score_classifier(dfshuf_eval))

## Logistic Regression

In [0]:
def logistic_regression(df_eval):
  """Fit a logistic regression on `dfshuf0` and predict `is_good_review` for `df_eval`.

  The model is fitted on the 80% training part of a fixed (random_state=42)
  split of the notebook-level frame `dfshuf0`.

  Parameters
  ----------
  df_eval : pandas.DataFrame (or array-like already laid out like the training features)
      Feature rows to predict.

  Returns
  -------
  numpy.ndarray
      One predicted 0/1 label per row of `df_eval`.
  """
  from sklearn.linear_model import LogisticRegression
  from sklearn.model_selection import train_test_split
  label = "is_good_review"
  ignore_cols = [label,"rate", "review", "review.cleaned", "id"]
  features = [c for c in dfshuf0.columns if c not in ignore_cols]
  # only the training part of the split is used here
  X_train, _, y_train, _ = train_test_split(dfshuf0[features], dfshuf0[label], test_size = 0.20, random_state = 42)
  # train a logistic regression classifier
  lr = LogisticRegression()
  lr.fit(X_train, y_train)
  # Present the evaluation features exactly like the training ones (same columns,
  # same order); a feature absent from the evaluation frame is filled with 0.
  if isinstance(df_eval, pd.DataFrame):
    df_eval = df_eval.reindex(columns=features, fill_value=0)
  return lr.predict(df_eval)

### Résultats

In [0]:
from sklearn.metrics import mean_squared_error

# Evaluation inputs: every column except the targets, the raw/cleaned text and the id.
label = "is_good_review"
ignore_cols = [label,"rate", "review", "review.cleaned","id"]
features = [col for col in dfshuf_eval.columns if col not in ignore_cols]
X_dfshuf_eval, y_dfshuf_eval = dfshuf_eval[features], dfshuf_eval[label]
eval_predicted = logistic_regression(X_dfshuf_eval)
# mean squared difference between the true labels and the predicted ones
print(mean_squared_error(y_dfshuf_eval, eval_predicted))

In [0]:
# ROC curve

from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

# NOTE(review): eval_predicted comes from logistic_regression(), which returns
# `lr.predict(...)`, i.e. class labels rather than scores — consider feeding
# predicted probabilities to roc_curve instead (to be confirmed).
y_pred = eval_predicted
fpr, tpr, thresholds = roc_curve(y_dfshuf_eval, y_pred, pos_label = 1)

# area under the curve, displayed in the legend
roc_auc = auc(fpr, tpr)

plt.figure(1, figsize = (15, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# dashed diagonal used as a visual reference
plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

## RandomForest Classifier

In [0]:
def random_forest(df_eval):
  """Fit a random forest on `dfshuf0` and predict `is_good_review` for `df_eval`.

  The model is fitted on the 80% training part of a fixed (random_state=42)
  split of the notebook-level frame `dfshuf0`. The fitted forest is also
  published as the notebook-level variable `rf`, which the feature-importance
  cell reads.

  Parameters
  ----------
  df_eval : pandas.DataFrame (or array-like already laid out like the training features)
      Feature rows to predict.

  Returns
  -------
  numpy.ndarray
      One predicted 0/1 label per row of `df_eval`.
  """
  # `rf` must outlive the call: the feature-importance cell uses rf.feature_importances_
  global rf
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.model_selection import train_test_split
  label = "is_good_review"
  ignore_cols = [label,"rate", "review", "review.cleaned", "id"]
  features = [c for c in dfshuf0.columns if c not in ignore_cols]
  # only the training part of the split is used here
  X_train, _, y_train, _ = train_test_split(dfshuf0[features], dfshuf0[label], test_size = 0.20, random_state = 42)
  # train a random forest classifier
  rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
  rf.fit(X_train, y_train)
  # Present the evaluation features exactly like the training ones (same columns,
  # same order); a feature absent from the evaluation frame is filled with 0.
  if isinstance(df_eval, pd.DataFrame):
    df_eval = df_eval.reindex(columns=features, fill_value=0)
  return rf.predict(df_eval)


### Résultats

In [0]:
from sklearn.metrics import mean_squared_error

# Evaluation inputs: every column except the targets, the raw/cleaned text and the id.
label = "is_good_review"
ignore_cols = [label,"rate", "review", "review.cleaned", "id"]
features = [col for col in dfshuf_eval.columns if col not in ignore_cols]
X_dfshuf_eval, y_dfshuf_eval = dfshuf_eval[features], dfshuf_eval[label]
eval_predicted = random_forest(X_dfshuf_eval)
# mean squared difference between the true labels and the predicted ones
print(mean_squared_error(y_dfshuf_eval, eval_predicted))

In [0]:
# show feature importance
# NOTE(review): `rf` is assigned inside random_forest(), not in this cell —
# confirm a fitted `rf` is visible at notebook level before running this.
feature_importances_df = pd.DataFrame({"feature": features, "importance": rf.feature_importances_}).sort_values("importance", ascending = False)
feature_importances_df.head(20)

In [0]:
# ROC curve

from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

# NOTE(review): eval_predicted comes from random_forest(), which returns
# `rf.predict(...)`, i.e. class labels rather than scores — consider feeding
# predicted probabilities to roc_curve instead (to be confirmed).
y_pred = eval_predicted
fpr, tpr, thresholds = roc_curve(y_dfshuf_eval, y_pred, pos_label = 1)

# area under the curve, displayed in the legend
roc_auc = auc(fpr, tpr)

plt.figure(1, figsize = (15, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# dashed diagonal used as a visual reference
plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

## Réseaux de neurones avec Keras

In [0]:
def neural_network(df_eval):
  """Train a dense Keras network on `dfshuf0` and predict `is_good_review` for `df_eval`.

  The network (3 hidden layers of 250 ReLU units, 2-way softmax output) is
  trained for 150 epochs on the 80% training part of a fixed
  (random_state=42) split of the notebook-level frame `dfshuf0`.

  Parameters
  ----------
  df_eval : pandas.DataFrame (or array-like already laid out like the training features)
      Feature rows to predict.

  Returns
  -------
  The output of `model.predict`: one row of 2 softmax values per row of `df_eval`.
  """
  from keras.utils import to_categorical
  from keras.models import Sequential
  from keras.layers import Dense
  from sklearn.model_selection import train_test_split
  label = "is_good_review"
  ignore_cols = [label, "rate","review", "review.cleaned", "id"]
  features = [c for c in dfshuf0.columns if c not in ignore_cols]
  # only the training part of the split is used here
  X_train, _, y_train, _ = train_test_split(dfshuf0[features], dfshuf0[label], test_size = 0.20, random_state = 42)
  # one-hot encode the 0/1 target for the softmax output
  y_train = to_categorical(y_train, num_classes=2)
  #get number of columns in training data
  n_cols_2 = X_train.shape[1]
  #create model and add layers
  model = Sequential()
  model.add(Dense(250, activation='relu', kernel_initializer='random_normal', input_shape=(n_cols_2,)))
  model.add(Dense(250, activation='relu', kernel_initializer='random_normal'))
  model.add(Dense(250, activation='relu', kernel_initializer='random_normal'))
  model.add(Dense(2, activation='softmax', kernel_initializer='random_normal'))
  #compile model using accuracy to measure model performance
  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  #train model
  model.fit(X_train, y_train, epochs=150, batch_size=500)
  # Present the evaluation features exactly like the training ones (same columns,
  # same order); a feature absent from the evaluation frame is filled with 0.
  if isinstance(df_eval, pd.DataFrame):
    df_eval = df_eval.reindex(columns=features, fill_value=0)
  return model.predict(df_eval)



### Résultats

In [0]:
from sklearn.metrics import mean_squared_error
from keras.utils import to_categorical

# Evaluation inputs: every column except the targets, the raw/cleaned text and the id.
label = "is_good_review"
ignore_cols = [label,"rate", "review", "review.cleaned","id"]
features = [col for col in dfshuf_eval.columns if col not in ignore_cols]
X_dfshuf_eval = dfshuf_eval[features]
# one-hot encode the true labels so they have the same layout as the network output
y_dfshuf_eval = to_categorical(dfshuf_eval[label], num_classes=2)
eval_predicted = neural_network(X_dfshuf_eval)
print(mean_squared_error(y_dfshuf_eval, eval_predicted))

# Modeling `rate`

## Prediction based on `compound`


In [0]:
"""
The rating is predicted from the sentiment analysis alone.
The compound score lies in [-1, 1]; it is rescaled with 2.5 * compound + 2.5 and rounded up to obtain a rating.
"""
def predict_rate(compound): 
  """Turn a sentiment compound score into an integer rating between 1 and 5.

  The score is rescaled with 2.5 * compound + 2.5 and rounded up. The result
  is clamped to 1..5: without the clamp a compound of -1 yields 0, which is
  not a valid rating (and becomes index -1 when used to fill a confusion
  matrix).
  """
  final_rate = math.ceil(2.5 * compound + 2.5)
  return max(1, min(5, final_rate))

def get_score(data):
  """Accuracy of the compound-based rating prediction on `data`.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a `compound` column and a `rate` column.

  Returns
  -------
  float
      Fraction of rows where `predict_rate(compound)` equals `rate`;
      0.0 for an empty frame.
  """
  total = data.shape[0]
  if total == 0:
    return 0.0
  # Walk the raw values by position: the frames are shuffled and filtered, so
  # `data[col][i]` (a LABEL lookup) would raise KeyError or read the wrong row.
  pairs = zip(data['compound'].tolist(), data['rate'].tolist())
  hits = sum(1 for compound, truth in pairs if predict_rate(compound) == truth)
  return hits / total

In [0]:
# Fraction of training rows whose rating equals the compound-based prediction.
print("Accuracy:",get_score(dfshuf0))

In [0]:
# A few examples: predicted rating for the first five training reviews

df_tmp = pd.DataFrame({"rate":dfshuf0['rate'], "review":dfshuf0['review'], "compound":dfshuf0['compound']})
# column 2 of df_tmp is `compound`
compound0, compound1, compound2, compound3, compound4 = (df_tmp.iloc[row, 2] for row in range(5))
for example_compound in (compound0, compound1, compound2, compound3, compound4):
  print("Rate predicted:",predict_rate(example_compound))
df_tmp.head()

In [0]:
def confusion_m(data):
  """Build the 5x5 confusion matrix of the compound-based rating prediction.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a `rate` column (true ratings 1..5) and a `compound` column.

  Returns
  -------
  list of list of int
      `cm[true - 1][predicted - 1]` is the number of rows with that pair.

  Raises
  ------
  ValueError
      If a true rating is outside 1..5.
  """
  cm = [[0] * 5 for _ in range(5)]
  # Walk the raw values by position: the frames are shuffled and filtered, so
  # `data.rate[i]` (a LABEL lookup) would raise KeyError or read the wrong row.
  for true_rate, compound in zip(data["rate"].tolist(), data["compound"].tolist()):
    # list indices must be ints, and the rating may be stored as a float
    rate = int(true_rate)
    if not 1 <= rate <= 5:
      raise ValueError("rate outside 1..5: %r" % (true_rate,))
    # clamp so an out-of-scale prediction can never wrap around to index -1
    rate_pred = max(1, min(5, predict_rate(compound)))
    cm[rate - 1][rate_pred - 1] += 1
  return cm
cm = confusion_m(dfshuf0)
# the diagonal holds, for each rating, the number of exact predictions
print("True prediction")
for position, prefix in enumerate(("1:", "2:", "3:", "4:", "5:")):
  print(prefix, cm[position][position])

In [0]:
from sklearn.metrics import confusion_matrix

labels = ["1","2","3","4","5"]
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix du classifier')
fig.colorbar(cax)
# Pin one tick per class before naming them: prefixing the label list with ''
# only lines up when the default locator happens to put its first tick at -1.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')

# write each count inside its cell (row = true rating, column = predicted rating)
for i in range(len(cm)):
    for j in range(len(cm)):
        plt.text(j, i, format(cm[i][j]),
                ha="center", va="center",
                color="white") 
plt.show()

## Logistic Regression

In [0]:
def logistic_regression2(df_eval):
  """Grid-search a multinomial logistic regression on `dfshuf0` and predict `rate` for `df_eval`.

  The search (over C, 2-fold cross-validation) runs on the 80% training part
  of a fixed (random_state=42) split of the notebook-level frame `dfshuf0`.
  Training can take more than 20 minutes.

  Parameters
  ----------
  df_eval : pandas.DataFrame (or array-like already laid out like the training features)
      Feature rows to predict.

  Returns
  -------
  numpy.ndarray
      One predicted rating per row of `df_eval`.
  """
  from sklearn.linear_model import LogisticRegression
  from sklearn.model_selection import GridSearchCV, train_test_split
  from sklearn.pipeline import make_pipeline
  label = "rate"
  ignore_cols = [label,"is_good_review", "review", "review.cleaned", "id"]
  features = [c for c in dfshuf0.columns if c not in ignore_cols]
  # only the training part of the split is used here
  X_train, _, y_train, _ = train_test_split(dfshuf0[features], dfshuf0[label], test_size = 0.20, random_state = 42)
  parameters = {
      'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]
  }
  pipeline = make_pipeline(LogisticRegression(solver="newton-cg",multi_class='multinomial'))
  grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=2)
  # A fit failure is allowed to propagate: swallowing it only postponed the
  # crash to predict(), with a "not fitted" error hiding the real cause.
  grid_search.fit(X_train, y_train)
  # Present the evaluation features exactly like the training ones (same columns,
  # same order); a feature absent from the evaluation frame is filled with 0.
  if isinstance(df_eval, pd.DataFrame):
    df_eval = df_eval.reindex(columns=features, fill_value=0)
  return grid_search.predict(df_eval)

### Résultats

In [0]:
from sklearn.metrics import mean_squared_error

# Evaluation inputs: every column except the targets, the raw/cleaned text and the id.
label = "rate"
ignore_cols = [label,"is_good_review", "review", "review.cleaned","id"]
features = [col for col in dfshuf_eval.columns if col not in ignore_cols]
X_dfshuf_eval, y_dfshuf_eval = dfshuf_eval[features], dfshuf_eval[label]
eval_predicted = logistic_regression2(X_dfshuf_eval)
# mean squared difference between the true ratings and the predicted ones
print(mean_squared_error(y_dfshuf_eval, eval_predicted))

In [0]:
from sklearn.metrics import confusion_matrix

labels = ["1","2","3","4","5"]
# Fix the class list so the matrix is always 5x5 and row/column k is rating k+1,
# even when a rating is absent from both the truth and the predictions.
cm = confusion_matrix(y_dfshuf_eval, eval_predicted, labels=[1, 2, 3, 4, 5])
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix')
fig.colorbar(cax)
# Pin one tick per class before naming them: prefixing the label list with ''
# only lines up when the default locator happens to put its first tick at -1.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
# cells above half the maximum are bright in the default colormap: use dark text there
thresh = cm.max()/2
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, format(cm[i, j]),
                ha="center", va="center",
                color="black" if cm[i, j] > thresh else "white") 
plt.show()

## Randomforest Classifier

In [0]:
def random_forest2(df_eval):
  """Grid-search a random forest on `dfshuf0` and predict `rate` for `df_eval`.

  The search (n_estimators x criterion, 5-fold cross-validation) runs on the
  80% training part of a fixed (random_state=42) split of the notebook-level
  frame `dfshuf0`. The fitted search is also published as the notebook-level
  variable `grid_search`, which the feature-importance cell reads.

  Parameters
  ----------
  df_eval : pandas.DataFrame (or array-like already laid out like the training features)
      Feature rows to predict.

  Returns
  -------
  numpy.ndarray
      One predicted rating per row of `df_eval`.
  """
  # `grid_search` must outlive the call: the feature-importance cell uses
  # grid_search.best_estimator_
  global grid_search
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.model_selection import GridSearchCV, train_test_split
  from sklearn.pipeline import make_pipeline
  label = "rate"
  ignore_cols = [label, "is_good_review","review", "review.cleaned", "id"]
  features = [c for c in dfshuf0.columns if c not in ignore_cols]
  # only the training part of the split is used here
  X_train, _, y_train, _ = train_test_split(dfshuf0[features], dfshuf0[label], test_size = 0.20, random_state = 42)
  parameters = {
    'randomforestclassifier__n_estimators':(25,50,75,100),
    'randomforestclassifier__criterion':('entropy', 'gini')
    }
  # No TfidfTransformer step: it is meant for raw term-count matrices, whereas
  # these features are already tf-idf weights mixed with sentiment scores,
  # lengths and doc2vec components (some of them negative).
  pipeline = make_pipeline(RandomForestClassifier())
  grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
  # train the random forest classifier through the grid search
  grid_search.fit(X_train, y_train)
  # Present the evaluation features exactly like the training ones (same columns,
  # same order); a feature absent from the evaluation frame is filled with 0.
  if isinstance(df_eval, pd.DataFrame):
    df_eval = df_eval.reindex(columns=features, fill_value=0)
  return grid_search.predict(df_eval)


### Résultats

In [0]:
from sklearn.metrics import mean_squared_error

# Evaluation inputs: every column except the targets, the raw/cleaned text and the id.
label = "rate"
ignore_cols = [label,"is_good_review", "review", "review.cleaned","id"]
features = [col for col in dfshuf_eval.columns if col not in ignore_cols]
X_dfshuf_eval, y_dfshuf_eval = dfshuf_eval[features], dfshuf_eval[label]
eval_predicted = random_forest2(X_dfshuf_eval)
# mean squared difference between the true ratings and the predicted ones
print(mean_squared_error(y_dfshuf_eval, eval_predicted))

In [0]:
# on montre les features importante
# Rank the features by the importance given by the best random forest of the grid search.
# NOTE(review): `grid_search` is assigned inside random_forest2(), not in this cell —
# confirm a fitted `grid_search` is visible at notebook level before running this.
feature_importances_df = pd.DataFrame({"feature": features, "importance": grid_search.best_estimator_.named_steps["randomforestclassifier"].feature_importances_}).sort_values("importance", ascending = False)
feature_importances_df.head(20)

In [0]:
from sklearn.metrics import confusion_matrix
labels = ["1","2","3","4","5"]
# Fix the class list so the matrix is always 5x5 and row/column k is rating k+1,
# even when a rating is absent from both the truth and the predictions.
cm = confusion_matrix(y_dfshuf_eval, eval_predicted, labels=[1, 2, 3, 4, 5])
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix du classifier')
fig.colorbar(cax)
# Pin one tick per class before naming them: prefixing the label list with ''
# only lines up when the default locator happens to put its first tick at -1.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
# cells above half the maximum are bright in the default colormap: use dark text there
thresh = cm.max()/2
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, format(cm[i, j]),
                ha="center", va="center",
                color="black" if cm[i, j] > thresh else "white") 
plt.show()

## Keras

In [0]:
def neural_network2(df_eval):
  """Train a dense Keras network on `dfshuf0` and predict `rate` for `df_eval`.

  The network (3 hidden layers of 250 ReLU units, 5-way softmax output) is
  trained for 150 epochs on the 80% training part of a fixed
  (random_state=42) split of the notebook-level frame `dfshuf0`.

  Parameters
  ----------
  df_eval : pandas.DataFrame (or array-like already laid out like the training features)
      Feature rows to predict.

  Returns
  -------
  The output of `model.predict`: one row of 5 softmax values per row of
  `df_eval`; column k stands for rating k + 1.
  """
  from keras.utils import to_categorical
  from keras.models import Sequential
  from keras.layers import Dense
  from sklearn.model_selection import train_test_split
  label = "rate"
  ignore_cols = [label, "is_good_review","review", "review.cleaned", "id"]
  features = [c for c in dfshuf0.columns if c not in ignore_cols]
  # only the training part of the split is used here
  X_train, _, y_train, _ = train_test_split(dfshuf0[features], dfshuf0[label], test_size = 0.20, random_state = 42)
  # ratings 1..5 become classes 0..4 before one-hot encoding
  y_train = [x - 1 for x in y_train]
  y_train = to_categorical(y_train, num_classes=5)
  #get number of columns in training data
  n_cols_2 = X_train.shape[1]
  #create model and add layers
  model = Sequential()
  model.add(Dense(250, activation='relu', kernel_initializer='random_normal', input_shape=(n_cols_2,)))
  model.add(Dense(250, activation='relu', kernel_initializer='random_normal'))
  model.add(Dense(250, activation='relu', kernel_initializer='random_normal'))
  model.add(Dense(5, activation='softmax', kernel_initializer='random_normal'))
  #compile model using accuracy to measure model performance
  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  #train model
  model.fit(X_train, y_train, epochs=150, batch_size=500)
  # Present the evaluation features exactly like the training ones (same columns,
  # same order); a feature absent from the evaluation frame is filled with 0.
  if isinstance(df_eval, pd.DataFrame):
    df_eval = df_eval.reindex(columns=features, fill_value=0)
  return model.predict(df_eval)



### Résultats

In [0]:
from sklearn.metrics import mean_squared_error

# Evaluation inputs: every column except the targets, the raw/cleaned text and the id.
label = "rate"
ignore_cols = [label,"is_good_review", "review", "review.cleaned","id"]
features = [col for col in dfshuf_eval.columns if col not in ignore_cols]
X_dfshuf_eval = dfshuf_eval[features]

# ratings 1..5 become classes 0..4, then one-hot rows laid out like the network output
y_dfshuf_eval = to_categorical([rating - 1 for rating in dfshuf_eval.rate], num_classes=5)
eval_predicted = neural_network2(X_dfshuf_eval)
print(mean_squared_error(y_dfshuf_eval, eval_predicted))