# Importation des textes et premières observations

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
data=pd.read_csv("base.csv", sep="\t", encoding="latin-1", index_col=0)
data.drop(["Fichier", "Longueur"], axis=1, inplace=True)

In [4]:
res=[]
for i in range (len(data)) :
    if data["Note"][i] >7 :
        res.append(1)
    else : 
        res.append(0)
data["Note"]=res

In [5]:
data=data.iloc[np.random.permutation(len(data))]
data.reset_index(drop=True, inplace=True)

In [6]:
data[:5]

Unnamed: 0,Avis,Film,Note
0,Dans la surenchère de super-héros qui est aujo...,XD,1
1,Il n'y a que chez les super-héros qu'on peut f...,Sb,1
2,C'est une «chanson» qui s'inculque. Qui se déc...,GG,1
3,"Spider-Man est un héros de l'univers Marvel, I...",SM,1
4,"Le sucre est un poison pour la santé, mais il ...",AU,0


## Séparation entre train et test 

In [7]:
limite=int(len(data)*2/3)

In [8]:
data_train=data["Avis"][:limite]
data_test=data["Avis"][limite:]

In [9]:
y_train=data["Note"][:limite]
y_test=data["Note"][limite:]

# Tokenisation

In [10]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

# chargement des stopwords français
stopwords_extent= stopwords.words('french')+[",", ".", "(", ")", "c'est", "le", "la", "les", "un", "une", "des", "c'est",
                                             "va", "ainsi", "ce", "cette", "ces", "là", "du", "s", "''", ':', "a"
                                            'cependant', 'certain', 'certaine', 'certainement', 'certaines', 'certains', 
                                             'certes', 'cet', 'ceux', 'chacun', 'chaque']
#retire certains signes de ponctuation qui ne présentent pas
#d'intérêt dans l'immédiat.

french_stopwords = set(stopwords_extent)



In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [12]:
def score(max) :
    count_vect = CountVectorizer(min_df=0.0, stop_words=french_stopwords, max_features=max)
    X_train_counts = count_vect.fit_transform(data_train)
    
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    clf = MultinomialNB().fit(X_train_tf, y_train)

    docs_new = data_test
    X_new_counts = count_vect.transform(docs_new)
    X_new_tf = tf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tf)

    return np.mean(predicted == y_test)

In [13]:
print([[i, score(i)] for i in [100, 200, 400, 600, 800, 1000, 3000, 5000, 6000, 10000]])

[[100, 0.67142857142857137], [200, 0.70952380952380956], [400, 0.76190476190476186], [600, 0.75714285714285712], [800, 0.76666666666666672], [1000, 0.77619047619047621], [3000, 0.68571428571428572], [5000, 0.59999999999999998], [6000, 0.59999999999999998], [10000, 0.56190476190476191]]


In [20]:
count_vect = CountVectorizer(min_df=0.0, stop_words=french_stopwords, max_features=1000)
X_train_counts = count_vect.fit_transform(data_train)
    
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
    
clf = MultinomialNB().fit(X_train_tf, y_train)

docs_new = data_test
X_new_counts = count_vect.transform(docs_new)
X_new_tf = tf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tf)

np.mean(predicted == y_test)

0.77619047619047621

In [17]:
count_vect = CountVectorizer(min_df=0.0, stop_words=french_stopwords, max_features=1000, ngram_range=(1, 2))
X_train_counts = count_vect.fit_transform(data_train)
    
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
    
clf = MultinomialNB().fit(X_train_tf, y_train)

docs_new = data_test
X_new_counts = count_vect.transform(docs_new)
X_new_tf = tf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tf)

np.mean(predicted == y_test)

0.75714285714285712

In [19]:
#print (count_vect.get_feature_names())

Idées pour améliorer le score : 
- scrapper plus de données,
- jeter un coup d'oeil sur les features et en retirer (soit par intuition, regard sur les données(ajouter des stopwords), soit en jouant sur max_features dans countvectorizer),
- essayer d'autres modèles : linéaires, arbres de décision (avec un nombre minimal d'ind par feuille).

In [94]:
count_vect = CountVectorizer(min_df=0.0, stop_words=french_stopwords, max_features=1000, ngram_range=(2,2))
X_train_counts = count_vect.fit_transform(data_train)
    
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
    
clf = MultinomialNB().fit(X_train_tf, y_train)

docs_new = data_test
X_new_counts = count_vect.transform(docs_new)
X_new_tf = tf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tf)

np.mean(predicted == y_test)

0.74285714285714288

In [86]:
count_vect = CountVectorizer(min_df=0.0, stop_words=french_stopwords, max_features=1000, ngram_range=(3,3))
X_train_counts = count_vect.fit_transform(data_train)
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf_lin = tf_transformer.transform(X_train_counts)


clf_lin = LinearSVC().fit(X_train_tf_lin, y_train)

docs_new = data_test
X_new_counts = count_vect.transform(docs_new)
X_new_tf = tf_transformer.transform(X_new_counts)

predicted_lin = clf_lin.predict(X_new_tf)

np.mean(predicted_lin == y_test)

0.6333333333333333

http://streamhacker.com/2012/11/22/text-classification-sentiment-analysis-nltk-scikitlearn/

In [95]:
coeff_cr=clf.coef_.argsort()[0]
coeff_dec=clf.coef_.argsort()[0][::-1]

In [96]:
for i in range (50) :
    print("coeff :"+ str(clf_lin.coef_[0][coeff_dec[i]])+" pour :"+count_vect.get_feature_names()[coeff_dec[i]])

coeff :-0.186388722533 pour :spider man
coeff :-0.574879460811 pour :super héros
coeff :0.226398578216 pour :iron man
coeff :0.206366367018 pour :captain america
coeff :0.37432297347 pour :sam raimi
coeff :0.0292976837649 pour :peter parker
coeff :-0.0154127364728 pour :effets spéciaux
coeff :-0.337683119775 pour :très bon
coeff :0.88526016418 pour :très bien
coeff :-0.211933862962 pour :film super
coeff :0.688429192536 pour :scènes action
coeff :0.24287947928 pour :films super
coeff :0.687878932005 pour :bouffon vert
coeff :0.148286137749 pour :soldat hiver
coeff :-0.404322715881 pour :bryan singer
coeff :0.930454737326 pour :bon film
coeff :-0.502185279111 pour :first class
coeff :0.216952034944 pour :tony stark
coeff :0.329600869003 pour :mary jane
coeff :-0.7219312654 pour :premier film
coeff :0.444499238084 pour :downey jr
coeff :0.0138125194145 pour :robert downey
coeff :-0.374186204946 pour :gardiens galaxie
coeff :0.0616152051193 pour :tobey maguire
coeff :0.434650394441 pour :

In [97]:
for i in range (50) :
    print("coeff :"+ str(clf_lin.coef_[0][coeff_cr[i]])+" pour :"+count_vect.get_feature_names()[coeff_cr[i]])

coeff :-0.114303193949 pour :ça suffit
coeff :-0.059963044104 pour :trois fois
coeff :-0.0313714316869 pour :mal écrit
coeff :0.1647771063 pour :christian chat
coeff :-0.0161444152248 pour :demande si
coeff :0.213643308907 pour :guerre contre
coeff :0.519254493305 pour :chat parle
coeff :-0.335478786675 pour :tante milf
coeff :-0.58967719617 pour :ça part
coeff :-0.0496227079574 pour :encore moins
coeff :-0.556740893816 pour :trilogie raimi
coeff :0.361263656391 pour :peyton reed
coeff :-0.660269402961 pour :aucun intérêt
coeff :-0.708067484114 pour :pete ouais
coeff :0.194658269456 pour :ça veut
coeff :0.335293341231 pour :liv tyler
coeff :-0.260550380729 pour :marvel universe
coeff :-0.22039363023 pour :ça tante
coeff :-0.275858620499 pour :paul rudd
coeff :0.106802240096 pour :zack snyder
coeff :0.108721226912 pour :petit peu
coeff :0.219751249927 pour :encore ça
coeff :-0.486928891273 pour :parce faut
coeff :0.394020852467 pour :seule seconde
coeff :0.194159113492 pour :bon ça
coef