In [3]:
# Authors: Alexandre Gramfort
#          Chloe Clavel
# License: BSD Style.
# TP Cours ML Telecom ParisTech MDI343

# Set up

In [9]:
import os.path as op
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from glob import glob
import re
import pandas as pd
import operator
import unittest
import sys
import operator
from sklearn.model_selection import *
import matplotlib.pyplot as plt
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [10]:
# Load data
print("Loading dataset")

filenames_neg = sorted(glob(op.join('.', 'data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('.', 'data', 'imdb1', 'pos', '*.txt')))

texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

Loading dataset


# Implementation of the classifier
## Count_words

In [None]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """  
    
    wordList = re.subn('\W', ' ', ' '.join(texts).lower())[0].split(' ')
    words = set(wordList)
    vocabulary = dict(zip(words, range(len(words))))
              
    n_features = len(words)
    counts = np.zeros( (len(texts), n_features))
    
    for l, text in enumerate(texts, 0):
        wordList = re.subn('\W', ' ', text.lower())[0].split(' ')
        for word in wordList:
            if(word == ''): continue
            counts[l][vocabulary[word]] = counts[l][vocabulary[word]] + 1   
    return vocabulary, counts

## README

## Implementation du NB

In [None]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass

        
    def fit(self, X, y):
        N = X.shape[0]
        val, counter = np.unique(y, return_counts=True)
        self.class_counter = dict(zip(val, counter))
        nb_word = X.shape[1]
        nb_class = val.shape[0]
        
        self.condprob = dict()
        self.prior = np.zeros(nb_class)
        T = {c:np.sum(X[y==c, :], axis=0) for c in self.class_counter}
        
        for c,Nc in self.class_counter.items():
            self.prior[c] = Nc/N
            self.condprob[c] = (T[c] +1) / ( np.sum(T[c] + 1))

        return self

    def predict(self, X):
        X = np.array(X)
        y = [self.apply(vec) for vec in X]
        return np.array(y)
    
    def apply(self, vector):
        vector = np.array(vector)
        score = dict()
        for c in self.class_counter:
            score[c] = np.log(self.prior[c])
            score[c] += np.log(self.condprob[c][vector >0.0]).sum()
        return max(score.items(), key=operator.itemgetter(1))[0]
    
    
    def score(self, X, y):
        return np.mean(self.predict(X) == y)

In [None]:
# Count words in text
vocabulary, X = count_words(texts)

# Try to fit, predict and score
nb = NB()
nb.fit(X[::2], y[::2])
print(nb.score(X[1::2], y[1::2]))

## Avec CV = 5 :

In [None]:
# Try to fit, predict and score
nb = NB()
my_nb_res = cross_validate(nb, X, y, cv=5)

##  Stop word

In [None]:
def count_words(texts, stop_word = False):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    wordList = re.subn('\W', ' ', ' '.join(texts).lower())[0].split(' ')
    words = set(wordList)
    
    
    
    if stop_word:
        with open("./data/english.stop", 'r') as file:
            stop_words = set(file.read())
        words = words - stop_words    
    vocabulary = dict(zip(words, range(len(words))))    
            
    n_features = len(words)
    counts = np.zeros( (len(texts), n_features))
    
    for l, text in enumerate(texts, 0):
        wordList = re.subn('\W', ' ', text.lower())[0].split(' ')
        for word in wordList:
            if(word not in words): continue
            counts[l][vocabulary[word]] = counts[l][vocabulary[word]] + 1   
    return vocabulary, counts

In [None]:
# Count words in text
vocabulary, X = count_words(texts, stop_word = True)

# Try to fit, predict and score
nb = NB()
nb.fit(X[::2], y[::2])
print(nb.score(X[1::2], y[1::2]))

In [None]:
# Try to fit, predict and score
nb = NB()
my_nb_res_stop_word = cross_validate(nb, X, y, cv=5)

# SCIKIT-LEARN USE

## Question 1:
Compare your implementation with scikitlearn

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

### By words

In [None]:
nb_pipline = Pipeline([('count', CountVectorizer()), ('nb', MultinomialNB())])
sk_nb_res = cross_validate(nb_pipline.set_params(
    count__analyzer='word'), texts, y, cv=5)
sk_nb_bigram_res = cross_validate(nb_pipline.set_params(
    count__analyzer='word', count__ngram_range=(1, 2)), texts, y, cv=5)
sk_nb_res_stop_word = cross_validate(nb_pipline.set_params(
    count__analyzer='word', count__ngram_range=(1,1), count__stop_words='english'), texts, y, cv=5)
sk_nb_bigram_res_stop_word = cross_validate(nb_pipline.set_params(
    count__analyzer='word', count__ngram_range=(2, 2), count__stop_words='english'), texts, y, cv=5)

In [None]:
label = ['my NB','my NB/stop word','NB', 'NB/bigram', 'NB/stop word', 'NB/bigram/stop word']
data_nb = [my_nb_res, my_nb_res_stop_word,sk_nb_res, sk_nb_bigram_res, sk_nb_res_stop_word, sk_nb_bigram_res_stop_word]

plt.figure(figsize=(16,12))

for i, item in enumerate(sk_nb_res.keys(),0):
    plt.subplot(2,2,i+1)
    x = [np.mean(res[item]) for res in data_nb]
    plt.bar(range(len(label)),x, tick_label =label)
    plt.ylim(min(x) -0.05, max(x) + 0.01)
    plt.title(item)
plt.plot()

### Question 2

In [None]:
from sklearn.svm import LinearSVC
svc_pipline = Pipeline([('count', CountVectorizer()), ('svc', LinearSVC())])
svc_nb_res = cross_validate(svc_pipline.set_params(
    count__analyzer='word'), texts, y, cv=5)
svc_nb_bigram_res = cross_validate(svc_pipline.set_params(
    count__analyzer='char_wb', count__ngram_range=(2, 2)), texts, y, cv=5)
svc_nb_res_stop_word = cross_validate(svc_pipline.set_params(
    count__analyzer='word', count__ngram_range=(1, 1), count__stop_words='english'), texts, y, cv=5)
svc_nb_bigram_res_stop_word = cross_validate(svc_pipline.set_params(
    count__analyzer='char_wb', count__ngram_range=(2, 2), count__stop_words='english'), texts, y, cv=5)

In [None]:
label = ['SVC', 'SVC/bigram', 'SVC/stop word', 'SVC/bigram/stop word']
data_svc = [svc_nb_res, svc_nb_bigram_res,
            svc_nb_res_stop_word, svc_nb_bigram_res_stop_word]

plt.figure(figsize=(16, 12))

for i, item in enumerate(sk_nb_res.keys(), 0):
    plt.subplot(2, 2, i+1)
    x = [np.mean(res[item]) for res in data_svc]
    plt.bar(range(len(label)), x, tick_label=label)
    plt.title(item)
plt.plot()

In [None]:
from sklearn.linear_model import LogisticRegression
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
cross_validate(LogisticRegression(), X, y, cv=5)

lr_pipline = Pipeline([('count', CountVectorizer()),
                       ('lr', LogisticRegression())])
lr_res = cross_validate(lr_pipline.set_params(
    count__analyzer='word'), texts, y, cv=5)
lr_bigram_res = cross_validate(lr_pipline.set_params(
    count__analyzer='char_wb', count__ngram_range=(2, 2)), texts, y, cv=5)
lr_res_stop_word = cross_validate(lr_pipline.set_params(
    count__analyzer='word', count__ngram_range=(1, 1), count__stop_words='english'), texts, y, cv=5)
lr_bigram_res_stop_word = cross_validate(lr_pipline.set_params(
    count__analyzer='char_wb', count__ngram_range=(2, 2), count__stop_words='english'), texts, y, cv=5)

In [None]:
label = ['LR', 'LR/bigram', 'LR/stop word', 'LR/bigram/stop word']
data_lr = [lr_res, lr_bigram_res, lr_res_stop_word, lr_bigram_res_stop_word]

plt.figure(figsize=(16, 12))

for i, item in enumerate(sk_nb_res.keys(), 0):
    plt.subplot(2, 2, i+1)
    x = [np.mean(res[item]) for res in data]
    plt.bar(range(len(label)), x, tick_label=label)
    plt.title(item)
plt.plot()

### NLTK

In [None]:
from nltk import SnowballStemmer

In [None]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                      ('mnb', MultinomialNB(fit_prior=False)),

                             

In [4]:
from nltk import pos_tag

In [5]:
pos_tag('Hello world')

[('H', 'NNP'),
 ('e', 'NN'),
 ('l', 'NN'),
 ('l', 'NN'),
 ('o', 'NN'),
 (' ', 'NNP'),
 ('w', 'NN'),
 ('o', 'NN'),
 ('r', 'NN'),
 ('l', 'NN'),
 ('d', 'NN')]

In [25]:
import nltk

from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\zhufa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhufa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\zhufa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from nltk import word_tokenize

In [27]:
def filter_pos(texts=texts):
    filtered = [list(filter(lambda x: (x[1] == 'NOUN') 
                         or (x[1] == 'VERB') 
                         or (x[1] == 'ADV') 
                         or (x[1] == 'ADJ'), pos_tag(word_tokenize(txt), tagset='universal'))) for txt in texts]
    return filtered

def apply_pos_tag(filtered):
    postagged = []
    for i, tuples in enumerate(filtered, 0):
        postagged.append('')
        for t in tuples:
            postagged[i] += t[0] + ' '
    return postagged



In [36]:
postText = apply_pos_tag(filter_pos())
postText

["plot teen couples go church party drink then drive get accident guys dies girlfriend continues see life has nightmares 's deal watch movie sorta find critique mind-fuck movie teen generation touches very cool idea presents very bad package is makes review even harder write i generally applaud films attempt break mold mess head such lost highway memento are good bad ways making types films folks just did n't snag correctly seem have taken pretty neat concept executed terribly are problems movie well main problem is 's simply too jumbled starts normal then downshifts fantasy world audience member have idea 's going are dreams are characters coming back dead are others look dead are strange apparitions are disappearances are looooot chase scenes are tons weird things happen most is simply not explained now i personally do n't mind trying unravel film now then when does is give same clue over again i get kind fed while is film biggest problem 's obviously got big secret hide seems want h

In [None]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Exception in thread Thread-8:
Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "D:\ProgramData\Anaconda3\lib\site-packages\nltk\downloader.py", line 1871, in run
    for msg in self.data_server.incr_download(self.items):
  File "D:\ProgramData\Anaconda3\lib\site-packages\nltk\downloader.py", line 535, in incr_download
    for msg in self._download_list(info_or_id, download_dir, force):
  File "D:\ProgramData\Anaconda3\lib\site-packages\nltk\downloader.py", line 578, in _download_list
    for msg in self.incr_download(item, download_dir, force):
  File "D:\ProgramData\Anaconda3\lib\site-packages\nltk\downloader.py", line 549, in incr_download
    for msg in self.incr_download(info.children, download_dir, force):
  File "D:\ProgramData\Anaconda3\lib\site-packages\nltk\downloader.py", line 535, in incr_download
    for msg in self._download_list(info_or_id, download_dir, force):
  File "D:\ProgramDat