# Exercise 15

In [38]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups

import nltk
from nltk.stem.snowball import SnowballStemmer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

import pickle

In [11]:
np.set_printoptions(precision=2)

In [44]:
def save_model(model, fname):
    """Serialize `model` with pickle and write it to the file `fname`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(fname, mode="wb") as sink:
        pickle.dump(model, sink)

def load_model(fname):
    """Read the pickle file `fname` and return the object stored in it.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source (e.g. ones written by `save_model`).
    """
    with open(fname, mode="rb") as source:
        return pickle.load(source)

def test_accurracy(model, dataset):
    predicted = model.predict(dataset.data)
    return np.mean(predicted == dataset.target)

## Fetch data

In [3]:
# Load the 'train' and 'test' subsets of the 20 Newsgroups corpus
# (both fetched with shuffle=True).
train_set, test_set = (
    fetch_20newsgroups(subset=split, shuffle=True) for split in ('train', 'test')
)

train_set.target_names  # displayed as the cell output: the category names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Fetch stemming data

In [7]:
# Fetch the NLTK resources used for stemming; packages that are already
# present locally are reported as up-to-date instead of being fetched again.
nltk.download('snowball_data')
# NOTE(review): the stopwords corpus is presumably what
# SnowballStemmer(..., ignore_stopwords=True) consults — confirm in the NLTK docs.
nltk.download('stopwords')

[nltk_data] Downloading package snowball_data to
[nltk_data]     /home/adrian/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!
[nltk_data] Downloading package stopwords to /home/adrian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Stemming vectorization process

In [62]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it emits.

    Stemming is delegated to the module-level `stemmer` object, which is
    looked up when the analyzer is called, so `stemmer` must be defined
    before this vectorizer is fitted or used to transform documents.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # NOTE(review): every item produced by the base analyzer is passed to
            # the stemmer as-is; with ngram_range beyond unigrams those items are
            # presumably multi-word strings stemmed as a whole — verify.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# Module-level English Snowball stemmer; StemmedCountVectorizer's analyzer reads
# this global by name when it runs.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
# Vectorizer instance with the built-in English stop-word list; it is reused as
# the 'vect' step of the manually configured pipeline.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

## GridSearch config (Random Forest)

In [36]:
# Hyper-parameter grid explored by the search below.
# Every key has the form '<step name>__<parameter name>', where the step names
# are the ones given to the Pipeline ('vect', 'tfidf', 'dtc').
# E.g. 'vect__ngram_range': [(1, 1), (1, 2)] compares unigram-only features with
# unigram + bigram features and keeps whichever cross-validates best.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': [True],
    'dtc__n_estimators': [64, 128, 256],
    'dtc__max_depth': [32],
}

pipe_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # The step is named 'dtc' although it holds a random forest; the name is kept
    # so the 'dtc__*' grid keys above stay valid.
    # A fixed random_state makes the forest's random sampling repeatable, so the
    # reported best score/params can be reproduced across runs.
    ('dtc', RandomForestClassifier(random_state=42)),
])

# n_jobs=-1 runs the candidate fits in parallel on all available cores;
# cross-validation uses GridSearchCV's default splitting strategy.
gs_clf = GridSearchCV(pipe_clf, parameters, n_jobs=-1)

In [27]:
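# Debug helper: uncomment to list every parameter name ('step__param') that can
# be used as a key in `parameters`.
# gs_clf.get_params().keys()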

## Choose best params

In [39]:
# Results recorded from earlier grid-search runs, kept for comparison
# (apparently each pair comes from a different parameter grid):
# Best score: 0.7556135236708782
# Best param: {'dtc__max_depth': 16, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
# Best score: 0.802545504840292
# Best param: {'dtc__max_depth': 32, 'dtc__n_estimators': 128, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
# Best score: 0.8153619542925773
# Best param: {'dtc__max_depth': 32, 'dtc__n_estimators': 256, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [37]:
# Run the grid search on the training split. fit() returns the search object
# itself, so calling it in place leaves `gs_clf` bound to the fitted search.
gs_clf.fit(train_set.data, train_set.target)

# Keep the best pipeline found by the search for saving and evaluation.
model = gs_clf.best_estimator_

print("Best score: %s" % gs_clf.best_score_)
print("Best param: %s" % gs_clf.best_params_)

Best score: 0.8153619542925773
Best param: {'dtc__max_depth': 32, 'dtc__n_estimators': 256, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [42]:
# Persist the best pipeline found by the grid search so it can be reloaded
# without re-running the search (path is relative to the working directory).
save_model(model, "random_forest.pckl")

In [45]:
# Reload the persisted pipeline and score the *reloaded* object on the test
# split, so this cell checks the save/load round trip and does not depend on the
# in-memory `model` left over from the grid-search cell.
random_forest_model = load_model("random_forest.pckl")
test_accurracy(random_forest_model, test_set)

0.7510621348911312

## Manual set params

In [65]:
# Hand-tuned pipeline: stemmed bag-of-words -> TF-IDF -> random forest.
# NOTE: the 'vect' step is the shared `stemmed_count_vect` instance, so the
# set_params call below also changes that module-level object.
pipe_clf_manual = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('dtc', RandomForestClassifier())
])

# set_params returns the pipeline, so as the last expression of the cell it also
# displays the resulting configuration.
pipe_clf_manual.set_params(
    vect__ngram_range=(1, 2),
    tfidf__use_idf=True,
    dtc__n_estimators=100,
    dtc__max_depth=100,
    dtc__class_weight='balanced',
    # Fixed seed so the forest, and therefore the reported accuracy, can be
    # reproduced across runs.
    dtc__random_state=42,
)

Pipeline(steps=[('vect',
                 StemmedCountVectorizer(ngram_range=(1, 2),
                                        stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('dtc',
                 RandomForestClassifier(class_weight='balanced',
                                        max_depth=100))])

In [66]:
# Train the hand-tuned pipeline on the training split and keep the value
# returned by fit() under a descriptive name.
model_manual_random_forest = pipe_clf_manual.fit(train_set.data, train_set.target)

In [67]:
# Score the hand-tuned pipeline on the held-out test split.
predicted = pipe_clf_manual.predict(test_set.data)
manual_accuracy = np.mean(predicted == test_set.target)

print("Accuracy: {}".format(manual_accuracy))

Accuracy: 0.789830058417419
