In [1]:
import pandas as pd
import numpy as np
import re 
import nltk
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import reuters, stopwords
from string import punctuation

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
stopwords_json_en = set(json.load(open('en.json')))
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)
stopwords_time = set(["--","jan","january","feb","february","mar","march","apr","april","may","jun","june","jul","july",
             "aug","august","sept","september","oct","october","nov","november","dec","december"])
# Combine the stopwords. Its a lot longer so I'm not printing it out...
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct, stopwords_time)

In [4]:
documents = reuters.fileids()
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
    
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

In [5]:
train_categories = [reuters.categories(doc_id) for doc_id in train_docs_id]
test_categories = [reuters.categories(doc_id) for doc_id in test_docs_id]
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_categories) 
test_labels = mlb.transform(test_categories)

## Tokenization with stamming

In [6]:
def tokenize_stam(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text));
    #выкидываем стоп-слова
    words = [word for word in words
                  if word not in stoplist_combined]
    #используем стемминг
    tokens =(list(map(lambda token: PorterStemmer().stem(token),
                  words)));
    p = re.compile('[a-zA-Z]+');
    #выкидываем строки с лишними символами и длиной не меньше 3
    filtered_tokens =list(filter(lambda token: p.match(token) and len(token)>=min_length,
         tokens));
    return filtered_tokens

In [7]:
x_train_stam = [tokenize_stam(t) for t in train_docs]
x_test_stam = [tokenize_stam(t) for t in test_docs]

In [26]:
words = [word_tokenize(t) for t in test_docs + train_docs]

In [28]:
length = [len(t) for t in words]

In [30]:
print(np.max(length))
print(np.argmin(length))

1861
835


In [31]:
all_docs = test_docs + train_docs
all_docs[835]

'13-APR-1987\n  13-APR-1987\n\n'

## Learning

In [8]:
y_test_onelabel = np.argmax(test_labels, axis = 1)
y_train_onelabel = np.argmax(train_labels, axis = 1)

In [10]:
#to work svm with tokenised text
def const(tmp):
    return tmp

In [12]:
def grid_search(train_x, train_y, test_x, test_y, parameters, pipeline):
    grid_search_tune = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='accuracy', cv=5, verbose=1)
    grid_search_tune.fit(train_x, train_y)

    print("Best parameters set:")
    print(grid_search_tune.best_estimator_.steps)

    print("Best score: %0.3f" % grid_search_tune.best_score_)
    print("Applying best classifier on test data:")
    best_clf = grid_search_tune.best_estimator_
    predictions = best_clf.predict(test_x)
    print(accuracy_score(test_y, predictions))
    print(classification_report(test_y, predictions))
    return best_clf

In [17]:
pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', MultinomialNB()),
            ])
parameters = {
                'vect__max_df': [0.3, 0.5, 0,75, 0.9, None],
                'vect__min_df': [3, None],
                'vect__max_features': [3000, 5000, None],
                'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
                #'clf__alpha': [1e-3, 1e-5],
} 
model_bayes_stam_onelabel = grid_search(x_train_stam, y_train_onelabel, x_test_stam, y_test_onelabel, parameters, pipeline)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 13.7min finished


Best parameters set:
[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.3, max_features=3000, min_df=3,
                ngram_range=(1, 1),
                preprocessor=<function const at 0x7f12fbbe2290>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f12fbbe2290>, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]
Best score: 0.845
Applying best classifier on test data:
0.8565750248426631
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       719
           1       0.60      0.55      0.57        22
           2       0.78      0.50      0.61        14
           3       0.48      0.67      0.56        30
           4       0.57      0.71      0.63 

### Accuracy with Naive bayes: 85.66

In [33]:
pipeline = Pipeline([
                ('tfidf', CountVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', LinearSVC()),
            ])
parameters = {
                'tfidf__max_df': [0.5, 0,75, 0.9],
                'tfidf__min_df': [3, None],
                'tfidf__max_features': [3000, 5000, None],
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
} 
model_svc_stam_onelabel = grid_search(x_train_stam, y_train_onelabel, x_test_stam, y_test_onelabel, parameters, pipeline)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 10.6min finished


Best parameters set:
[('tfidf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.9, max_features=None, min_df=3,
                ngram_range=(1, 3),
                preprocessor=<function const at 0x7f12fbbe2290>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f12fbbe2290>, vocabulary=None)), ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0))]
Best score: 0.888
Applying best classifier on test data:
0.8860549850944022
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       719
           1       0.88      0.64 

### Accuracy with SVM(+CountVectorizer): 88.61