In [1]:
import pandas as pd
import numpy as np
import re 
import nltk
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import reuters, stopwords
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('reuters')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to /home/alex/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alex/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
stopwords_json_en = set(json.load(open('en.json')))
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)
stopwords_time = set(["--","jan","january","feb","february","mar","march","apr","april","may","jun","june","jul","july",
             "aug","august","sept","september","oct","october","nov","november","dec","december"])
# Combine the stopwords. Its a lot longer so I'm not printing it out...
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct, stopwords_time)

In [5]:
documents = reuters.fileids()
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
    
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

In [6]:
train_categories = [reuters.categories(doc_id) for doc_id in train_docs_id]
test_categories = [reuters.categories(doc_id) for doc_id in test_docs_id]
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_categories) 
test_labels = mlb.transform(test_categories)

## Lemmatisation

In [7]:
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' 
    
def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(word_tokenize(text))]

In [8]:
def preprocess_text(text):
    # Input: str, i.e. document/sentence
    # Output: list(str) , i.e. list of lemmas
    return [word for word in lemmatize_sent(text) 
            if word not in stoplist_combined
            and word.isalpha() ]

In [9]:
x_train_lem = [preprocess_text(t) for t in train_docs]
x_test_lem = [preprocess_text(t) for t in test_docs]

In [10]:
x_train_lem[0]

['bahia',
 'cocoa',
 'review',
 'shower',
 'continue',
 'week',
 'bahia',
 'cocoa',
 'zone',
 'alleviate',
 'drought',
 'early',
 'improve',
 'prospect',
 'temporao',
 'normal',
 'humidity',
 'level',
 'restore',
 'comissaria',
 'smith',
 'weekly',
 'review',
 'dry',
 'period',
 'temporao',
 'late',
 'year',
 'arrival',
 'week',
 'end',
 'bag',
 'kilo',
 'make',
 'cumulative',
 'total',
 'season',
 'mln',
 'stage',
 'year',
 'cocoa',
 'deliver',
 'earlier',
 'consignment',
 'include',
 'arrival',
 'figure',
 'comissaria',
 'smith',
 'doubt',
 'crop',
 'cocoa',
 'harvesting',
 'practically',
 'end',
 'total',
 'bahia',
 'crop',
 'estimate',
 'mln',
 'bag',
 'sale',
 'stand',
 'mln',
 'hundred',
 'thousand',
 'bag',
 'hand',
 'farmer',
 'middleman',
 'exporter',
 'processor',
 'doubt',
 'cocoa',
 'fit',
 'export',
 'shipper',
 'experience',
 'dificulties',
 'obtain',
 'certificate',
 'view',
 'low',
 'quality',
 'recent',
 'week',
 'farmer',
 'sell',
 'good',
 'part',
 'cocoa',
 'hold',


## Tokenization with stamming

In [11]:
def tokenize_stam(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text));
    #выкидываем стоп-слова
    words = [word for word in words
                  if word not in stoplist_combined]
    #используем стемминг
    tokens =(list(map(lambda token: PorterStemmer().stem(token),
                  words)));
    p = re.compile('[a-zA-Z]+');
    #выкидываем строки с лишними символами и длиной не меньше 3
    filtered_tokens =list(filter(lambda token: p.match(token) and len(token)>=min_length,
         tokens));
    return filtered_tokens

In [12]:
x_train_stam = [tokenize_stam(t) for t in train_docs]
x_test_stam = [tokenize_stam(t) for t in test_docs]

In [28]:
x_train_stam[0]

['bahia',
 'cocoa',
 'review',
 'shower',
 'continu',
 'week',
 'bahia',
 'cocoa',
 'zone',
 'allevi',
 'drought',
 'earli',
 'improv',
 'prospect',
 'come',
 'temporao',
 'normal',
 'humid',
 'level',
 'restor',
 'comissaria',
 'smith',
 'weekli',
 'review',
 'dri',
 'period',
 'mean',
 'temporao',
 'late',
 'year',
 'arriv',
 'week',
 'end',
 'bag',
 'kilo',
 'make',
 'cumul',
 'total',
 'season',
 'mln',
 'stage',
 'year',
 'cocoa',
 'deliv',
 'earlier',
 'consign',
 'includ',
 'arriv',
 'figur',
 'comissaria',
 'smith',
 'doubt',
 'crop',
 'cocoa',
 'harvest',
 'practic',
 'end',
 'total',
 'bahia',
 'crop',
 'estim',
 'mln',
 'bag',
 'sale',
 'stand',
 'mln',
 'hundr',
 'thousand',
 'bag',
 'hand',
 'farmer',
 'middlemen',
 'export',
 'processor',
 'doubt',
 'cocoa',
 'fit',
 'export',
 'shipper',
 'experienc',
 'dificulti',
 'obtain',
 'superior+',
 'certif',
 'view',
 'lower',
 'qualiti',
 'recent',
 'week',
 'farmer',
 'sold',
 'good',
 'part',
 'cocoa',
 'held',
 'consign',
 '

## Learning

In [13]:
def grid_search(train_x, train_y, test_x, test_y, parameters, pipeline):
    grid_search_tune = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='accuracy', cv=5, verbose=1)
    grid_search_tune.fit(train_x, train_y)

    print("Best parameters set:")
    print(grid_search_tune.best_estimator_.steps)

    print("Best score: %0.3f" % grid_search_tune.best_score_)
    print("Applying best classifier on test data:")
    best_clf = grid_search_tune.best_estimator_
    predictions = best_clf.predict(test_x)
    print(accuracy_score(test_y, predictions))
    print(classification_report(test_y, predictions))
    return best_clf

In [14]:
#to work svm with tokenised text
def const(tmp):
    return tmp

## Multilabel classification

In [15]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)),
            ])
parameters = {
                'tfidf__max_df': [0.3, 0.5, 0,75, 0.9],
                'tfidf__min_df': [3, None],
                'tfidf__max_features': [1000, 3000, 5000, None],
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                "clf__estimator__C": [0.01, 0.1, 1],
                "clf__estimator__class_weight": ['balanced', None],
} 
model_svc_lem = grid_search(x_train_lem, train_labels, x_test_lem, test_labels, parameters, pipeline)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 27.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 40.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 62.7min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 80.6min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 109.1min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 120.5min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=5000,
                min_df=3, ngram_range=(1, 2), norm='l2',
                preprocessor=<function const at 0x7f027b69f830>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f027b69f830>, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight=None, dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.

In [24]:
model_svc_stam = grid_search(x_train_stam, train_labels, x_test_stam, test_labels, parameters, pipeline)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 27.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 40.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 63.9min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 85.4min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 118.9min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 130.3min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=3, ngram_range=(1, 3), norm='l2',
                preprocessor=<function const at 0x7f027b69f830>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f027b69f830>, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, 

## Accuracy for multilabel classification:
### SVM with lemmatisation: 0.825

### SVM with stamming: 0.822

## Multiclass classification(on target for each text)

#### Здесь автор статьи про RDML https://arxiv.org/pdf/1805.01890v2.pdf сводил  к задаче milticlass следующим образом. 

In [26]:
y_test_onelabel = np.argmax(test_labels, axis = 1)
y_train_onelabel = np.argmax(train_labels, axis = 1)

#### По сути из нескольких тем-таргетов выбирался тот, который лексиграфически раньше(если их упорядочить). Не понятно, почему так  можно делать, ведь никакой связи с между ними нет.

In [0]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', LinearSVC()),
            ])
parameters = {
                'tfidf__max_df': [0.3, 0.5, 0,75, 0.9],
                'tfidf__min_df': [3, None],
                'tfidf__max_features': [1000, 3000, 5000, None],
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                "clf__C": [0.01, 0.1, 1],
                "clf__class_weight": ['balanced', None],
} 
model_svc_lem_onelabel = grid_search(train_docs_tokens, y_train_onelabel, test_docs_tokens, y_test_onelabel, parameters, pipeline)

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 29.4min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 42.2min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 51.1min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=5000,
                min_df=3, ngram_range=(1, 3), norm='l2',
                preprocessor=<function const at 0x7f648d584d40>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f648d584d40>, use_idf=True,
                vocabulary=None)), ('clf', LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0))]
Best score: 0.893
Applying best classifier on test data:
0.8973169923815834
              precision    recall  f1-score   support

          

In [27]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', LinearSVC()),
            ])
parameters = {
                'tfidf__max_df': [0.5, 0,75, 0.9],
                'tfidf__min_df': [3, None],
                'tfidf__max_features': [3000, 5000, 7000, None],
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                "clf__C": [0.01, 1],
                "clf__class_weight": ['balanced', None],
} 
model_svc_stam_onelabel = grid_search(x_train_stam, y_train_onelabel, x_test_stam, y_test_onelabel, parameters, pipeline)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   53.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 31.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 45.7min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 49.2min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=3, ngram_range=(1, 2), norm='l2',
                preprocessor=<function const at 0x7f027b69f830>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f027b69f830>, use_idf=True,
                vocabulary=None)), ('clf', LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0))]
Best score: 0.899
Applying best classifier on test data:
0.8999668764491553
              precision    recall  f1-score   support

          

## Accuracy for multiclass classification:
### SVM with lemmatisation: 0.893

### SVM with stamming: 0.899