In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import multiprocessing
import seaborn as sns
import pickle

from gensim.corpora.dictionary import Dictionary

# Label dimensionality reduction
from sklearn.cluster import FeatureAgglomeration

# Feature generation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Modelling
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
# Based on this post: https://stackoverflow.com/questions/42819460/what-is-the-difference-between-onevsrestclassifier-and-multioutputclassifier-in
# it appears that OneVsRestClassifier works the same as MultiOutputClassifier in our case with binary Multi-Label classification.

# from sklearn.multioutput import MultiOutputClassifier # For doing One-vs-Rest by training K number of binary classifiers where K = n_classes
# # See docs: https://scikit-learn.org/stable/modules/multiclass.html

# Model Tuning
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import jaccard_score, hamming_loss, zero_one_loss, multilabel_confusion_matrix
from sklearn.metrics import make_scorer

In [None]:
with open('gensim_dictionary.pickle', 'rb') as f:
    dictionary = pickle.load(f)
    
with open('tokenized_nostops_descriptions.pickle', 'rb') as f:
    tokenized_descriptions = pickle.load(f)

In [None]:
df = pd.read_pickle('df_cats.pickle')
df.head(1)

# Label dimensionality reduction

In [None]:
FeatureAgglomeration(n_clusters=20
                     , affinity = )

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

sns.heatmap(cosine_similarity(labels))

## Preparing features & labels

In [None]:
raw_features = df.description
labels = df.iloc[:, 3:]
print(raw_features.shape)
print(labels.shape)

In [None]:
# Need the tokenized descriptions as entire strings, not lists of tokens
def stringify(description):
    
    return " ".join(description)

pool = multiprocessing.Pool(multiprocessing.cpu_count()) # 

preprocessed_description_strings = pool.map(stringify, tokenized_descriptions)

pool.close()

In [None]:
# Example cleaned string
preprocessed_description_strings[0]

In [None]:
# BOW unigrams
count_vectorizer = CountVectorizer(lowercase=False, ngram_range =(1,1), max_df = .5, min_df = 100)
unigram_bow_corpus = count_vectorizer.fit_transform(preprocessed_description_strings)
unigram_bow_corpus

In [None]:
# BOW unigrams & bigrams
count_vectorizer = CountVectorizer(lowercase=False, ngram_range =(1,2), max_df = .5, min_df = 100)
uni_and_bigram_bow_corpus = count_vectorizer.fit_transform(preprocessed_description_strings)
uni_and_bigram_bow_corpus

In [None]:
# TFIDF unigrams
tfidf_vectorizer = TfidfVectorizer(lowercase=False, ngram_range =(1,1), max_df = .5, min_df = 100)
unigram_tfidf_corpus = tfidf_vectorizer.fit_transform(preprocessed_description_strings)
unigram_tfidf_corpus

In [None]:
# TFIDF unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(lowercase=False, ngram_range =(1,2), max_df = .5, min_df = 100)
uni_and_bigram_tfidf_corpus = tfidf_vectorizer.fit_transform(preprocessed_description_strings)
uni_and_bigram_tfidf_corpus

## Train_Test Split


<div class="alert alert-warning">NOTE: CURRENTLY JUST EXPERIMENTING WITH unigram_bow_corpus </div>


In [None]:
indices = range(df.shape[0])

train_indices, test_indices = train_test_split(indices, random_state=42, test_size=0.2, shuffle=True)

X_train = unigram_bow_corpus[train_indices]
y_train = labels.iloc[train_indices, :]

X_test = unigram_bow_corpus[test_indices]
y_test = labels.iloc[test_indices, :]

print('TRAINING DATA')
print('FEATURES ', X_train.shape)
print('LABELS ', y_train.shape)

print('TEST DATA')
print('FEATURES ',X_test.shape)
print('LABELS ', y_test.shape)

# Modelling

## RANDOM - FORESTS

In [169]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)


<div class="alert alert-success">NOTE: Much faster with n_jobs = -1 </div>


In [171]:
%%time
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

CPU times: user 1min 8s, sys: 1.12 s, total: 1min 9s
Wall time: 1min 9s


RandomForestClassifier(random_state=42)

In [172]:
%%time
rf = RandomForestClassifier(random_state=42
                            , n_jobs = -1)
rf.fit(X_train, y_train)

CPU times: user 2min 23s, sys: 4.14 s, total: 2min 27s
Wall time: 14.5 s


RandomForestClassifier(n_jobs=-1, random_state=42)

## RandomCV Search

In [202]:
rf_dict = {'n_estimators': [70, 100, 150, 200]
           , 'max_depth': [None, 2,5,10]
           , 'ccp_alpha': [0, .01, .1]
          }

In [203]:
param_grid = ParameterGrid(rf_dict);param_grid

<sklearn.model_selection._search.ParameterGrid at 0x7fc8b7f7c990>

In [204]:
rf = RandomForestClassifier(random_state=42
                            , n_jobs = -1
                            , verbose = 1
                           )

# Converts the metric function - jaccard_score - into something that RandomizedSearchCV can use to evaluate a given model
jaccard_scorer = make_scorer(jaccard_score, average='micro')

best_rf = RandomizedSearchCV(rf
                             , rf_dict
                             , n_iter=10
                             , n_jobs = -1
                             , cv=3
                             , scoring=jaccard_scorer)



## BOW - UNIGRAM CORPUS

In [205]:
best_rf.fit(unigram_bow_corpus, labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   39.9s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_jobs=-1,
                   param_distributions={'ccp_alpha': [0, 0.01, 0.1],
                                        'max_depth': [None, 2, 5, 10],
                                        'n_estimators': [70, 100, 150, 200]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [210]:
# Best Random Forest Parameters
best_rf.best_params_ # {'n_estimators': 150, 'max_depth': None, 'ccp_alpha': 0}

{'n_estimators': 150, 'max_depth': None, 'ccp_alpha': 0}

In [220]:
# Top score
best_rf.best_score_

0.1917938689619385

### The different parameter values that were tried and the corresponding results

In [216]:
# These are the average scores over the 10 iterations (average over the 3 CV folds within a given iteration)
best_rf.cv_results_['mean_test_score']

array([0.00252974, 0.        , 0.        , 0.1915055 , 0.        ,
       0.02180457, 0.        , 0.19179387, 0.02196535, 0.        ])

In [218]:
best_rf.cv_results_['param_max_depth']

masked_array(data=[2, None, 10, None, 2, 5, 10, None, 5, None],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object)

In [223]:
best_rf.cv_results_['param_n_estimators']

masked_array(data=[150, 70, 70, 200, 100, 150, 100, 150, 100, 200],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object)

In [224]:
best_rf.cv_results_['param_ccp_alpha']

masked_array(data=[0, 0.1, 0.01, 0, 0.01, 0, 0.1, 0, 0, 0.1],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object)

## BOW - UNIGRAM + BIGRAM

In [225]:
best_rf.fit(uni_and_bigram_bow_corpus, labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   10.3s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_jobs=-1,
                   param_distributions={'ccp_alpha': [0, 0.01, 0.1],
                                        'max_depth': [None, 2, 5, 10],
                                        'n_estimators': [70, 100, 150, 200]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [228]:
# Best Random Forest Parameters
best_rf.best_params_

{'n_estimators': 150, 'max_depth': 10, 'ccp_alpha': 0}

In [227]:
# Top score
best_rf.best_score_

0.06529360773901653

## TFIDF - Unigram_tfidf_corpus

In [229]:
best_rf.fit(unigram_tfidf_corpus , labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   57.0s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_jobs=-1,
                   param_distributions={'ccp_alpha': [0, 0.01, 0.1],
                                        'max_depth': [None, 2, 5, 10],
                                        'n_estimators': [70, 100, 150, 200]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [230]:
# Best Random Forest Parameters
best_rf.best_params_

{'n_estimators': 150, 'max_depth': None, 'ccp_alpha': 0}

In [231]:
# Top score
best_rf.best_score_

0.19125782241746278

## TFIDF - UNIGRAM + BIGRAM

In [232]:
best_rf.fit(uni_and_bigram_tfidf_corpus, labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.3s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_jobs=-1,
                   param_distributions={'ccp_alpha': [0, 0.01, 0.1],
                                        'max_depth': [None, 2, 5, 10],
                                        'n_estimators': [70, 100, 150, 200]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [233]:
# Best Random Forest Parameters
best_rf.best_params_

{'n_estimators': 100, 'max_depth': 10, 'ccp_alpha': 0}

In [234]:
# Top score
best_rf.best_score_

0.06368836514927444


<div class="alert alert-success">It appears that max_depth = None with ccp_alpha=0 is best, but could potentially do with more n_estimators </div>


### Checking just n_estimators

In [235]:
rf_dict = {'n_estimators': [130, 150, 170, 200, 230]
          }

In [236]:
rf = RandomForestClassifier(random_state=42
                            , n_jobs = -1
                            , verbose = 1
                           )

# Converts the metric function - jaccard_score - into something that RandomizedSearchCV can use to evaluate a given model
jaccard_scorer = make_scorer(jaccard_score, average='micro')

best_rf = RandomizedSearchCV(rf
                             , rf_dict
                             , n_iter=5
                             , n_jobs = -1
                             , cv=3
                             , scoring=jaccard_scorer)



## BOW - UNIGRAM CORPUS

In [237]:
best_rf.fit(unigram_bow_corpus, labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   50.2s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'n_estimators': [130, 150, 170, 200,
                                                         230]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [238]:
# Best Random Forest Parameters
best_rf.best_params_ 

{'n_estimators': 150}

In [239]:
# Top score
best_rf.best_score_

0.1917938689619385

## BOW - UNIGRAM + BIGRAM

In [240]:
best_rf.fit(uni_and_bigram_bow_corpus, labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done 230 out of 230 | elapsed:  1.1min finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'n_estimators': [130, 150, 170, 200,
                                                         230]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [241]:
# Best Random Forest Parameters
best_rf.best_params_

{'n_estimators': 230}

In [242]:
# Top score
best_rf.best_score_

0.19338024490508335

## TFIDF - Unigram_tfidf_corpus

In [243]:
best_rf.fit(unigram_tfidf_corpus , labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   48.2s finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'n_estimators': [130, 150, 170, 200,
                                                         230]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [244]:
# Best Random Forest Parameters
best_rf.best_params_

{'n_estimators': 150}

In [245]:
# Top score
best_rf.best_score_

0.19125782241746278

## TFIDF - UNIGRAM + BIGRAM

In [246]:
best_rf.fit(uni_and_bigram_tfidf_corpus, labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 230 out of 230 | elapsed:  1.3min finished


RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42,
                                                    verbose=1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'n_estimators': [130, 150, 170, 200,
                                                         230]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [247]:
# Best Random Forest Parameters
best_rf.best_params_

{'n_estimators': 230}

In [248]:
# Top score
best_rf.best_score_

0.19480018667613816


<div class="alert alert-success">It appears that n_estimators=150 is best for the unigram corpuses, but n_estimators=230 (or potentially larger) for the uni+bigram corpuses </div>



<div class="alert alert-success">Also, the Jaccard Score appears to be a bit better on the uni+bigram corpus, but only a little bit </div>


# Logistic Regression

### Single Estimator in OVR

In [12]:
# Single iteration
lr = LogisticRegression(max_iter=1000)
ovr = OneVsRestClassifier(lr).fit(X_train, y_train)

In [13]:
ovr.score(X_test, y_test)

0.13294460641399417

In [14]:
y_pred_lr_ovr = ovr.predict(X_test)

In [15]:
jaccard_score(y_test, y_pred_lr_ovr, average='micro')

0.35095874491574663

### OVR inside RandomizedSearchCV

In [23]:
lr_dict = {'estimator__C': [2,1,.5,.1,.01,0]} # For clarification why there is "estimator__", see this SO post: https://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier

lr = LogisticRegression( #n_jobs = -1
                        random_state = 42
                        , verbose = 0
                        , max_iter = 1000)

ovr = OneVsRestClassifier(lr) #, n_jobs = -1

# Converts the metric function - jaccard_score - into something that RandomizedSearchCV can use to evaluate a given model
jaccard_scorer = make_scorer(jaccard_score, average='micro')

best_lr_ovr = RandomizedSearchCV(ovr
                             , lr_dict
                             , n_iter=5
#                              , n_jobs = -1
                             , cv=3
                             , scoring=jaccard_scorer)

#### BOW UNIGRAM CORPUS

In [24]:
best_lr_ovr.fit(unigram_bow_corpus, labels)

Traceback (most recent call last):
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/sklearn/multiclass.py", line 245, in fit
    for i, column in enumerate(columns))
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1029, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 765, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 206, in apply_async
    result = ImmediateResult(func)
  File

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000,
                                                                              random_state=42)),
                   n_iter=5,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [27]:
best_lr_ovr.best_params_

{'estimator__C': 0.5}

In [29]:
best_lr_ovr.best_score_

0.3358777980831083

#### BOW UNI+BIGRAM CORPUS

In [30]:
best_lr_ovr.fit(uni_and_bigram_bow_corpus, labels)

Traceback (most recent call last):
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/sklearn/multiclass.py", line 245, in fit
    for i, column in enumerate(columns))
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1029, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 765, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 206, in apply_async
    result = ImmediateResult(func)
  File

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000,
                                                                              random_state=42)),
                   n_iter=5,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [31]:
best_lr_ovr.best_params_

{'estimator__C': 0.5}

In [32]:
best_lr_ovr.best_score_

0.34166566869407355

#### TFIDF UNIGRAM CORPUS

In [33]:
best_lr_ovr.fit(unigram_tfidf_corpus, labels)

Traceback (most recent call last):
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/sklearn/multiclass.py", line 245, in fit
    for i, column in enumerate(columns))
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1029, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 765, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/kristiyan/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 206, in apply_async
    result = ImmediateResult(func)
  File

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000,
                                                                              random_state=42)),
                   n_iter=5,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [34]:
best_lr_ovr.best_params_

{'estimator__C': 1}

In [35]:
best_lr_ovr.best_score_

0.2677553984119441

#### TFIDF UNI+BIGRAM CORPUS

In [36]:
best_lr_ovr.fit(uni_and_bigram_tfidf_corpus, labels)

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000,
                                                                              random_state=42)),
                   n_iter=5,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [37]:
best_lr_ovr.best_params_

{'estimator__C': 2}

In [38]:
best_lr_ovr.best_score_

0.30901622431582276


<div class="alert alert-success">Best performing appears to be BOW Uni+Bigram corpus with C=0.5 </div>


# LinearSVC

### Single Estimator in OVR

In [43]:
# Single iteration
svm = LinearSVC(random_state=42
                , max_iter = 50000
                , verbose=1)
ovr = OneVsRestClassifier(svm).fit(X_train, y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [44]:
ovr.score(X_test, y_test)

0.0793002915451895

In [45]:
y_pred_svm_ovr = ovr.predict(X_test)

In [46]:
jaccard_score(y_test, y_pred_svm_ovr, average='micro')

0.30012686118715365

### RandomizedSearchCV

In [51]:
svm_dict = {'estimator__C': [2,1,.5,.1,.01,0]} # For clarification why there is "estimator__", see this SO post: https://stackoverflow.com/questions/12632992/gridsearch-for-an-estimator-inside-a-onevsrestclassifier

svm = LinearSVC(#n_jobs = -1
                random_state = 42
                , verbose = 1
                , max_iter = 50000)

ovr = OneVsRestClassifier(svm, n_jobs = -1) #

# Converts the metric function - jaccard_score - into something that RandomizedSearchCV can use to evaluate a given model
jaccard_scorer = make_scorer(jaccard_score, average='micro')

best_svm_ovr = RandomizedSearchCV(ovr
                             , svm_dict
                             , n_iter=5
                             , n_jobs = -1
                             , cv=3
                             , scoring=jaccard_scorer)

### BOW UNIGRAM CORPUS

In [52]:
best_svm_ovr.fit(unigram_bow_corpus, labels)

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LinearSVC(max_iter=50000,
                                                                     random_state=42,
                                                                     verbose=1),
                                                 n_jobs=-1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [53]:
best_svm_ovr.best_params_

{'estimator__C': 0.01}

In [54]:
best_svm_ovr.best_score_

0.33347006693731074

### BOW UNI+BIGRAM CORPUS

In [55]:
best_svm_ovr.fit(uni_and_bigram_bow_corpus, labels)

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LinearSVC(max_iter=50000,
                                                                     random_state=42,
                                                                     verbose=1),
                                                 n_jobs=-1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [56]:
best_svm_ovr.best_params_

{'estimator__C': 0.1}

In [57]:
best_svm_ovr.best_score_

0.3285471551655335

### TFIDF UNIGRAM CORPUS

In [58]:
best_svm_ovr.fit(unigram_tfidf_corpus, labels)

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LinearSVC(max_iter=50000,
                                                                     random_state=42,
                                                                     verbose=1),
                                                 n_jobs=-1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [59]:
best_svm_ovr.best_params_

{'estimator__C': 1}

In [60]:
best_svm_ovr.best_score_

0.35970803324115347

### TFIDF UNI+BIGRAM CORPUS

In [61]:
best_svm_ovr.fit(uni_and_bigram_tfidf_corpus, labels)

RandomizedSearchCV(cv=3,
                   estimator=OneVsRestClassifier(estimator=LinearSVC(max_iter=50000,
                                                                     random_state=42,
                                                                     verbose=1),
                                                 n_jobs=-1),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'estimator__C': [2, 1, 0.5, 0.1, 0.01,
                                                         0]},
                   scoring=make_scorer(jaccard_score, average=micro))

In [62]:
best_svm_ovr.best_params_

{'estimator__C': 1}

In [63]:
best_svm_ovr.best_score_

0.36267199408424333


<div class="alert alert-success">Best performing appears to be TFIDF Uni+Bigram corpus with C=1 </div>



<div class="alert alert-success">Also nice that LinearSVC appears to train fairly fast</div>


# SVC - Radial Basis

### Single Estimator in OVR

In [65]:
# Single iteration
svm = SVC(random_state=42
          , max_iter = -1 # -1 for no limit.
          , verbose=1
          , kernel = 'rbf'
           )
ovr = OneVsRestClassifier(svm).fit(X_train, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [66]:
ovr.score(X_test, y_test)

0.14110787172011663

In [67]:
y_pred_svm_ovr = ovr.predict(X_test)

In [68]:
jaccard_score(y_test, y_pred_svm_ovr, average='micro')

0.2650083963056255


<div class="alert alert-warning">Trains quite slow</div>


## Predictions & Evaluation

In [178]:
y_pred_rf = rf.predict(X_test)
y_pred_rf.shape

(3430, 83)

In [182]:
print("EXACT MATCH RATION: ", rf.score(X_test, y_test))
print("JACCARD SCORE average=micro: ", jaccard_score(y_test, y_pred_rf, average='micro'))
print("JACCARD SCORE average=weighted: ", jaccard_score(y_test, y_pred_rf, average='weighted'))
print("JACCARD SCORE average=samples: ", jaccard_score(y_test, y_pred_rf, average='samples'))
print("HAMMING LOSS : ", hamming_loss(y_test, y_pred_rf))
print("ZERO-ONE LOSS : ", zero_one_loss(y_test, y_pred_rf))

EXACT MATCH RATION:  0.11924198250728862
JACCARD SCORE average=micro:  0.21248283873693105
JACCARD SCORE average=weighted:  0.20345546360720523
JACCARD SCORE average=samples:  0.2668857988464402
HAMMING LOSS :  0.026193403351013382
ZERO-ONE LOSS :  0.8807580174927114


In [None]:
rf

In [None]:
plt.figure(figsize=(20,8))
plt.plot(steps, rewards_lst[0], "-", label="First Trial")
plt.plot(steps, np.mean(rewards_lst[:5],axis=0), "-", label="Avg. First 5 Trials")
plt.plot(steps, np.mean(rewards_lst,axis=0), "-", label="Avg. First 10 Trials")
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.ylabel("Average Reward", fontsize = 20)
plt.xlabel("Number of Steps")
plt.title("Average Reward over 100,000 steps for 1, 5, and 10 trials", fontsize = 30)
plt.legend(loc="right", prop={'size': 16})

# Appendix

### Minimum example of using a classifier for Multi-Label classification in a One-vs-Rest STrategy
"This strategy consists of fitting one classifier per target. This is a simple strategy for extending classifiers that do not natively support multi-target classification"

https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html#sklearn.multioutput.MultiOutputClassifier

In [64]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.datasets import make_multilabel_classification
X, y = make_multilabel_classification(n_samples=100 # Number of rows in X & y
                                      , n_features=20 # Number of features in X
                                      , n_classes=3 # Number of columns in y
                                      , n_labels=2 # Supposed to be the number of unique values for the classes in y, but doesn't change for some reason
                                      , random_state=0)
print("X.shape=", X.shape)
print('X \n', X)
print("y.shape=", y.shape)
print('y \n', y[:5])

X.shape= (100, 20)
X 
 [[3. 6. 1. ... 1. 5. 0.]
 [3. 5. 5. ... 1. 1. 1.]
 [3. 3. 5. ... 0. 2. 1.]
 ...
 [3. 7. 3. ... 2. 4. 2.]
 [7. 2. 1. ... 1. 1. 1.]
 [3. 5. 3. ... 1. 2. 4.]]
y.shape= (100, 3)
y 
 [[0 1 0]
 [0 1 0]
 [1 1 1]
 [1 1 1]
 [0 1 0]]


In [65]:
clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X, y)

In [66]:
# True data for last two observations
y[-2:]

array([[1, 1, 1],
       [1, 0, 1]])

In [67]:
# Prediction for the last two observations in X
clf.predict(X[-2:])

array([[1, 1, 0],
       [1, 1, 1]])

In [68]:
# Prediction probabilities
# We have n_classes (in this case 3) n_samples x n_labels matrices(where n_labels appears to be always 2)
# In other words, the first matrix gives us the probabilities for 0 or 1 for the first class
clf.predict_proba(X[-2:])

[array([[0., 1.],
        [0., 1.]]),
 array([[0. , 1. ],
        [0.2, 0.8]]),
 array([[0.6, 0.4],
        [0.4, 0.6]])]

Note above how all the numbers in the right column are larger.  
This corresponds to the fact taht in the predictions, all the predictions are 1
EXCEPT the one for the first observation in the 3rd column.  
This is correct, because if we look at the _third_ matrix above, we will see that the first row for the first observation has a higher probability for the label 0 (prob = .6) as opposed to the prob for the label 1 (prob = .4)

In [69]:
# clf.predict(X[-2:]).shape
y[-2:].shape

(2, 3)

### Using OneVsRestClassifier - Appears to be the same as MultiOutputClassifier

In [94]:
from sklearn.multiclass import OneVsRestClassifier
# Based on this post: https://stackoverflow.com/questions/42819460/what-is-the-difference-between-onevsrestclassifier-and-multioutputclassifier-in
# it appears that OneVsRestClassifier works the same as MultiOutputClassifier in our case with binary Multi-Label classification.

In [89]:
ovr = OneVsRestClassifier(KNeighborsClassifier()).fit(X, y)

In [91]:
ovr.predict(X[-3:])

array([[0, 1, 0],
       [1, 1, 0],
       [1, 1, 1]])

In [99]:
ovr.predict_proba(X[-3:]) # This gives the same values as the MultiOutPut Classifier but a bit clearer
# Each row refers to a specific observation and each column is just the probability of a positive response for that observation for that class.

array([[0.2, 1. , 0.4],
       [1. , 1. , 0.4],
       [1. , 0.8, 0.6]])

In [92]:
ovr.score(X[-3:], y[-3:])

0.0

In [93]:
ovr.score(X, y)

0.68

# MODEL EVALUATION

### Measuring success with Exact Match Ratio

In [72]:
# Performance on entire dataset - this is the EXACT MATCH RATIO!
clf.score(X, y)

0.68

In [75]:
# From the docstring:
# In multilabel classification, this function computes subset accuracy:
# the set of labels predicted for a sample must *exactly* match the
# corresponding set of labels in y_true.

# In other words, this is the EXACT MATCH RATIO!

accuracy_score(y[-2:], clf.predict(X[-2:]))

0.0

In [76]:
# It produces the same results as:
clf.score(X[-2:], y[-2:])

# From the source code, this is equivalent to:
np.mean(np.all(y[-2:] == clf.predict(X[-2:]), axis=1))

0.0

## Alternative measures of success other than EXACT MATCH RATIO

### Jaccard Score - Measures positive matches
For binary classification this formula is clearest and simplest: https://en.wikipedia.org/wiki/Jaccard_index#Similarity_of_asymmetric_binary_attributes

In [82]:
print("True labels \n", y[-3:])
print("Predicted labels \n", clf.predict(X[-3:]))

True labels 
 [[0 1 1]
 [1 1 1]
 [1 0 1]]
Predicted labels 
 [[0 1 0]
 [1 1 0]
 [1 1 1]]


In [101]:
# If None, the scores for each class are returned.
# i.e. this does column-wise jaccard score
jaccard_score(y[-3:], clf.predict(X[-3:]), average=None)

array([1.        , 0.66666667, 0.33333333])

In [102]:
# Calculate metrics globally by counting the total true positives, false negatives and false positives.
jaccard_score(y[-3:], clf.predict(X[-3:]), average='micro')

0.625

In [85]:
# Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
jaccard_score(y[-3:], clf.predict(X[-3:]), average='macro')

0.6666666666666666

In [87]:
# Calculate metrics for each label, and find their average, weighted by support 
# (the number of true instances for each label). This alters ‘macro’ to account for label imbalance.
jaccard_score(y[-3:], clf.predict(X[-3:]), average='weighted')

0.619047619047619

In [86]:
# Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).
jaccard_score(y[-3:], clf.predict(X[-3:]), average='samples')

0.611111111111111

### Hamming Loss - Measures proportion of labels correctly classified
The Hamming loss is the fraction of labels that are incorrectly predicted.  

In multilabel classification, the Hamming loss is different from the subset zero-one loss. The zero-one loss considers the entire set of labels for a given sample incorrect if it does not entirely match the true set of labels. Hamming loss is more forgiving in that it penalizes only the individual labels.

The Hamming loss is upperbounded by the subset zero-one loss, when normalize parameter is set to True. It is always between 0 and 1, lower being better.

In [106]:
ovr.predict(X[-2:])

array([[1, 1, 0],
       [1, 1, 1]])

In [105]:
y[-2:]

array([[1, 1, 1],
       [1, 0, 1]])

In [107]:
from sklearn.metrics import hamming_loss

hamming_loss(y[-2:], ovr.predict(X[-2:]))

0.3333333333333333

In [108]:
hamming_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))

0.25

### Zero-One Loss - Number of things incorrectly classified

In [109]:
from sklearn.metrics import zero_one_loss
y_pred = [1, 2, 3, 4]
y_true = [2, 2, 3, 4]
zero_one_loss(y_true, y_pred)

0.25

In [110]:
zero_one_loss(y_true, y_pred, normalize=False) # A LOT MORE UNFORGIVING

1

In [112]:
zero_one_loss(np.array([[0, 1], [1, 1], [1, 0]]), np.ones((3, 2)))

0.6666666666666667

### Multilabel Confusion Matrix

In [113]:
from sklearn.metrics import multilabel_confusion_matrix

The outpush below is the same type as for MultiOutputClassification.  
i.e. each matrix is a 2x2 confusion matrix for the corresponding label

In [115]:
y_true = np.array([[1, 0, 1, 1],
                   [0, 1, 0, 1]])
y_pred = np.array([[1, 0, 0, 1],
                   [0, 1, 1, 0]])
multilabel_confusion_matrix(y_true, y_pred)


array([[[1, 0],
        [0, 1]],

       [[1, 0],
        [0, 1]],

       [[0, 1],
        [1, 0]],

       [[0, 0],
        [1, 1]]])

In [116]:
y_true = np.array([[1, 0, 1, 1],
                   [0, 1, 0, 1]])
y_pred = np.array([[1, 0, 0, 1],
                   [0, 1, 1, 0]])
multilabel_confusion_matrix(y_true, y_pred, samplewise=True) # This does it ROW BY ROW instead of for each label


array([[[1, 0],
        [1, 2]],

       [[1, 1],
        [1, 1]]])