In [0]:
!pip install scikit-multilearn


In [0]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.problem_transform import BinaryRelevance

from imblearn.over_sampling import SMOTE

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.metrics import hamming_loss
from sklearn.metrics import f1_score

# Dataset Preprocessing:

In [0]:
# Read the TREC training table. The first CSV column is consumed as a
# throw-away index, then column '0.1' becomes the real index (named postID),
# and every row containing a null value is discarded.
trec_data = (
    pd.read_csv('trainingData_U_Sent2vec_V2.csv', index_col=0)
    .set_index('0.1')
    .rename_axis('postID')
    .dropna()
)

# Binarize the multi-label targets stored in column '1'.
# NOTE(review): MultiLabelBinarizer iterates over every cell; if the cells are
# plain strings rather than lists of labels, each character is treated as a
# label -- confirm the format of this column.
multilabel_binarizer = MultiLabelBinarizer()
Y = multilabel_binarizer.fit(trec_data['1']).transform(trec_data['1'])

# Whatever remains after removing the label column is the feature matrix.
trec_data = trec_data.drop(columns=['1'])
X = trec_data

## Balance the dataset using RandomOverSampler:

* Example: https://www.kaggle.com/roccoli/multi-label-classification-with-sklearn/comments

In [0]:

lp = LabelPowerset()
ros = RandomOverSampler(random_state=42)

# Multi-label (ML) -> multi-class (MC) transformation: every distinct label
# combination becomes one class id. The sampler needs this 1-D target; it
# cannot balance the multi-label indicator matrix Y directly.
yt = lp.transform(Y)

# Split BEFORE oversampling so that duplicated rows created by the sampler
# can never appear in both the training and the test portion.
X_train, X_test, y_train, y_test = train_test_split(X, yt, test_size=0.2, random_state=24)

# Balance the training portion only (fit_resample replaces the removed
# fit_sample API). The test split keeps its original class distribution.
X_resampled, Y_resampled = ros.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, Y_resampled

# NOTE: y_train / y_test hold label-powerset class ids (1-D). Use
# lp.inverse_transform(...) to map them back to a multi-label matrix when
# per-label evaluation is required.


## Define Special Evaluation Metric for Multi-Label Classification:

In [0]:

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Hamming score (label-based accuracy) for multi-label predictions.

    For each sample the score is |true AND pred| / |true OR pred| over the
    sets of active (non-zero) label positions; a sample with no active label
    on either side scores 1. The result is the mean over all samples.

    `normalize` and `sample_weight` are accepted but not used.

    Reference: http://stackoverflow.com/q/32239577/395857
    '''
    def _sample_score(true_row, pred_row):
        # Positions of the non-zero entries are the active labels.
        true_labels = set(np.where(true_row)[0])
        pred_labels = set(np.where(pred_row)[0])
        if not true_labels and not pred_labels:
            return 1
        shared = true_labels & pred_labels
        combined = true_labels | pred_labels
        return len(shared) / float(len(combined))

    n_samples = y_true.shape[0]
    per_sample = [_sample_score(y_true[row], y_pred[row]) for row in range(n_samples)]
    return np.mean(per_sample)

def print_score(y_pred, clf):
    '''
    Print the Hamming loss and Hamming score of `y_pred`.

    y_pred : predictions, compared against the notebook-level `y_test`
             (this function reads that global, so the split must already
             have been created).
    clf    : object whose class name is printed as the report label.
    '''
    print("Clf: ", clf.__class__.__name__)
    # Both metrics expect the ground truth first and the predictions second.
    print("Hamming loss: {}".format(hamming_loss(y_test, y_pred)))
    print("Hamming score: {}".format(hamming_score(y_test, y_pred)))
    print("---")

# Ensemble Model:

## Baseline Classifiers:

In [0]:
# Candidate base learners.
nb_clf = GaussianNB()

# SVC with probability estimates enabled.
clf_svm = SVC(
    gamma='scale',
    decision_function_shape='ovo',
    probability=True,
)

# Small random forest with a fixed seed and class weighting.
clf_rf = RandomForestClassifier(
    n_estimators=25,
    random_state=24,
    class_weight='balanced',
)


## Get Meta-Features (i.e. Predictions of Baseline Classifiers)

In [0]:
# NOTE(review): the meta-features below are probabilities predicted on the same
# rows the base learners were fitted on; out-of-fold predictions would give the
# meta-learner a less optimistic view -- consider this before trusting scores.

#--- train the GaussianNB classifier ---
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict_proba(X_train)  # training-set probabilities (meta-features)

y_pred = nb_clf.predict(X_test)
# Pass the classifier itself so the report is labelled with its class name.
print_score(y_pred, nb_clf)

#--- train the random forest classifier ---
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict_proba(X_train)  # training-set probabilities (meta-features)

y_pred = clf_rf.predict(X_test)
print_score(y_pred, clf_rf)
# The default average='binary' raises for non-binary targets; use micro
# averaging, consistent with the ensemble evaluation.
print('F1-score using randomForest: ', f1_score(y_test, y_pred, average='micro'))
    

# meta_features=[]

# for classifier in [nb_clf, clf_rf]:
#     clf = OneVsRestClassifier(classifier)
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict_proba(X_train)
    
#     meta_features.append(y_pred)
#     #print_score(y_pred, classifier)

## Meta-Learner Classifier:

In [0]:
# Meta-features for training: base-learner probabilities stacked column-wise,
# GaussianNB columns first, then random-forest columns.
meta_features = np.concatenate((y_pred_nb, y_pred_rf), axis=1)

meta_clf = LinearSVC()
meta_clf.fit(meta_features, y_train)

# Test-time meta-features MUST use the same column order as in training
# (GaussianNB, then random forest); otherwise the meta-learner reads each
# column as the other model's output.
meta_x_test = np.concatenate(
    (nb_clf.predict_proba(X_test), clf_rf.predict_proba(X_test)),
    axis=1,
)

y_pred_final = meta_clf.predict(meta_x_test)
print('F1-score using ensemble model: ', f1_score(y_test, y_pred_final, average='micro'))