Creates 80 classifiers on the four datasets. Run *processing.ipynb* first.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
import random
import warnings
warnings.filterwarnings('ignore')
np.random.seed(5)

In [2]:
dataset = 'covtype' # set this to be each of ['default', 'adult', 'acoustic', 'covtype']
dat = pd.read_csv('./data/' + dataset + '.csv')

dat['label'] = dat['label'].astype(int)
class_list = np.sort(dat.label.unique()).tolist()

classes = len(class_list)

msk = np.random.rand(len(dat)) < 0.6
dat_train = dat[msk]
dat_test = dat[~msk]

dat_train = dat_train.reset_index(drop=True)
dat_test = dat_test.reset_index(drop=True)

X_train = dat_train.iloc[:, 1:-1].values
Y_train = dat_train.label.values

X_test = dat_test.iloc[:, 1:-1].values
Y_test = dat_test.label.values

print("Overall data shape is", dat.shape)
print("Train data shape is", dat_train.shape)
print("Test data shape is", dat_test.shape)
# print("groups are", group_list)
print("classes are", class_list)

Overall data shape is (321202, 55)
Train data shape is (192250, 55)
Test data shape is (128952, 55)
classes are [0, 1, 2, 3, 4]


In [3]:
### 20 models for logistic regression ########

eps = 0.2
class_weight_list = []

for i in range(20):
    w = len(dat_train) / (classes * np.bincount(dat_train.label.values))
    sign = 2*int(np.random.uniform(classes) > 0.5) - 1
    w += np.random.uniform(0,1, classes)*eps*sign
    cls_wgt = dict(zip(class_list, w))
    class_weight_list.append(cls_wgt)

conf_rate_lr = np.zeros((20, classes ** 2))
model_num = 0
for cls_wgt in class_weight_list:
    
    print("Running for model", model_num)

    clf = LogisticRegression(random_state=model_num, class_weight = cls_wgt, \
                             multi_class = 'multinomial').fit(X_train, Y_train)

    Y_pred = clf.predict(X_test)

    cmat = confusion_matrix(Y_test, Y_pred)
    cmat = cmat / np.sum(cmat)
    row_sums = cmat.sum(axis=1)
    crate = cmat / row_sums[:, np.newaxis]
    cvec = crate.ravel()

    conf_rate_lr[model_num, :] = np.array(cvec)
    model_num += 1

Running for model 0
Running for model 1
Running for model 2
Running for model 3
Running for model 4
Running for model 5
Running for model 6
Running for model 7
Running for model 8
Running for model 9
Running for model 10
Running for model 11
Running for model 12
Running for model 13
Running for model 14
Running for model 15
Running for model 16
Running for model 17
Running for model 18
Running for model 19


In [4]:
### 20 models for MLP ########

node_list = [(5, ), (10, ), (25, ), (50, ),\
            (5, 5), (5, 10), (5, 25), (5, 50),\
            (10, 5), (10, 10), (10, 25), (10, 50),\
            (25, 5), (25, 10), (25, 25), (25, 50),\
            (50, 5), (50, 10), (50, 25), (50, 50)]

conf_rate_mlp = np.zeros((20, classes ** 2))
model_num = 0
for nl in node_list:
    
    print("Running for model", model_num)
    clf = MLPClassifier(random_state=model_num, hidden_layer_sizes = nl, max_iter = 20).fit(X_train, Y_train)

    Y_pred = clf.predict(X_test)

    cmat = confusion_matrix(Y_test, Y_pred)
    cmat = cmat / np.sum(cmat)
    row_sums = cmat.sum(axis=1)
    crate = cmat / row_sums[:, np.newaxis]
    cvec = crate.ravel()

    conf_rate_mlp[model_num, :] = np.array(cvec)
    model_num += 1

Running for model 0
Running for model 1
Running for model 2
Running for model 3
Running for model 4
Running for model 5
Running for model 6
Running for model 7
Running for model 8
Running for model 9
Running for model 10
Running for model 11
Running for model 12
Running for model 13
Running for model 14
Running for model 15
Running for model 16
Running for model 17
Running for model 18
Running for model 19


In [5]:
### 20 models for svm ########

iter_list = np.arange(25, 501, 25).tolist()

conf_rate_svm = np.zeros((20, classes ** 2))
model_num = 0
for m in iter_list:
    print("running for model", model_num)

    clf = make_pipeline(SVC(random_state = model_num, max_iter = m)).fit(X_train, Y_train)

    Y_pred = clf.predict(X_test)

    cmat = confusion_matrix(Y_test, Y_pred)
    cmat = cmat / np.sum(cmat)
    row_sums = cmat.sum(axis=1)
    crate = cmat / row_sums[:, np.newaxis]
    cvec = crate.ravel()

    conf_rate_svm[model_num, :] = np.array(cvec)
    model_num += 1

running for model 0
running for model 1
running for model 2
running for model 3
running for model 4
running for model 5
running for model 6
running for model 7
running for model 8
running for model 9
running for model 10
running for model 11
running for model 12
running for model 13
running for model 14
running for model 15
running for model 16
running for model 17
running for model 18
running for model 19


In [6]:
### 20 models for lgb ########


nl_list = [10, 20, 30, 40]
eps = 0.5
class_weight_list = []

for i in range(5):
    w = len(dat_train) / (classes * np.bincount(dat_train.label.values))
    sign = 2*int(np.random.uniform(classes) > 0.5) - 1
    w += np.random.uniform(0,1, classes)*eps*sign
    cls_wgt = dict(zip(class_list, w))
    class_weight_list.append(cls_wgt)

conf_rate_lgb = np.zeros((20, classes ** 2))
model_num = 0
for nl in nl_list:
    for cls_wgt in class_weight_list:
        print("running for model", model_num)

        clf = lgb.LGBMClassifier(class_weight=cls_wgt, num_leaves=nl).fit(X_train, Y_train)

        Y_pred = clf.predict(X_test)

        cmat = confusion_matrix(Y_test, Y_pred)
        cmat = cmat / np.sum(cmat)
        row_sums = cmat.sum(axis=1)
        crate = cmat / row_sums[:, np.newaxis]
        cvec = crate.ravel()

        conf_rate_lgb[model_num, :] = np.array(cvec)
        model_num += 1

running for model 0
running for model 1
running for model 2
running for model 3
running for model 4
running for model 5
running for model 6
running for model 7
running for model 8
running for model 9
running for model 10
running for model 11
running for model 12
running for model 13
running for model 14
running for model 15
running for model 16
running for model 17
running for model 18
running for model 19


In [7]:
conf_rate_final = np.concatenate((conf_rate_lr, conf_rate_mlp, conf_rate_svm, conf_rate_lgb))

column_names = []
for i in range(classes):
    for j in range(classes):
        column_names.append('C_' + str(i) + str(j))
        
conf_save_dat = pd.DataFrame(data = conf_rate_final, columns=column_names)

In [8]:
conf_save_dat

Unnamed: 0,C_00,C_01,C_02,C_03,C_04,C_10,C_11,C_12,C_13,C_14,...,C_30,C_31,C_32,C_33,C_34,C_40,C_41,C_42,C_43,C_44
0,0.596506,0.297051,0.000967,0.013084,0.092392,0.336900,0.575206,0.029105,0.052887,0.005902,...,0.032361,0.020897,0.262807,0.683936,0.0,0.079305,0.087928,0.003765,0.0,0.829002
1,0.465497,0.420327,0.000896,0.014169,0.099111,0.205733,0.706986,0.027634,0.053027,0.006620,...,0.016834,0.037585,0.268756,0.676825,0.0,0.053558,0.105781,0.003765,0.0,0.836896
2,0.533230,0.357451,0.000943,0.013650,0.094726,0.271579,0.640667,0.028755,0.053115,0.005884,...,0.029168,0.024960,0.280511,0.665361,0.0,0.066675,0.096915,0.003765,0.0,0.832645
3,0.397223,0.492303,0.000919,0.011906,0.097650,0.154405,0.764986,0.028422,0.046355,0.005832,...,0.011609,0.053258,0.302569,0.632564,0.0,0.046029,0.120476,0.003765,0.0,0.829730
4,0.549615,0.332720,0.001132,0.016126,0.100408,0.292156,0.608760,0.033834,0.057493,0.007758,...,0.029023,0.019736,0.307793,0.643448,0.0,0.062424,0.094365,0.003765,0.0,0.839446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0.784615,0.167787,0.000943,0.005446,0.041210,0.280686,0.652768,0.028335,0.031539,0.006672,...,0.002467,0.008272,0.086925,0.902336,0.0,0.028297,0.012509,0.000121,0.0,0.959072
76,0.817550,0.134828,0.000967,0.005635,0.041021,0.338161,0.591493,0.030874,0.031732,0.007740,...,0.002757,0.007691,0.099260,0.890292,0.0,0.030240,0.007894,0.000121,0.0,0.961744
77,0.566094,0.378386,0.000967,0.005045,0.049508,0.096913,0.845577,0.025130,0.026391,0.005989,...,0.001306,0.014076,0.096938,0.887680,0.0,0.017124,0.021861,0.000121,0.0,0.960894
78,0.528845,0.414221,0.000754,0.005800,0.050381,0.076668,0.867415,0.023484,0.026741,0.005691,...,0.000580,0.017849,0.089972,0.891598,0.0,0.015667,0.021618,0.000121,0.0,0.962594


In [9]:
if(dataset == 'default'):
    conf_save_dat.to_csv('./data/confusions_C2_default.csv', index = False)
elif(dataset == 'adult'):
    conf_save_dat.to_csv('./data/confusions_C2_adult.csv', index = False)
elif(dataset == 'acoustic'):
    conf_save_dat.to_csv('./data/confusions_C3_acoustic.csv', index = False)
else:
    conf_save_dat.to_csv('./data/confusions_C5_covtype.csv', index = False)