In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.decomposition import PCA

In [792]:
from sklearn.metrics import confusion_matrix

In [2]:
type_precision = "float32"

In [3]:
tabula_muris_path = "../datasets/tabula_muris_whole/"
all_counts_path = "brain_mouse_matrix_all_counts.csv"
all_data_path = "brain_mouse_matrix_all_data.csv"
all_scaled_path = "brain_mouse_matrix_all_scale_data.csv"

In [4]:
all_counts = pd.read_csv(tabula_muris_path + all_counts_path, sep=" ")

In [5]:
#all_data =  pd.read_csv(tabula_muris_path + all_data_path, sep=" ")

In [6]:
#all_scaled =  pd.read_csv(tabula_muris_path + all_scaled_path, sep=" ")

In [7]:
#all_counts.describe()

In [18]:
all_counts.head()

Unnamed: 0,CELL_ID,0610005C13Rik,0610007C21Rik,0610007L01Rik,0610007N19Rik,0610007P08Rik,0610007P14Rik,0610007P22Rik,0610008F07Rik,0610009B14Rik,...,Zxdc,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3,a,l7Rn6,zsGreen-transgene,annotation
0,A1.B003290.3_38_F.1.1,0,125,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,54,0,1
1,A1.B003728.3_56_F.1.1,0,0,0,0,0,324,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,A1.MAA000560.3_10_M.1.1,0,348,0,0,0,5,0,0,0,...,0,0,0,0,195,0,0,113,0,6
3,A1.MAA000564.3_10_M.1.1,0,41,36,0,0,24,0,0,0,...,0,0,0,125,0,1,0,0,0,4
4,A1.MAA000923.3_9_M.1.1,0,53,0,0,0,0,0,0,0,...,0,0,81,0,0,0,0,0,0,1


In [19]:
#all_counts.corr()['annotation'][:]

# Preprocessing

In [104]:
def log_normalize_data(data, scale=1000000.0):
    data_row_sums = np.sum(data, axis=1).reshape(-1, 1)
    return np.log(1 + scale * data / data_row_sums)

## Data cleaning
- one hot encoding of y
- Log normalize all data
- Split test and train data (stratified by y)
- Scale data by normal distribution

In [105]:
X = all_counts.iloc[:,1:-1].to_numpy(dtype=type_precision)
y_num = all_counts.iloc[:, -1].to_numpy(dtype=type_precision)

In [106]:
labelBin = preprocessing.LabelBinarizer()
labelBin.fit(y_num)
y = labelBin.transform(y_num)

In [107]:
X = log_normalize_data(X)

In [231]:
X_train_val, X_test, y_train_val, y_test = model_selection.train_test_split(X, y, test_size=0.33, stratify=y, random_state=24)

In [232]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_val)
X_train_val = scaler.transform(X_train_val)
X_test = scaler.transform(X_test)

In [233]:
print(X_train_val.shape)

(2278, 23341)


In [234]:
print(X_test.shape)

(1123, 23341)


## Dimension reduction #1 - PCA
- TODO: pick different number of components

In [235]:
num_components = 12

In [236]:
pca = PCA(n_components=num_components)

In [237]:
pca.fit(X_train_val)
X_train_val_prepared_PCA = pca.transform(X_train_val)
X_test_prepared_PCA = pca.transform(X_test)

In [238]:
print("Explained variance: {}".format(np.sum(pca.explained_variance_ratio_)))

Explained variance: 0.1484992802143097


In [239]:
X_train_val_prepared_PCA.shape, X_test_prepared_PCA.shape

((2278, 12), (1123, 12))

In [240]:
(np.var(X_train_val, axis=0))

array([0.99999326, 1.0000049 , 1.0000058 , ..., 1.0000243 , 1.0000169 ,
       0.        ], dtype=float32)

## Dimension reduction #2 - Autoencoder
- Problem: Autoencoder loss not decreasing at all
- Solution: increase number of layers and neurons per layer, train it on google colab

In [241]:
from keras.layers import Input, Dense
from keras.models import Model

In [242]:
base_feature_num = X_train_val.shape[1]
encoding_dim = 12

In [243]:
base_feature_num

23341

In [244]:
X_train_val_normalized = (X_train_val-np.min(X_train_val))/(np.max(X_train_val)-np.min(X_train_val))

In [245]:
X_train_val_normalized.shape

(2278, 23341)

In [246]:
input_dim = Input(shape=(base_feature_num,))

# Encoder Layers
encoded1 = Dense(1000, activation = 'relu')(input_dim)
encoded2 = Dense(300, activation = 'relu')(encoded1)
encoded3 = Dense(encoding_dim, activation = 'relu')(encoded2)

# Decoder Layers    
decoded1 = Dense(300, activation = 'relu')(encoded3)
decoded2 = Dense(1000, activation = 'relu')(decoded1)
decoded3 = Dense(base_feature_num, activation = 'sigmoid')(decoded2)

In [247]:
autoencoder = Model(inputs=input_dim, outputs=decoded3)

In [248]:
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [249]:
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 23341)             0         
_________________________________________________________________
dense_31 (Dense)             (None, 1000)              23342000  
_________________________________________________________________
dense_32 (Dense)             (None, 300)               300300    
_________________________________________________________________
dense_33 (Dense)             (None, 12)                3612      
_________________________________________________________________
dense_34 (Dense)             (None, 300)               3900      
_________________________________________________________________
dense_35 (Dense)             (None, 1000)              301000    
_________________________________________________________________
dense_36 (Dense)             (None, 23341)             23364341  
Total para

In [250]:
# autoencoder.fit(X_train_val_normalized,
#                 X_train_val_normalized,
#                 nb_epoch=1,
#                 batch_size=32,
#                 validation_split=0.2)

# Models 
- models: gradient boosting, NN, kNN(k=30 za pocetak), SVM, random forest 
- clean data
- pick dimension reduction method
- pick model
- split train and validation set

## utility functions

In [251]:
def train_model(model, data):
    X_train, y_train = data
    model.fit(X_train, y_train)
    return model

In [252]:
def evaluate_model(model, data, name):
    X_train, X_val, y_train, y_val = data
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    print('Model: {}'.format(name))
    print('Train set report: \n{}'.format(metrics.classification_report(y_train, y_train_pred)))
    print('Validation set report: \n{}'.format(metrics.classification_report(y_val, y_val_pred)))

In [253]:
def evaluate_nn(model, data, name):
    X_train, X_val, y_train, y_val = data
    y_train_pred = np.argmax(model.predict(X_train), axis=1) + 1
    y_val_pred = np.argmax(model.predict(X_val), axis=1) + 1
    print('Model: {}'.format(name))
    print('Train set report: \n{}'.format(metrics.classification_report(y_train, y_train_pred)))
    print('Validation set report: \n{}'.format(metrics.classification_report(y_val, y_val_pred)))

In [254]:
def cross_validation_evaluation(model, X, y, number_of_folds, error_function):
    y_predicted = np.empty(y.size)
    
    ix = np.arange(0, X.shape[0]) % number_of_folds
    
    for i in range(number_of_folds):
        X_train = X[ix != i, :]
        y_train = y[ix != i]
        
        y_test = y[ix == i]
        X_test = X[ix == i, :]
        
        model.fit(X_train, X_train)
        y_predicted[ix == i] = model.predict(X_test)
        
    return error_function(y, y_predicted)

In [255]:
def cross_validation_selection(X, y, number_of_folds, error_function, configure_model, configs):
    errors = []
    
    for c in configs:
        model = configure_model(c)
        error = cross_validation_evaluation(model, X, y, number_of_folds, error_function)
        errors.append(error)
        
    errors = np.array(errors)
    c_best = configs[np.argmin(errors)]
    
    model = configure_model(c_best)
    model.fit(X, y)
    
    return model

In [547]:
def get_f1_score(model, X_test, y_test_num):
    y_test_pred = model.predict(X_test)
    f1_score = metrics.f1_score(y_test_num, y_test_pred, average='weighted')
    return f1_score

In [561]:
def get_f1_score_nn(model, X_test, y_test_num):
    y_test_pred = np.argmax(model.predict(X_test), axis=1) + 1
    f1_score = metrics.f1_score(y_test_num, y_test_pred, average='weighted')
    return f1_score

In [774]:
def get_acc_score(model, X_test, y_test_num):
    y_test_pred = model.predict(X_test)
    acc_score = metrics.accuracy_score(y_test_num, y_test_pred)
    return acc_score

In [775]:
def get_acc_score_nn(model, X_test, y_test_num):
    y_test_pred = np.argmax(model.predict(X_test), axis=1) + 1
    acc_score = metrics.accuracy_score(y_test_num, y_test_pred)
    return acc_score

In [799]:
def get_acc_by_class(model, X_test, y_test_num):
    y_test_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test_num, y_test_pred)
    return matrix.diagonal()/matrix.sum(axis=1)

In [800]:
def get_acc_by_class_nn(model, X_test, y_test_num):
    y_test_pred = np.argmax(model.predict(X_test), axis=1) + 1
    matrix = confusion_matrix(y_test_num, y_test_pred)
    return matrix.diagonal()/matrix.sum(axis=1)

In [803]:
def print_acc_by_class(arr):
    for i in arr:
        print(round(i*100, 2))

## import libraries

In [256]:
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn import svm

In [257]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras import losses, optimizers

In [258]:
from sklearn import metrics

## get data

In [259]:

#X_train_val, y_train_val

In [260]:
X_train, X_val, y_train_bin, y_val_bin = model_selection.train_test_split(X_train_val_prepared_PCA, y_train_val, test_size=0.33, stratify=y_train_val)

In [530]:
y_train_num = labelBin.inverse_transform(y_train_bin)
y_val_num = labelBin.inverse_transform(y_val_bin)
y_test_num = labelBin.inverse_transform(y_test)
y_train_val_num = labelBin.inverse_transform(y_train_val)

In [262]:
print("X train/val shape: ", X_train.shape, X_val.shape)
print("y encoded train/val shape: ", y_train_bin.shape, y_val_bin.shape)
print("y train/val shape: ", y_train_num.shape, y_val_num.shape)

X train/val shape:  (1526, 12) (752, 12)
y encoded train/val shape:  (1526, 7) (752, 7)
y train/val shape:  (1526,) (752,)


In [529]:

#print("y test shape: ", y_test.shape)

## Multionomial logistic regression

In [263]:
from sklearn.linear_model import LogisticRegression

In [327]:
def configure_logistic_regression(conf):
    return LogisticRegression(C = conf, class_weight='balanced')


In [361]:
lr_clf = configure_logistic_regression(12)

In [362]:
lr_clf.fit(X_train, y_train_num)

LogisticRegression(C=12, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [363]:
evaluate_model(lr_clf, (X_train, X_val, y_train_num, y_val_num), "Logistic regression")

Model: Logistic regression
Train set report: 
             precision    recall  f1-score   support

        1.0       0.95      0.98      0.97       194
        2.0       0.80      0.67      0.73        18
        3.0       0.99      1.00      0.99        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      0.99      0.99       706
        7.0       0.97      1.00      0.98        91

avg / total       0.99      0.99      0.99      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.93      0.97      0.95        95
        2.0       0.43      0.33      0.38         9
        3.0       0.97      1.00      0.99        35
        4.0       0.99      0.99      0.99       158
        5.0       0.97      1.00      0.98        62
        6.0       1.00      0.98      0.99       348
        7.0       0.96      0.98      0.97        45

avg / total       0.98  

### Evaluating  best model on test set

In [805]:
model = LogisticRegression(C = 12, class_weight='balanced')
model.fit(X_train_val_prepared_PCA, y_train_val_num)
evaluate_model(model, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "Logistic regression")
print("Model: Logistic Regression")
print("Weighted F1-score: ", get_f1_score(model, X_test_prepared_PCA, y_test_num))
print("Accuracy: ", get_acc_score(model, X_test_prepared_PCA, y_test_num))
print_acc_by_class(get_acc_by_class(model, X_test_prepared_PCA, y_test_num))

Model: Logistic regression
Train set report: 
             precision    recall  f1-score   support

        1.0       0.94      0.98      0.96       289
        2.0       0.59      0.48      0.53        27
        3.0       0.99      1.00      1.00       105
        4.0       1.00      1.00      1.00       479
        5.0       0.99      1.00      1.00       188
        6.0       1.00      0.99      0.99      1054
        7.0       0.97      0.99      0.98       136

avg / total       0.98      0.98      0.98      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.91      0.97      0.94       143
        2.0       0.43      0.23      0.30        13
        3.0       0.96      1.00      0.98        51
        4.0       0.99      0.99      0.99       236
        5.0       0.98      0.99      0.98        93
        6.0       1.00      0.98      0.99       520
        7.0       0.97      1.00      0.99        67

avg / total       0.97  

## Gradient boosting

In [268]:
grad_boost_clf = ensemble.GradientBoostingClassifier(n_estimators=600, max_depth=2, learning_rate=0.006)

In [269]:
grad_boost_clf = train_model(grad_boost_clf, (X_train, y_train_num))

In [270]:
evaluate_model(grad_boost_clf, (X_train, X_val, y_train_num, y_val_num), "Grad Boost")


Model: Grad Boost
Train set report: 
             precision    recall  f1-score   support

        1.0       0.98      0.99      0.99       194
        2.0       1.00      0.78      0.88        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       0.99      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.93      0.98      0.95        95
        2.0       1.00      0.22      0.36         9
        3.0       1.00      0.97      0.99        35
        4.0       0.99      0.99      0.99       158
        5.0       0.97      1.00      0.98        62
        6.0       0.99      0.99      0.99       348
        7.0       0.98      1.00      0.99        45

avg / total       0.98      0.98 

### Evaluating best model on test set

In [806]:
model = ensemble.GradientBoostingClassifier(n_estimators=600, max_depth=2, learning_rate=0.006)
model.fit(X_train_val_prepared_PCA, y_train_val_num)
evaluate_model(model, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "Grad Boost regression")
print("Model: Grad Boost")
print("Weighted F1-score: ", get_f1_score(model, X_test_prepared_PCA, y_test_num))
print("Accuracy: ", get_acc_score(model, X_test_prepared_PCA, y_test_num))
print_acc_by_class(get_acc_by_class(model, X_test_prepared_PCA, y_test_num))

Model: Grad Boost regression
Train set report: 
             precision    recall  f1-score   support

        1.0       0.96      1.00      0.98       289
        2.0       1.00      0.59      0.74        27
        3.0       1.00      1.00      1.00       105
        4.0       1.00      1.00      1.00       479
        5.0       0.99      1.00      0.99       188
        6.0       1.00      1.00      1.00      1054
        7.0       1.00      1.00      1.00       136

avg / total       0.99      0.99      0.99      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.92      0.98      0.95       143
        2.0       1.00      0.15      0.27        13
        3.0       1.00      1.00      1.00        51
        4.0       0.99      0.99      0.99       236
        5.0       0.98      0.95      0.96        93
        6.0       0.98      0.99      0.99       520
        7.0       0.99      0.99      0.99        67

avg / total       0.98

## ADA Boosting

In [377]:
best_f1_score = 0
best_params = (0,0,0)

for max_depth in range(2,5):
    for n_estimators in range(300, 800, 100):
        for learning_rate in [0.1, 0.33, 1, 3.33, 10]:
            ada_boost_clf = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=max_depth),
                                                        algorithm='SAMME',
                                                        n_estimators=n_estimators, 
                                                        learning_rate=0.5)

            ada_boost_clf = train_model(ada_boost_clf, (X_train, y_train_num))
            #y_train_pred = model.predict(X_train)
            y_val_pred = ada_boost_clf.predict(X_val)
            curr_f1_score = metrics.f1_score(y_val_num, y_val_pred, average='weighted')
            if curr_f1_score > best_f1_score:
                best_f1_score = curr_f1_score
                best_params = (max_depth, n_estimators, learning_rate)

print(best_f1_score, best_params)

0.9786557151410951 (4, 300)


In [None]:
print(best_f1_score, best_params)

In [378]:
# Create and fit an AdaBoosted decision tree
ada_boost_clf = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=4),
                                            algorithm="SAMME",
                                            n_estimators=300,
                                            learning_rate=0.5)


In [379]:
ada_boost_clf = train_model(ada_boost_clf, (X_train, y_train_num))

In [380]:
evaluate_model(ada_boost_clf, (X_train, X_val, y_train_num, y_val_num), "Ada Boost")

Model: Ada Boost
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00       194
        2.0       1.00      1.00      1.00        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.94      0.97      0.95        95
        2.0       0.60      0.33      0.43         9
        3.0       1.00      0.97      0.99        35
        4.0       0.99      0.99      0.99       158
        5.0       0.97      1.00      0.98        62
        6.0       0.99      0.99      0.99       348
        7.0       0.96      1.00      0.98        45

avg / total       0.98      0.98  

### Evaluating best model on test set

In [812]:
model = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=4),
                                    algorithm="SAMME",
                                    n_estimators=300,
                                    learning_rate=0.5)
model.fit(X_train_val_prepared_PCA, y_train_val_num)
evaluate_model(model, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "Ada Boost regression")
print("Model: ADA Boost")
print("Weighted F1-score: ", get_f1_score(model, X_test_prepared_PCA, y_test_num))
print("Accuracy: ", get_acc_score(model, X_test_prepared_PCA, y_test_num))

print_acc_by_class(get_acc_by_class(model, X_test_prepared_PCA, y_test_num))

Model: Ada Boost regression
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00       289
        2.0       1.00      1.00      1.00        27
        3.0       1.00      1.00      1.00       105
        4.0       1.00      1.00      1.00       479
        5.0       1.00      1.00      1.00       188
        6.0       1.00      1.00      1.00      1054
        7.0       1.00      1.00      1.00       136

avg / total       1.00      1.00      1.00      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.97      0.97      0.97       143
        2.0       0.73      0.62      0.67        13
        3.0       1.00      0.98      0.99        51
        4.0       0.99      0.99      0.99       236
        5.0       0.99      0.99      0.99        93
        6.0       0.99      0.99      0.99       520
        7.0       0.98      0.97      0.98        67

avg / total       0.98 

## XGBoost

In [274]:
## - pip install xgboost
## -- import xgboost as xgb

In [275]:
import xgboost as xgb

In [397]:
best_f1_score = 0
best_params = (0,0,0)

for max_depth in range(3,5):
    for n_estimators in range(200, 500, 100):
        for learning_rate in [0.01, 0.033, 0.1, 0.33, 1]:
            for booster in ['gbtree', 'gblinear', 'dart']:
                xgb_clf = xgb.XGBClassifier(objective='multi:softprob', 
                                            max_depth=max_depth,
                                            n_estimators=n_estimators,
                                            learning_rate=learning_rate,
                                            booster=booster)

                xgb_clf = train_model(ada_boost_clf, (X_train, y_train_num))
                #y_train_pred = model.predict(X_train)
                y_val_pred = xgb_clf.predict(X_val)
                curr_f1_score = metrics.f1_score(y_val_num, y_val_pred, average='weighted')
                if curr_f1_score > best_f1_score:
                    best_f1_score = curr_f1_score
                    best_params = (max_depth, n_estimators, learning_rate, booster)

print(best_f1_score, best_params)

0.9786557151410951 (3, 200, 0.1, 'gbtree')


In [521]:
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', 
                            max_depth=5, 
                            n_estimators=500, 
                            learning_rate=0.01,
                            subsample=0.8,
                            colsample_bytree=1,
                            gamma=1)
                            #booster='dart'


In [522]:
xgb_clf = train_model(xgb_clf, (X_train, y_train_num))

In [523]:
y_val_pred = xgb_clf.predict(X_val)
curr_f1_score = metrics.f1_score(y_val_num, y_val_pred, average='weighted')
curr_f1_score

  if diff:


0.9808404145191841

In [524]:
evaluate_model(xgb_clf, (X_train, X_val, y_train_num, y_val_num), "XGBoost")

Model: XGBoost
Train set report: 
             precision    recall  f1-score   support

        1.0       0.98      1.00      0.99       194
        2.0       1.00      0.83      0.91        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       0.99      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.94      0.99      0.96        95
        2.0       1.00      0.33      0.50         9
        3.0       1.00      0.97      0.99        35
        4.0       0.99      0.99      0.99       158
        5.0       0.98      1.00      0.99        62
        6.0       0.99      0.99      0.99       348
        7.0       0.98      1.00      0.99        45

avg / total       0.98      0.98    

  if diff:
  if diff:


### Evaluating best model on test set

In [813]:
model = xgb.XGBClassifier(objective='multi:softprob', 
                            max_depth=5, 
                            n_estimators=500, 
                            learning_rate=0.01,
                            subsample=0.8,
                            colsample_bytree=1,
                            gamma=1)
model.fit(X_train_val_prepared_PCA, y_train_val_num)
evaluate_model(model, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "Ada Boost regression")
print("Model: XGBoost")
print("Weighted F1-score: ", get_f1_score(model, X_test_prepared_PCA, y_test_num))
print("Accuracy: ", get_acc_score(model, X_test_prepared_PCA, y_test_num))


print_acc_by_class(get_acc_by_class(model, X_test_prepared_PCA, y_test_num))

  if diff:
  if diff:


Model: Ada Boost regression
Train set report: 
             precision    recall  f1-score   support

        1.0       0.99      1.00      0.99       289
        2.0       1.00      0.89      0.94        27
        3.0       1.00      1.00      1.00       105
        4.0       1.00      1.00      1.00       479
        5.0       0.99      1.00      1.00       188
        6.0       1.00      1.00      1.00      1054
        7.0       1.00      1.00      1.00       136

avg / total       1.00      1.00      1.00      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.94      0.99      0.96       143
        2.0       0.83      0.38      0.53        13
        3.0       1.00      1.00      1.00        51
        4.0       1.00      0.99      0.99       236
        5.0       0.98      0.99      0.98        93
        6.0       0.99      0.99      0.99       520
        7.0       0.97      0.97      0.97        67

avg / total       0.98 

  if diff:
  if diff:


Accuracy:  0.981300089047195
98.6
38.46
100.0
99.15
98.92
98.85
97.01


  if diff:


## Neural network

In [653]:
from keras.layers import Dropout

In [654]:
number_of_features = X_train.shape[-1]
output_size = y_train_bin.shape[-1]

In [655]:
number_of_features

12

In [690]:

nn_clf = Sequential()
nn_clf.add(Dense(units=15, input_dim=number_of_features, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=output_size, activation='sigmoid'))

In [691]:
nn_clf.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=['accuracy'])

In [692]:
nn_clf.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_166 (Dense)            (None, 15)                195       
_________________________________________________________________
dense_167 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_168 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_169 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_170 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_171 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_172 (Dense)            (None, 15)                240       
__________

In [693]:
history = nn_clf.fit(X_train, y_train_bin, epochs=100, batch_size=32, verbose=2, validation_data=(X_val, y_val_bin))

Train on 1526 samples, validate on 752 samples
Epoch 1/100
 - 2s - loss: 0.6717 - acc: 0.5990 - val_loss: 0.5206 - val_acc: 0.7449
Epoch 2/100
 - 0s - loss: 0.3292 - acc: 0.8707 - val_loss: 0.2194 - val_acc: 0.9043
Epoch 3/100
 - 0s - loss: 0.1911 - acc: 0.9078 - val_loss: 0.1791 - val_acc: 0.9107
Epoch 4/100
 - 0s - loss: 0.1618 - acc: 0.9188 - val_loss: 0.1554 - val_acc: 0.9291
Epoch 5/100
 - 0s - loss: 0.1369 - acc: 0.9400 - val_loss: 0.1275 - val_acc: 0.9441
Epoch 6/100
 - 0s - loss: 0.1042 - acc: 0.9565 - val_loss: 0.0936 - val_acc: 0.9652
Epoch 7/100
 - 0s - loss: 0.0724 - acc: 0.9802 - val_loss: 0.0680 - val_acc: 0.9821
Epoch 8/100
 - 0s - loss: 0.0537 - acc: 0.9846 - val_loss: 0.0555 - val_acc: 0.9837
Epoch 9/100
 - 0s - loss: 0.0443 - acc: 0.9855 - val_loss: 0.0467 - val_acc: 0.9854
Epoch 10/100
 - 0s - loss: 0.0392 - acc: 0.9867 - val_loss: 0.0413 - val_acc: 0.9863
Epoch 11/100
 - 0s - loss: 0.0341 - acc: 0.9879 - val_loss: 0.0386 - val_acc: 0.9858
Epoch 12/100
 - 0s - loss: 

Epoch 97/100
 - 0s - loss: 0.0046 - acc: 0.9993 - val_loss: 0.0225 - val_acc: 0.9962
Epoch 98/100
 - 0s - loss: 0.0061 - acc: 0.9987 - val_loss: 0.0273 - val_acc: 0.9954
Epoch 99/100
 - 0s - loss: 0.0059 - acc: 0.9988 - val_loss: 0.0200 - val_acc: 0.9968
Epoch 100/100
 - 0s - loss: 0.0050 - acc: 0.9993 - val_loss: 0.0212 - val_acc: 0.9968


In [696]:
evaluate_nn(nn_clf, (X_train, X_val, y_train_num, y_val_num), "NN")

Model: NN
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      0.98      0.99       194
        2.0       0.86      1.00      0.92        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.99      0.97      0.98        95
        2.0       0.80      0.89      0.84         9
        3.0       0.95      1.00      0.97        35
        4.0       0.99      0.99      0.99       158
        5.0       0.97      0.98      0.98        62
        6.0       1.00      0.99      1.00       348
        7.0       0.98      0.98      0.98        45

avg / total       0.99      0.99      0.9

In [635]:
#evaluate_model(nn_clf, (X_train, X_val, y_train_bin, y_val_bin), 'NN')

### Evaluate best model on test set


In [697]:
number_of_features = X_train_val_prepared_PCA.shape[-1]
output_size = y_train_val.shape[-1]

In [698]:

nn_clf = Sequential()
nn_clf.add(Dense(units=15, input_dim=number_of_features, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=15, activation='relu'))
nn_clf.add(Dense(units=output_size, activation='sigmoid'))

In [699]:
nn_clf.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=['accuracy'])

In [700]:
nn_clf.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_175 (Dense)            (None, 15)                195       
_________________________________________________________________
dense_176 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_177 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_178 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_179 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_180 (Dense)            (None, 15)                240       
_________________________________________________________________
dense_181 (Dense)            (None, 15)                240       
__________

In [701]:
history = nn_clf.fit(X_train_val_prepared_PCA, y_train_val, epochs=100, batch_size=30, verbose=2)

Epoch 1/100
 - 2s - loss: 0.3348 - acc: 0.8747
Epoch 2/100
 - 0s - loss: 0.1332 - acc: 0.9520
Epoch 3/100
 - 0s - loss: 0.0707 - acc: 0.9695
Epoch 4/100
 - 0s - loss: 0.0403 - acc: 0.9879
Epoch 5/100
 - 0s - loss: 0.0276 - acc: 0.9922
Epoch 6/100
 - 0s - loss: 0.0225 - acc: 0.9932
Epoch 7/100
 - 1s - loss: 0.0193 - acc: 0.9934
Epoch 8/100
 - 0s - loss: 0.0173 - acc: 0.9941
Epoch 9/100
 - 0s - loss: 0.0156 - acc: 0.9944
Epoch 10/100
 - 0s - loss: 0.0155 - acc: 0.9939
Epoch 11/100
 - 0s - loss: 0.0152 - acc: 0.9945
Epoch 12/100
 - 0s - loss: 0.0129 - acc: 0.9956
Epoch 13/100
 - 0s - loss: 0.0151 - acc: 0.9944
Epoch 14/100
 - 1s - loss: 0.0128 - acc: 0.9955
Epoch 15/100
 - 0s - loss: 0.0116 - acc: 0.9962
Epoch 16/100
 - 0s - loss: 0.0112 - acc: 0.9961
Epoch 17/100
 - 0s - loss: 0.0125 - acc: 0.9957
Epoch 18/100
 - 0s - loss: 0.0141 - acc: 0.9950
Epoch 19/100
 - 0s - loss: 0.0092 - acc: 0.9965
Epoch 20/100
 - 0s - loss: 0.0087 - acc: 0.9971
Epoch 21/100
 - 1s - loss: 0.0074 - acc: 0.9974
E

In [781]:
evaluate_nn(nn_clf, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "NN")

Model: NN
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      0.99      0.99       289
        2.0       0.90      1.00      0.95        27
        3.0       0.98      1.00      0.99       105
        4.0       1.00      1.00      1.00       479
        5.0       1.00      1.00      1.00       188
        6.0       1.00      1.00      1.00      1054
        7.0       1.00      1.00      1.00       136

avg / total       1.00      1.00      1.00      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.93      0.97      0.95       143
        2.0       0.58      0.54      0.56        13
        3.0       0.94      1.00      0.97        51
        4.0       0.99      0.98      0.99       236
        5.0       1.00      0.99      0.99        93
        6.0       1.00      0.99      0.99       520
        7.0       0.97      1.00      0.99        67

avg / total       0.98      0.98      0.9

In [782]:
print(get_f1_score_nn(nn_clf, X_test_prepared_PCA, y_test_num))

0.9794591077255936


In [783]:
print(get_acc_score_nn(nn_clf, X_test_prepared_PCA, y_test_num))

0.9795191451469278


In [817]:

print_acc_by_class(get_acc_by_class_nn(nn_clf, X_test_prepared_PCA, y_test_num))

96.5
53.85
100.0
98.31
98.92
98.65
100.0


# --------


## kNN

In [762]:
kNN_clf = KNeighborsClassifier(n_neighbors=10, weights='distance')

In [763]:
kNN_clf = train_model(kNN_clf, (X_train, y_train_num))

In [764]:
evaluate_model(kNN_clf, (X_train, X_val, y_train_num, y_val_num), "kNN")

Model: kNN
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00       194
        2.0       1.00      1.00      1.00        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.95      1.00      0.97        95
        2.0       1.00      0.44      0.62         9
        3.0       0.95      1.00      0.97        35
        4.0       1.00      0.99      0.99       158
        5.0       0.98      0.98      0.98        62
        6.0       1.00      0.99      1.00       348
        7.0       0.98      1.00      0.99        45

avg / total       0.99      0.99      0.

### Evaluating best model on test set

In [818]:
model = KNeighborsClassifier(n_neighbors=10, weights='distance')
model.fit(X_train_val_prepared_PCA, y_train_val_num)
evaluate_model(model, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "Ada Boost regression")
print("Model: KNN")
print("Weighted F1-score: ", get_f1_score(model, X_test_prepared_PCA, y_test_num))
print("Accuracy: ", get_acc_score(model, X_test_prepared_PCA, y_test_num))
print_acc_by_class(get_acc_by_class(model, X_test_prepared_PCA, y_test_num))

Model: Ada Boost regression
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00       289
        2.0       1.00      1.00      1.00        27
        3.0       1.00      1.00      1.00       105
        4.0       1.00      1.00      1.00       479
        5.0       1.00      1.00      1.00       188
        6.0       1.00      1.00      1.00      1054
        7.0       1.00      1.00      1.00       136

avg / total       1.00      1.00      1.00      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.93      0.98      0.96       143
        2.0       0.50      0.23      0.32        13
        3.0       0.96      0.98      0.97        51
        4.0       1.00      0.99      1.00       236
        5.0       1.00      0.99      0.99        93
        6.0       0.99      1.00      1.00       520
        7.0       1.00      1.00      1.00        67

avg / total       0.98 

## SVM

In [290]:
svm_clf = svm.SVC(C=0.004, kernel='poly', degree=2)

In [291]:
svm_clf = train_model(svm_clf, (X_train, y_train_num))

In [292]:
evaluate_model(svm_clf, (X_train, X_val, y_train_num, y_val_num), "SVM")

Model: SVM
Train set report: 
             precision    recall  f1-score   support

        1.0       0.98      0.99      0.99       194
        2.0       0.94      0.83      0.88        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.97      0.99      0.98        95
        2.0       0.86      0.67      0.75         9
        3.0       1.00      1.00      1.00        35
        4.0       0.99      0.99      0.99       158
        5.0       1.00      0.97      0.98        62
        6.0       0.99      0.99      0.99       348
        7.0       0.98      0.98      0.98        45

avg / total       0.98      0.98      0.

### Evaluating best model on test set

In [819]:
model = svm.SVC(C=0.004, kernel='poly', degree=2)
model.fit(X_train_val_prepared_PCA, y_train_val_num)
evaluate_model(model, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "Ada Boost regression")
print("Model: SVM")
print("Weighted F1-score: ", get_f1_score(model, X_test_prepared_PCA, y_test_num))
print("Accuracy: ", get_acc_score(model, X_test_prepared_PCA, y_test_num))
print_acc_by_class(get_acc_by_class(model, X_test_prepared_PCA, y_test_num))

Model: Ada Boost regression
Train set report: 
             precision    recall  f1-score   support

        1.0       0.99      1.00      0.99       289
        2.0       0.96      0.85      0.90        27
        3.0       1.00      1.00      1.00       105
        4.0       1.00      1.00      1.00       479
        5.0       1.00      1.00      1.00       188
        6.0       1.00      1.00      1.00      1054
        7.0       1.00      1.00      1.00       136

avg / total       1.00      1.00      1.00      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.92      0.97      0.95       143
        2.0       0.67      0.62      0.64        13
        3.0       1.00      0.98      0.99        51
        4.0       1.00      1.00      1.00       236
        5.0       1.00      0.98      0.99        93
        6.0       1.00      0.99      0.99       520
        7.0       1.00      1.00      1.00        67

avg / total       0.98 

## Random forest

In [293]:
list(range(2,10,2))

[2, 4, 6, 8]

In [294]:
best_f1_score = 0
best_params = (0,0)

for max_depth in range(2,6):
    for n_estimators in range(200, 1600, 100):
        random_forest_clf = ensemble.RandomForestClassifier(max_depth=max_depth, 
                                                            n_estimators=n_estimators, 
                                                            criterion='gini')
        
        random_forest_clf = train_model(random_forest_clf, (X_train, y_train_num))
        #y_train_pred = model.predict(X_train)
        y_val_pred = random_forest_clf.predict(X_val)
        curr_f1_score = metrics.f1_score(y_val_num, y_val_pred, average='weighted')
        if curr_f1_score > best_f1_score:
            best_f1_score = curr_f1_score
            best_params = (max_depth, n_estimators)

print(best_f1_score, best_params)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.978691597838611 (5, 300)


In [301]:
random_forest_clf = ensemble.RandomForestClassifier(max_depth=5, n_estimators=300, criterion='gini', )

In [302]:
random_forest_clf = train_model(random_forest_clf, (X_train, y_train_num))

In [303]:
evaluate_model(random_forest_clf, (X_train, X_val, y_train_num, y_val_num), "Random Forest")

Model: Random Forest
Train set report: 
             precision    recall  f1-score   support

        1.0       0.94      0.98      0.96       194
        2.0       1.00      0.28      0.43        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       0.99      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       0.99      0.99      0.99      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.91      0.99      0.95        95
        2.0       0.00      0.00      0.00         9
        3.0       1.00      1.00      1.00        35
        4.0       1.00      0.99      1.00       158
        5.0       0.98      0.98      0.98        62
        6.0       0.99      0.99      0.99       348
        7.0       0.98      1.00      0.99        45

avg / total       0.97      0.

  'precision', 'predicted', average, warn_for)


### Evaluating best model on test set

In [823]:
model = ensemble.RandomForestClassifier(max_depth=5, n_estimators=300, criterion='gini', )
model.fit(X_train_val_prepared_PCA, y_train_val_num)
evaluate_model(model, (X_train_val_prepared_PCA, X_test_prepared_PCA, y_train_val_num, y_test_num), "Random Forest regression")
print("Model: Random Forest")
print("Weighted F1-score: ", get_f1_score(model, X_test_prepared_PCA, y_test_num))
print("Accuracy: ", get_acc_score(model, X_test_prepared_PCA, y_test_num))

print_acc_by_class(get_acc_by_class(model, X_test_prepared_PCA, y_test_num))

Model: Random Forest regression
Train set report: 
             precision    recall  f1-score   support

        1.0       0.92      0.98      0.95       289
        2.0       1.00      0.15      0.26        27
        3.0       1.00      1.00      1.00       105
        4.0       1.00      1.00      1.00       479
        5.0       0.98      1.00      0.99       188
        6.0       0.99      1.00      1.00      1054
        7.0       1.00      0.99      1.00       136

avg / total       0.99      0.99      0.98      2278

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.93      0.98      0.96       143
        2.0       1.00      0.08      0.14        13
        3.0       1.00      0.98      0.99        51
        4.0       1.00      0.99      0.99       236
        5.0       0.99      0.98      0.98        93
        6.0       0.98      1.00      0.99       520
        7.0       1.00      1.00      1.00        67

avg / total       0