In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.decomposition import PCA

In [2]:
type_precision = "float32"

In [3]:
tabula_muris_path = "../datasets/tabula_muris_whole/"
all_counts_path = "brain_mouse_matrix_all_counts.csv"
all_data_path = "brain_mouse_matrix_all_data.csv"
all_scaled_path = "brain_mouse_matrix_all_scale_data.csv"

In [4]:
all_counts = pd.read_csv(tabula_muris_path + all_counts_path, sep=" ")

In [5]:
#all_data =  pd.read_csv(tabula_muris_path + all_data_path, sep=" ")

In [6]:
#all_scaled =  pd.read_csv(tabula_muris_path + all_scaled_path, sep=" ")

In [7]:
#all_counts.describe()

In [8]:
all_counts.head()

Unnamed: 0,CELL_ID,0610005C13Rik,0610007C21Rik,0610007L01Rik,0610007N19Rik,0610007P08Rik,0610007P14Rik,0610007P22Rik,0610008F07Rik,0610009B14Rik,...,Zxdc,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3,a,l7Rn6,zsGreen-transgene,annotation
0,A1.B003290.3_38_F.1.1,0,125,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,54,0,1
1,A1.B003728.3_56_F.1.1,0,0,0,0,0,324,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,A1.MAA000560.3_10_M.1.1,0,348,0,0,0,5,0,0,0,...,0,0,0,0,195,0,0,113,0,6
3,A1.MAA000564.3_10_M.1.1,0,41,36,0,0,24,0,0,0,...,0,0,0,125,0,1,0,0,0,4
4,A1.MAA000923.3_9_M.1.1,0,53,0,0,0,0,0,0,0,...,0,0,81,0,0,0,0,0,0,1


# Preprocessing

In [9]:
def log_normalize_data(data, scale=1000000.0):
    data_row_sums = np.sum(data, axis=1).reshape(-1, 1)
    return np.log(1 + scale * data / data_row_sums)

## Data cleaning
- one hot encoding of y
- Log normalize all data
- Split test and train data (stratified by y)
- Scale data by normal distribution

In [10]:
X = all_counts.iloc[:,1:-1].to_numpy(dtype=type_precision)
y_num = all_counts.iloc[:, -1].to_numpy(dtype=type_precision)

In [11]:
labelBin = preprocessing.LabelBinarizer()
labelBin.fit(y_num)
y = labelBin.transform(y_num)

In [12]:
X = log_normalize_data(X)

In [13]:
X_train_val, X_test, y_train_val, y_test = model_selection.train_test_split(X, y, test_size=0.33, stratify=y)

In [14]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_val)
X_train_val = scaler.transform(X_train_val)
X_test = scaler.transform(X_test)

## Dimension reduction #1 - PCA
- TODO: pick different number of components

In [15]:
pca = PCA(n_components=12)

In [16]:
pca.fit(X_train_val)
X_train_val_prepared_PCA = pca.transform(X_train_val)
X_test_prepared_PCA = pca.transform(X_test)

In [17]:
print("Explained variance: {}".format(np.sum(pca.explained_variance_ratio_)))

Explained variance: 0.14787763357162476


In [18]:
X_train_val_prepared_PCA.shape, X_test_prepared_PCA.shape

((2278, 12), (1123, 12))

## Dimension reduction #2 - Autoencoder
- TODO

# Models 
- models: gradient boosting, NN, kNN(k=30 za pocetak), SVM, random forest 
- clean data
- pick dimension reduction method
- pick model
- split train and validation set

## utility functions

In [19]:
def train_model(model, data):
    X_train, y_train = data
    model.fit(X_train, y_train)
    return model

In [20]:
def evaluate_model(model, data, name):
    X_train, X_val, y_train, y_val = data
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    print('Model: {}'.format(name))
    print('Train set report: \n{}'.format(metrics.classification_report(y_train, y_train_pred)))
    print('Validation set report: \n{}'.format(metrics.classification_report(y_val, y_val_pred)))

In [21]:
def evaluate_nn(model, data, name):
    X_train, X_val, y_train, y_val = data
    y_train_pred = np.argmax(model.predict(X_train), axis=1) + 1
    y_val_pred = np.argmax(model.predict(X_val), axis=1) + 1
    print('Model: {}'.format(name))
    print('Train set report: \n{}'.format(metrics.classification_report(y_train, y_train_pred)))
    print('Validation set report: \n{}'.format(metrics.classification_report(y_val, y_val_pred)))

## import libraries

In [22]:
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn import svm

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras import losses, optimizers

Using TensorFlow backend.


In [24]:
from sklearn import metrics

## get data

In [25]:
#X_train_val, y_train_val

In [26]:
X_train, X_val, y_train_bin, y_val_bin = model_selection.train_test_split(X_train_val_prepared_PCA, y_train_val, test_size=0.33, stratify=y_train_val)

In [27]:
y_train_num = labelBin.inverse_transform(y_train_bin)
y_val_num = labelBin.inverse_transform(y_val_bin)

In [28]:
print("X train/val shape: ", X_train.shape, X_val.shape)
print("y encoded train/val shape: ", y_train_bin.shape, y_val_bin.shape)
print("y train/val shape: ", y_train_num.shape, y_val_num.shape)

X train/val shape:  (1526, 12) (752, 12)
y encoded train/val shape:  (1526, 7) (752, 7)
y train/val shape:  (1526,) (752,)


## Gradient boosting

In [29]:
grad_boost_clf = ensemble.GradientBoostingClassifier(n_estimators=600, max_depth=2, learning_rate=0.006)

In [30]:
grad_boost_clf = train_model(grad_boost_clf, (X_train, y_train_num))

In [31]:
evaluate_model(grad_boost_clf, (X_train, X_val, y_train_num, y_val_num), "Grad Boost")

Model: Grad Boost
Train set report: 
             precision    recall  f1-score   support

        1.0       0.98      0.99      0.98       194
        2.0       1.00      0.78      0.88        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       0.99      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.92      0.98      0.95        95
        2.0       0.67      0.22      0.33         9
        3.0       0.97      0.97      0.97        35
        4.0       0.99      0.97      0.98       158
        5.0       0.98      0.98      0.98        62
        6.0       0.99      0.99      0.99       348
        7.0       0.96      1.00      0.98        45

avg / total       0.97      0.98 

## ADA Boosting

In [32]:
# Create and fit an AdaBoosted decision tree
ada_boost_clf = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=2),
                                            algorithm="SAMME",
                                            n_estimators=400,
                                            learning_rate=0.5)


In [33]:
ada_boost_clf = train_model(ada_boost_clf, (X_train, y_train_num))

In [34]:
evaluate_model(ada_boost_clf, (X_train, X_val, y_train_num, y_val_num), "Ada Boost")

Model: Ada Boost
Train set report: 
             precision    recall  f1-score   support

        1.0       0.96      0.94      0.95       194
        2.0       0.48      0.61      0.54        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       0.99      0.99      0.99      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.91      0.95      0.93        95
        2.0       0.38      0.33      0.35         9
        3.0       0.97      0.97      0.97        35
        4.0       1.00      0.97      0.99       158
        5.0       1.00      1.00      1.00        62
        6.0       1.00      0.99      1.00       348
        7.0       0.96      1.00      0.98        45

avg / total       0.98      0.98  

## XGBoost

In [35]:
## - pip install xgboost
## -- import xgboost as xgb

In [36]:
import xgboost as xgb

In [37]:
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', max_depth=3, n_estimators=650, learning_rate=0.045)

In [38]:
xgb_clf = train_model(xgb_clf, (X_train, y_train_num))

In [39]:
evaluate_model(xgb_clf, (X_train, X_val, y_train_num, y_val_num), "kNN")

Model: kNN
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00       194
        2.0       1.00      1.00      1.00        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.94      0.97      0.95        95
        2.0       0.60      0.33      0.43         9
        3.0       1.00      1.00      1.00        35
        4.0       1.00      0.97      0.99       158
        5.0       0.98      1.00      0.99        62
        6.0       0.99      0.99      0.99       348
        7.0       0.96      1.00      0.98        45

avg / total       0.98      0.98      0.

  if diff:
  if diff:


## Neural network

In [40]:
number_of_features = X_train.shape[-1]
output_size = y_train_bin.shape[-1]

In [41]:
number_of_features

12

In [42]:

nn_clf = Sequential()
nn_clf.add(Dense(units=30, input_dim=number_of_features, activation='relu'))
nn_clf.add(Dense(units=100, activation='relu'))
nn_clf.add(Dense(units=30, activation='relu'))
nn_clf.add(Dense(units=output_size, activation='sigmoid'))

In [43]:
nn_clf.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=['accuracy'])

In [44]:
nn_clf.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 30)                390       
_________________________________________________________________
dense_2 (Dense)              (None, 100)               3100      
_________________________________________________________________
dense_3 (Dense)              (None, 30)                3030      
_________________________________________________________________
dense_4 (Dense)              (None, 7)                 217       
Total params: 6,737
Trainable params: 6,737
Non-trainable params: 0
_________________________________________________________________


In [57]:
history = nn_clf.fit(X_train, y_train_bin, epochs=40, batch_size=30, verbose=2, validation_data=(X_val, y_val_bin))

Train on 1526 samples, validate on 752 samples
Epoch 1/40
 - 0s - loss: 0.0070 - acc: 0.9978 - val_loss: 0.0170 - val_acc: 0.9947
Epoch 2/40
 - 0s - loss: 0.0060 - acc: 0.9981 - val_loss: 0.0165 - val_acc: 0.9943
Epoch 3/40
 - 0s - loss: 0.0053 - acc: 0.9982 - val_loss: 0.0184 - val_acc: 0.9941
Epoch 4/40
 - 0s - loss: 0.0061 - acc: 0.9979 - val_loss: 0.0184 - val_acc: 0.9939
Epoch 5/40
 - 0s - loss: 0.0046 - acc: 0.9984 - val_loss: 0.0180 - val_acc: 0.9945
Epoch 6/40
 - 0s - loss: 0.0061 - acc: 0.9978 - val_loss: 0.0171 - val_acc: 0.9947
Epoch 7/40
 - 0s - loss: 0.0048 - acc: 0.9983 - val_loss: 0.0163 - val_acc: 0.9947
Epoch 8/40
 - 0s - loss: 0.0048 - acc: 0.9979 - val_loss: 0.0175 - val_acc: 0.9945
Epoch 9/40
 - 0s - loss: 0.0043 - acc: 0.9981 - val_loss: 0.0176 - val_acc: 0.9953
Epoch 10/40
 - 0s - loss: 0.0052 - acc: 0.9983 - val_loss: 0.0181 - val_acc: 0.9941
Epoch 11/40
 - 0s - loss: 0.0044 - acc: 0.9987 - val_loss: 0.0164 - val_acc: 0.9949
Epoch 12/40
 - 0s - loss: 0.0053 - acc

In [58]:
evaluate_nn(nn_clf, (X_train, X_val, y_train_num, y_val_num), "NN")

Model: NN
Train set report: 
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00       194
        2.0       1.00      1.00      1.00        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.97      0.97      0.97        95
        2.0       0.67      0.67      0.67         9
        3.0       1.00      0.97      0.99        35
        4.0       1.00      0.99      1.00       158
        5.0       0.98      1.00      0.99        62
        6.0       0.99      1.00      1.00       348
        7.0       1.00      1.00      1.00        45

avg / total       0.99      0.99      0.9

In [47]:
#evaluate_model(nn_clf, (X_train, X_val, y_train_bin, y_val_bin), 'NN')

## kNN

In [48]:
kNN_clf = KNeighborsClassifier(n_neighbors=6)

In [49]:
kNN_clf = train_model(kNN_clf, (X_train, y_train_num))

In [50]:
evaluate_model(kNN_clf, (X_train, X_val, y_train_num, y_val_num), "kNN")

Model: kNN
Train set report: 
             precision    recall  f1-score   support

        1.0       0.95      0.99      0.97       194
        2.0       1.00      0.44      0.62        18
        3.0       0.99      1.00      0.99        70
        4.0       1.00      1.00      1.00       321
        5.0       0.98      0.99      0.98       126
        6.0       1.00      0.99      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       0.99      0.99      0.99      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.93      0.97      0.95        95
        2.0       0.50      0.33      0.40         9
        3.0       0.97      1.00      0.99        35
        4.0       1.00      0.99      0.99       158
        5.0       1.00      1.00      1.00        62
        6.0       1.00      1.00      1.00       348
        7.0       1.00      1.00      1.00        45

avg / total       0.98      0.98      0.

## SVM

In [51]:
svm_clf = svm.SVC(C=0.004, kernel='poly', degree=2)

In [52]:
svm_clf = train_model(svm_clf, (X_train, y_train_num))

In [53]:
evaluate_model(svm_clf, (X_train, X_val, y_train_num, y_val_num), "SVM")

Model: SVM
Train set report: 
             precision    recall  f1-score   support

        1.0       0.98      1.00      0.99       194
        2.0       1.00      0.78      0.88        18
        3.0       1.00      1.00      1.00        70
        4.0       1.00      1.00      1.00       321
        5.0       1.00      1.00      1.00       126
        6.0       1.00      1.00      1.00       706
        7.0       1.00      1.00      1.00        91

avg / total       1.00      1.00      1.00      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.95      0.97      0.96        95
        2.0       0.75      0.67      0.71         9
        3.0       1.00      1.00      1.00        35
        4.0       0.99      0.99      0.99       158
        5.0       0.98      0.98      0.98        62
        6.0       1.00      1.00      1.00       348
        7.0       1.00      1.00      1.00        45

avg / total       0.99      0.99      0.

## Random forest

In [54]:
random_forest_clf = ensemble.RandomForestClassifier(max_depth=2, n_estimators=200, criterion='entropy')

In [55]:
random_forest_clf = train_model(random_forest_clf, (X_train, y_train_num))

In [56]:
evaluate_model(random_forest_clf, (X_train, X_val, y_train_num, y_val_num), "Random Forest")

Model: Random Forest
Train set report: 
             precision    recall  f1-score   support

        1.0       0.91      0.82      0.86       194
        2.0       0.00      0.00      0.00        18
        3.0       1.00      0.73      0.84        70
        4.0       0.93      0.97      0.95       321
        5.0       1.00      0.24      0.38       126
        6.0       0.83      1.00      0.91       706
        7.0       1.00      0.98      0.99        91

avg / total       0.89      0.88      0.86      1526

Validation set report: 
             precision    recall  f1-score   support

        1.0       0.92      0.80      0.85        95
        2.0       0.00      0.00      0.00         9
        3.0       1.00      0.63      0.77        35
        4.0       0.93      0.97      0.95       158
        5.0       1.00      0.29      0.45        62
        6.0       0.83      1.00      0.90       348
        7.0       0.98      1.00      0.99        45

avg / total       0.88      0.

  'precision', 'predicted', average, warn_for)
