# Data Modeling 01

In this notebook, we'll begin building the classifier to show that Layer 4 neurons do not exist in the organoid data. We will do this in the following manner.

1. Identify cells in the primary data by which layer of the cortex they are in.
2. Train a classifier on the primary data.
3. Under the assumption that the space of gene expression is the same in organoids, classify the organoid cells to their respective cortex layer and show that none get classified as layer 4.
4. Conclude that layer 4 cells do not exist in the organoid data.

In [28]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
import umap
import hdbscan
from collections import Counter
import seaborn as sns
import plotly.express as px 
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split

In [29]:
labels = pd.read_csv('../data/processed/labels/primary_labels_neighbors_15_components_50_clust_size_50.csv').iloc[0:100, :]
labels['# label'] = labels['# label'].astype(int) + 1

df = pd.read_csv('../data/processed/umap/primary_reduction_neighbors_50_components_3.csv', index_col='Unnamed: 0').iloc[0:100, :]

In [30]:
df = dd.from_pandas(df, npartitions=2)
labels = dd.from_pandas(labels, npartitions=2)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.1, shuffle=True)

In [33]:
class GeneClassifier:
    def __init__(self, est, params):
        self.est = est
        self.params = params
        
    def generate_model(self, X, y, n_iter=10):
        grid = RandomizedSearchCV(
            n_iter=n_iter,
            estimator=self.est,
            param_distributions=self.params,
            scoring='balanced_accuracy'
        )

        self.grid = grid.fit(X, y)
    
    def best_score(self):
        return self.grid.best_score_
    
    def best_model(self):
        return self.grid.best_estimator_
    
    def best_params(self):
        return self.grid.best_params_

Now we begin the classification process

In [40]:
from dask_ml.xgboost import XGBClassifier
from dask_ml.model_selection import RandomizedSearchCV

params = {
    'eta' : np.linspace(0, 1, 20),
    'gamma': np.linspace(0, 1000, 20),
    'max_depth': np.linspace(0, 1000, 20, dtype=int),
}

xgb_est = GeneClassifier(XGBClassifier(), params)

In [42]:
from dask.distributed import Client
client = Client('scheduler-address:8786')

xgb_est = xgb_est.generate_model(X_train.values, y_train.values, n_iter=2)
print(xgb_est.best_score(), xgb_est.best_params())

OSError: Timed out trying to connect to tcp://scheduler-address:8786 after 30 s

In [39]:
X_train.values.compute()

array([[0.4481088 , 3.796412  , 2.5979095 ],
       [0.4983933 , 3.9593854 , 2.3622296 ],
       [0.33222336, 3.5287693 , 2.4303825 ],
       [0.05900061, 3.8804545 , 2.4287891 ],
       [0.40310133, 3.6861796 , 2.0468378 ],
       [0.22065888, 3.6910384 , 2.8645082 ],
       [0.23353353, 3.8845916 , 2.318612  ],
       [0.19925004, 3.740593  , 2.7131994 ],
       [0.1758697 , 3.5272355 , 2.4797056 ],
       [0.30395132, 3.7074225 , 2.3528247 ],
       [0.25804892, 3.6220398 , 2.6360617 ],
       [0.34219316, 3.8034708 , 2.8354907 ],
       [0.22701918, 3.3649328 , 2.6206775 ],
       [0.3596108 , 3.506909  , 2.6567767 ],
       [0.05308454, 3.7815006 , 2.6551347 ],
       [0.2358424 , 3.4101896 , 2.5498316 ],
       [0.57771707, 3.6255016 , 3.0535412 ],
       [0.13456306, 3.684509  , 2.5766418 ],
       [0.21065377, 3.3899477 , 2.5953205 ],
       [0.29221654, 3.6722684 , 2.644179  ],
       [0.17226812, 3.6118958 , 2.5589767 ],
       [0.50930506, 3.6159663 , 2.5196338 ],
       [0.