In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import os

In [2]:
# Directory that holds the pickled datasets. Set the MBL_DATADIR environment
# variable to run the notebook on another machine; the default is the path
# this notebook was originally developed against.
datadir = os.environ.get(
    'MBL_DATADIR',
    '/Users/ashesh.ashesh/Documents/PhD/mbldata_solvation/data_with_tiff',
)

# Names of the feature columns used for modelling (looked up in
# `feature_names` further down).
use_features = ['SSC (Violet)-W', 'SSC (Imaging)-W', 'SSC (Violet)-A',
       'Size (SSC (Imaging))', 'Diffusivity (SSC (Imaging))',
       'SSC (Imaging)-A', 'Total Intensity (SSC (Imaging))', 'FSC-W',
       'Size (FSC)', 'Diffusivity (FSC)', 'UV4 (440)-H', 'UV5 (460)-H',
       'UV3 (420)-H', 'UV6 (475)-H', 'Diffusivity (Green*)', 'B4 (545)-H',
       'UV5 (460)-W', 'UV4 (440)-W', 'UV6 (475)-W', 'B5 (575)-H']

# Load the train split from "<datadir>/train_ds.bin".
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
datatype = 'train'
with open(os.path.join(datadir, f"{datatype}_ds.bin"), mode="rb") as f:
    data = pickle.load(f)

raw_train_data = data[f"{datatype}_data"]
raw_train_labels = data[f"{datatype}_labels"]
feature_names = data["feature_names"]
# Rows whose last column is strictly positive.
clean_mask = raw_train_data[:,-1] > 0
# Combined class ids merge the raw label with the flag stored in the last
# column of raw_train_data:
#   0 -> raw label 0, no TIFF      1 -> raw label 1, no TIFF
#   2 -> raw label 0, with TIFF    3 -> raw label 1, with TIFF
tiff_flag = raw_train_data[:, -1]
mask0 = (raw_train_labels == 0) & (tiff_flag == 0)
mask1 = (raw_train_labels == 1) & (tiff_flag == 0)
mask2 = (raw_train_labels == 0) & (tiff_flag == 1)
mask3 = (raw_train_labels == 1) & (tiff_flag == 1)

# Start from -1 everywhere so the assertion below fails if any sample is
# matched by none of the four masks.
train_labels = np.full(raw_train_labels.shape, -1.0)
class_masks = (mask0, mask1, mask2, mask3)
for class_id, class_mask in enumerate(class_masks):
    train_labels[class_mask] = class_id

assert set(np.unique(train_labels)) == {0, 1, 2, 3}

# The four masks must be pairwise disjoint.
for first_pos, first_mask in enumerate(class_masks):
    for second_mask in class_masks[first_pos + 1:]:
        assert not np.any(first_mask & second_mask)


In [3]:
# Number of samples per combined class id, ordered by class id.
combined_label_counts = pd.Series(train_labels).value_counts()
combined_label_counts.sort_index()

0.0    252617
1.0     98628
2.0      2527
3.0      8482
dtype: int64

Raw label 0 is C6818.
Raw label 1 is Emiliana.


In [4]:
# Number of samples per raw label, most frequent first.
raw_label_series = pd.Series(raw_train_labels)
raw_label_series.value_counts()

0    255144
1    107110
dtype: int64

In [5]:
# Number of samples per combined class id, most frequent first.
combined_label_series = pd.Series(train_labels)
combined_label_series.value_counts()

0.0    252617
1.0     98628
3.0      8482
2.0      2527
dtype: int64

In [6]:
# Columns of the raw training matrix that contain at least one NaN.
nan_column_mask = np.isnan(raw_train_data).any(axis=0)
nan_feature_names = np.array(feature_names)[nan_column_mask].tolist()
print('Nan features:', nan_feature_names)

if use_features is not None:
    # Explicit whitelist: translate the feature names into column indices.
    used_feature_idx = [feature_names.index(f) for f in use_features]
    unused_feature_idx = []
    print('Used features:', use_features)
else:
    # No whitelist: get_used_feature_idx() falls back to "every column not in
    # unused_feature_idx", so that name has to exist on this path as well.
    # NOTE(review): excluding the NaN-containing columns is assumed to be the
    # intended fallback -- confirm.
    used_feature_idx = None
    unused_feature_idx = np.flatnonzero(nan_column_mask).tolist()


def get_used_feature_idx():
    """Return the column indices of the features to keep.

    Uses the module-level ``used_feature_idx`` when it is set; otherwise
    returns every index into ``feature_names`` that is not listed in the
    module-level ``unused_feature_idx``.
    """
    if used_feature_idx is None:
        return [
            col for col in range(len(feature_names))
            if col not in unused_feature_idx
        ]
    return used_feature_idx

def remove_unused_features(input_data):
    """Return ``input_data`` restricted to the columns from get_used_feature_idx().

    Also prints the shape of the result next to the shape of the module-level
    ``train_labels`` array.
    """
    kept_columns = get_used_feature_idx()
    selected = input_data[:, kept_columns]
    print(selected.shape, train_labels.shape)
    return selected

# Select the feature columns on the full raw matrix; no rows are dropped here.
train_data = remove_unused_features(input_data=raw_train_data)

Nan features: ['PlateLocationY']
Used features: ['SSC (Violet)-W', 'SSC (Imaging)-W', 'SSC (Violet)-A', 'Size (SSC (Imaging))', 'Diffusivity (SSC (Imaging))', 'SSC (Imaging)-A', 'Total Intensity (SSC (Imaging))', 'FSC-W', 'Size (FSC)', 'Diffusivity (FSC)', 'UV4 (440)-H', 'UV5 (460)-H', 'UV3 (420)-H', 'UV6 (475)-H', 'Diffusivity (Green*)', 'B4 (545)-H', 'UV5 (460)-W', 'UV4 (440)-W', 'UV6 (475)-W', 'B5 (575)-H']
(362254, 20) (362254,)


## Restrict to the samples that have a TIFF image (classes 2 and 3)

In [7]:
# Keep only the rows belonging to combined classes 2 and 3.
mask = mask2 | mask3
train_data, train_labels = train_data[mask], train_labels[mask]

In [8]:
# Hold out 10% of the samples as a validation set.
# The shuffle uses a fixed seed so the split -- and every number computed from
# it -- is reproducible across kernel restarts.
SPLIT_SEED = 0
VALID_FRACTION = 0.1

num_samples = len(train_data)
random_indices = np.random.default_rng(SPLIT_SEED).permutation(num_samples)
val_N = int(VALID_FRACTION * num_samples)
print(val_N)

# Split at an explicit position: negative slices such as `[-val_N:]` would
# select *every* row when val_N == 0.
split_at = num_samples - val_N
train_indices = random_indices[:split_at]
valid_indices = random_indices[split_at:]

valid_data = train_data[valid_indices]
valid_labels = train_labels[valid_indices]

train_data = train_data[train_indices]
train_labels = train_labels[train_indices]

print(train_data.shape, valid_data.shape)

1100
(9909, 20) (1100, 20)


In [9]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then standardise those same features.
normalizer = StandardScaler().fit(train_data)
X = normalizer.transform(train_data)

In [10]:
def train_one_svm(f1_idx, f2_idx, filtered_X, filtered_labels):
    """Fit a linear-kernel SVC on two columns of ``filtered_X``.

    Args:
        f1_idx: index of the first feature column.
        f2_idx: index of the second feature column.
        filtered_X: 2-D feature matrix; only the two columns above are used.
        filtered_labels: targets passed to ``fit`` alongside the two columns.

    Returns:
        The fitted ``SVC`` (created with ``probability=True``).
    """
    print('in train_one_svm', filtered_X.shape)
    feature_pair = filtered_X[:, [f1_idx, f2_idx]]
    classifier = SVC(kernel='linear', probability=True)
    classifier.fit(feature_pair, filtered_labels)
    return classifier

def get_filter_mask(svm, cur_X, target_idx):
    """Return an elementwise mask: True where ``svm.predict(cur_X)`` equals ``target_idx``."""
    predicted = svm.predict(cur_X)
    return predicted == target_idx

def get_filtered_training_data(svm, cur_X, cur_y, fidx_tuple, target_idx):
    """Keep only the samples that ``svm`` predicts as ``target_idx``.

    The prediction is made on the columns ``fidx_tuple`` of ``cur_X``; the
    returned feature matrix still contains every column.

    Returns:
        Tuple ``(kept_X, kept_y)`` with the surviving rows and their labels.
    """
    keep = get_filter_mask(svm, cur_X[:, fidx_tuple], target_idx)
    kept_X = cur_X[keep, :]
    kept_y = cur_y[keep]
    return kept_X, kept_y

def get_prediction(svm_list, feature_idx_list, cur_X, target_idx):
    """Combine the cascade's votes into one 0/1 prediction per sample.

    Each classifier in ``svm_list`` predicts on its own columns of ``cur_X``
    (the matching entry of ``feature_idx_list``). A sample gets 1.0 only if
    every classifier predicts ``target_idx`` for it, otherwise 0.0.

    Returns:
        Float array of length ``len(cur_X)`` holding 0.0 / 1.0.
    """
    is_target = np.ones(len(cur_X), dtype=bool)
    for classifier, columns in zip(svm_list, feature_idx_list):
        is_target &= classifier.predict(cur_X[:, columns]) == target_idx
    return is_target.astype(float)

In [11]:
# Train a cascade of two-feature linear SVMs: each stage is fitted on the
# samples that the previous stage predicted as `target_idx`.
target_idx = 3
# NOTE(review): the pair (13, 14) appears twice in this list -- confirm intended.
feature_idx_list = [
    (13, 14), (0, 1), (2, 3), (3, 4), (5, 6), (7, 8),
    (9, 10), (11, 12), (13, 14), (15, 16), (17, 18),
]
svm_list = []
cur_X, cur_y = X, train_labels
for fidx_tuple in feature_idx_list:
    print('Training data', cur_X.shape)
    first_col, second_col = fidx_tuple
    svm = train_one_svm(first_col, second_col, cur_X, cur_y)
    print(pd.Series(cur_y).value_counts())
    svm_list += [svm]
    cur_X, cur_y = get_filtered_training_data(svm, cur_X, cur_y, fidx_tuple, target_idx)
    

Training data (9909, 20)
in train_one_svm (9909, 20)
3.0    7663
2.0    2246
dtype: int64
Training data (7840, 20)
in train_one_svm (7840, 20)
3.0    7593
2.0     247
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64
Training data (7756, 20)
in train_one_svm (7756, 20)
3.0    7592
2.0     164
dtype: int64


In [12]:
# Fraction of training samples on which the cascade's 0/1 output agrees with
# "label == target_idx".
pred = get_prediction(svm_list, feature_idx_list, X, target_idx)
is_target_label = train_labels == target_idx
(pred == is_target_label).mean()

0.9762841860934504

In [None]:
# Weights and intercept of the first SVM in the cascade.
# `svm0` is otherwise only a local name inside train_one_svm, so it has to be
# bound here before use.
# NOTE(review): the first stage is assumed to be the one meant -- confirm.
svm0 = svm_list[0]
w = svm0.coef_[0]
b = svm0.intercept_[0]