In [1]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import os

In [2]:
datadir = '/Users/ashesh.ashesh/Documents/PhD/mbldata_solvation/data_with_tiff'
use_features = ['SSC (Violet)-W', 'SSC (Imaging)-W', 'SSC (Violet)-A',
       'Size (SSC (Imaging))', 'Diffusivity (SSC (Imaging))',
       'SSC (Imaging)-A', 'Total Intensity (SSC (Imaging))', 'FSC-W',
       'Size (FSC)', 'Diffusivity (FSC)', 'UV4 (440)-H', 'UV5 (460)-H',
       'UV3 (420)-H', 'UV6 (475)-H', 'Diffusivity (Green*)', 'B4 (545)-H',
       'UV5 (460)-W', 'UV4 (440)-W', 'UV6 (475)-W', 'B5 (575)-H']

# load train data
datatype = 'train'
with open(os.path.join(datadir,f"{datatype}_ds.bin"), mode="rb") as f:
    data = pickle.load(f)

def get_data_with_masks(raw_data_dict):
    raw_train_data = data[f"{datatype}_data"]
    raw_train_labels = data[f"{datatype}_labels"]
    feature_names = data["feature_names"]
    # clean_mask = raw_train_data[:,-1] > 0
    # train_data  = raw_train_data[clean_mask,:].copy()
    # train_labels = raw_train_labels[clean_mask].copy()
    # 0, 1 have notiff
    # 2, 3 have tiff
    # 0,2 => label 0
    # 1,3 => label 1
    mask0 = np.logical_and(raw_train_labels == 0, raw_train_data[:,-1] == 0)
    mask1 = np.logical_and(raw_train_labels == 1, raw_train_data[:,-1] == 0)
    mask2 = np.logical_and(raw_train_labels == 0, raw_train_data[:,-1] == 1)
    mask3 = np.logical_and(raw_train_labels == 1, raw_train_data[:,-1] == 1)

    train_labels = -1 * np.ones(raw_train_labels.shape)
    train_labels[mask0] = 0
    train_labels[mask1] = 1
    train_labels[mask2] = 2
    train_labels[mask3] = 3
    assert set(np.unique(train_labels)) == set([0,1,2,3])
    assert np.sum(mask0 * mask1) ==0
    assert np.sum(mask0 * mask2) ==0
    assert np.sum(mask0 * mask3) ==0
    assert np.sum(mask1 * mask2) ==0
    assert np.sum(mask1 * mask3) ==0
    assert np.sum(mask2 * mask3) ==0

    return {'X':raw_train_data, 'y':train_labels, 'feature_names':feature_names, 'masks': [mask0, mask1, mask2, mask3]}

train_data_dict = get_data_with_masks(data)
raw_train_data = train_data_dict['X']
train_labels = train_data_dict['y']
feature_names = train_data_dict['feature_names']
mask0, mask1, mask2, mask3 = train_data_dict['masks']


In [3]:
pd.Series(train_labels).value_counts().sort_index()

0.0    252617
1.0     98628
2.0      2527
3.0      8482
dtype: int64

C6818 is 0
Emiliana is 1, 


In [4]:
nan_feature_names = np.array(feature_names)[np.isnan(raw_train_data).any(axis=0)].tolist()
nan_feature_names
# unused_feature_idx = [feature_names.index(f) for f in nan_feature_names]
# unused_feature_idx += [feature_names.index(f) for f in unused_features]
used_feature_idx = [feature_names.index(f) for f in use_features] if use_features is not None else None

print('Nan features:', nan_feature_names)
# print('Unused features:', unused_features)
if use_features is not None:
    print('Used features:', use_features)
    unused_feature_idx  = []


def get_used_feature_idx():
    if used_feature_idx is not None:
        return used_feature_idx
    
    valid_feature_idx = []
    for i in range(len(feature_names)):
        if i not in unused_feature_idx:
            valid_feature_idx.append(i)
    return valid_feature_idx

def remove_unused_features(input_data):
    valid_feature_idx = get_used_feature_idx()
    input_data = input_data[:, valid_feature_idx]
    print(input_data.shape, train_labels.shape)
    return input_data

# train_data = remove_unused_features(train_data)
train_data = remove_unused_features(raw_train_data)

Nan features: ['PlateLocationY']
Used features: ['SSC (Violet)-W', 'SSC (Imaging)-W', 'SSC (Violet)-A', 'Size (SSC (Imaging))', 'Diffusivity (SSC (Imaging))', 'SSC (Imaging)-A', 'Total Intensity (SSC (Imaging))', 'FSC-W', 'Size (FSC)', 'Diffusivity (FSC)', 'UV4 (440)-H', 'UV5 (460)-H', 'UV3 (420)-H', 'UV6 (475)-H', 'Diffusivity (Green*)', 'B4 (545)-H', 'UV5 (460)-W', 'UV4 (440)-W', 'UV6 (475)-W', 'B5 (575)-H']
(362254, 20) (362254,)


## Just working with relevant data.

In [46]:
target_idx = 1

In [47]:
all_masks = [mask0, mask1, mask2, mask3]
pos_mask = all_masks[target_idx]
posN = min(np.sum(pos_mask), max(np.sum(mask2), np.sum(mask3)))
if posN < pos_mask.sum():
    print('subsampling the positive class')
    pos_idx = np.random.choice(np.where(pos_mask)[0], size=posN)
    big_pos_mask = pos_mask.copy()
    pos_mask = np.zeros_like(pos_mask) != 0
    pos_mask[pos_idx] = True
    
neg_idx_list = []
for idx in range(4):
    if idx == target_idx:
        continue
    neg_mask = all_masks[idx]
    neg_idx_list.append(np.random.choice(np.where(mask0)[0], size= posN//3))

neg_idx = np.concatenate(neg_idx_list)

neg_mask = np.zeros_like(pos_mask) != 0
neg_mask[neg_idx] = True



subsampling the positive class


In [48]:
assert np.logical_and(pos_mask, neg_mask).sum() ==0
mask = np.logical_or(pos_mask, neg_mask)

cur_train_data = train_data[mask,:]
cur_train_labels = train_labels[mask]
cur_train_labels = (cur_train_labels == target_idx ).astype(int)
print(cur_train_data.shape)

(16464, 20)


In [49]:
# get a validation set
random_indices = np.random.choice(len(cur_train_data), len(cur_train_data), replace=False)
val_N = int(0.1*len(cur_train_data))
print(val_N)
valid_data = cur_train_data[random_indices[-val_N:]]
cur_valid_labels = cur_train_labels[random_indices[-val_N:]]

cur_train_data = cur_train_data[random_indices[:-val_N]]
cur_train_labels = cur_train_labels[random_indices[:-val_N]]

print(cur_train_data.shape, valid_data.shape)

1646
(14818, 20) (1646, 20)


In [50]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

normalizer = StandardScaler()
X = normalizer.fit_transform(cur_train_data)

In [51]:
def train_one_svm(f1_idx, f2_idx, filtered_X, filtered_labels):
    svm0 = SVC(kernel='linear', probability=True)
    print('in train_one_svm', filtered_X.shape)
    svm0.fit(filtered_X[:,[f1_idx,f2_idx]], filtered_labels)
    return svm0

def get_filter_mask(svm, cur_X, target_idx):
    mask = svm.predict(cur_X) == target_idx
    return mask

def get_filtered_training_data(svm, cur_X, cur_y, fidx_tuple):
    mask = get_filter_mask(svm, cur_X[:,fidx_tuple], 1)
    return cur_X[mask,:], cur_y[mask]

def get_prediction(svm_list,feature_idx_list, cur_X):
    target_idx = 1
    output = np.ones(len(cur_X)) * -955
    for svm, fidx_tuple in zip(svm_list, feature_idx_list):
        cur_pred = svm.predict(cur_X[:,fidx_tuple])
        everythingelsemask = cur_pred != target_idx
        output[everythingelsemask] = 0
    
    output[output != 0] = 1
    return output

In [52]:
pd.Series(cur_valid_labels).value_counts()

0    847
1    799
dtype: int64

In [53]:
feature_idx_list = [(0,1), (2,3), (3,4), (5,6), (7,8), (9,10), (11,12), (13,14), (15,16), (17,18)]
svm_list = []
cur_X = X
cur_y = cur_train_labels
for fidx_tuple in feature_idx_list:
    print('Training data', cur_X.shape)
    svm = train_one_svm(fidx_tuple[0], fidx_tuple[1], cur_X, cur_y)
    print('Target', pd.Series(cur_y).value_counts())
    svm_list.append(svm)
    cur_X,cur_y = get_filtered_training_data(svm, cur_X, cur_y, fidx_tuple)
    

Training data (14818, 20)
in train_one_svm (14818, 20)


In [33]:
pred = get_prediction(svm_list,feature_idx_list, X)
np.mean(pred == (cur_train_labels==1))

0.9566365837552279

In [34]:
pd.Series(pred).value_counts()

0.0    2438
1.0    2105
dtype: int64

In [35]:
with open(os.path.join(datadir,f"test_ds.bin"), mode="rb") as f:
    test_data_dict = pickle.load(f)

test_data_dict = get_data_with_masks(data)
raw_test_data = test_data_dict['X']
test_labels = test_data_dict['y']
# feature_names = test_data_dict['feature_names']
# mask0, mask1, mask2, mask3 = test_data_dict['masks']



In [36]:
testX = normalizer.transform(remove_unused_features(raw_test_data))

(362254, 20) (362254,)


In [37]:
pred = get_prediction(svm_list,feature_idx_list, testX)
np.mean(pred == (test_labels==target_idx))

0.9959834812037962

In [38]:
predtest_fname = f'testPrediction_class{target_idx}.npy'
np.save(predtest_fname, pred)

In [39]:
!ls .

baseline_allFeatures_flexibleRange.ipynb
[31mclassification.ipynb[m[m
[31mclassification_4class.ipynb[m[m
[31mdataset.ipynb[m[m
gating_1.ipynb
[31msvm_sequential_classification.ipynb[m[m
testPrediction_class2.npy
testPrediction_class3.npy


In [19]:
# w = svm0.coef_[0]   
# b = svm0.intercept_[0]     