In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_excel("Data/tr.xlsx")

In [2]:
over_missing_sample = []
row_value_count = train_data.apply(pd.Series.value_counts,axis=1,dropna=False)

In [3]:
import tools
#Data transform
train_y = pd.DataFrame(train_data['outcome']) 
train_X = pd.DataFrame(train_data.drop(['outcome'],axis=1))

train_X = tools.data_transform(train_X)

In [4]:
threshhold = len(train_X.columns)*0.25
over_missing = row_value_count[np.nan]<=threshhold
class_0 = train_y['outcome']==1

print(sorted(train_y.value_counts()))
train_X = train_X[over_missing|class_0]
train_y = train_y[over_missing|class_0]
print(sorted(train_y.value_counts()))

[879, 51280]
[879, 27076]


In [5]:
# Data split
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(
    train_X ,
    train_y,
    test_size=0.2,
    random_state=42)

print(sorted(train_y.value_counts()))
print(sorted(val_y.value_counts()))

[718, 21646]
[161, 5430]


In [6]:
from imblearn.under_sampling import RandomUnderSampler as RUS

print(sorted(train_y.value_counts()))

rus = RUS(sampling_strategy=0.1,random_state=42)
train_X,train_y = rus.fit_resample(train_X,train_y)

print(sorted(train_y.value_counts()))

[718, 21646]
[718, 7180]


In [7]:
from sklearn.feature_selection import VarianceThreshold as VT

vt= VT(0.2)

vt.fit(train_X)
print(train_X.shape)

train_X = pd.DataFrame(vt.transform(train_X)) 
val_X =  pd.DataFrame(vt.transform(val_X))

print(train_X.shape)

(7898, 66)
(7898, 30)


In [8]:
# Missing value imputation
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import tools
feature_kind = tools.init_feature_kind(train_X)
cont,cate = tools.get_feature_kind(train_X,feature_kind)  

strategy = 'median'

imp_mean = IterativeImputer(max_iter=100,random_state=0)
imp_mean.fit(train_X[cont])

train_X[cont] = imp_mean.transform(train_X[cont])
val_X[cont] = imp_mean.transform(val_X[cont])

print("filled continuous missing value with "+strategy)

strategy = 'constant'

imp = SimpleImputer(missing_values=np.nan, strategy=strategy,fill_value=10.0)
imp.fit(train_X[cate])

train_X[cate] = imp.transform(train_X[cate])
val_X[cate] = imp.transform(val_X[cate])


print("filled nominal missing value with ",strategy)

filled continuous missing value with median
filled nominal missing value with  constant


In [9]:
from sklearn import preprocessing


cont,cate = tools.get_feature_kind(train_X,feature_kind)

scaler = preprocessing.StandardScaler().fit(train_X[cont])
train_X[cont] = scaler.transform(train_X[cont])
val_X[cont] = scaler.transform(val_X[cont])

In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
best_params = {'C': 100, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf'}

parameters = {
    'C':[0.1,1,10,100],
    'degree':[1,2,3,4],
    'gamma':('scale','auto'),
    'kernel':('linear','poly','rbf','sigmoid'),
    
    }
svc = SVC(class_weight={0:0.1,1:1},max_iter=1000)
GS = GridSearchCV(svc, parameters,n_jobs=-1,scoring='f1')
GS.fit(train_X, train_y['outcome'])


sorted(GS.cv_results_.keys())



['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_degree',
 'param_gamma',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [11]:
def set_pandas_display_options() -> None:
    display = pd.options.display
    display.max_columns = 100
    display.max_rows = 100
    display.max_colwidth = 199
    display.width = None
set_pandas_display_options()

In [16]:
print(GS.best_params_)
#print(pd.DataFrame(GS.cv_results_))

{'C': 100, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf'}


In [14]:

best_params = {'C': 100, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf','class_weight':{0:0.1,1:1}}

clf = SVC(**best_params)



clf = SVC(**best_params)

clf.fit(train_X,train_y['outcome'])
result = clf.predict(val_X)
print(pd.DataFrame({
    'train':tools.get_performance(train_y,clf.predict(train_X)),
    'val':tools.get_performance(val_y,result)
    }
    ))

                                      train                       val
confusion matrix   [[6787, 393], [10, 708]]  [[4969, 461], [48, 113]]
acc                                0.948974                  0.908961
precision                          0.643052                  0.196864
f1_score                            0.77845                  0.307483
recall                             0.986072                  0.701863
matthews_corrcoef                  0.772999                  0.339934
