In [1]:
import pickle
import shutil
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC

# ignore warnings
warnings.simplefilter('ignore')

In [2]:
# Seed shared by the CV splitters and the models defined further down.
random_state = 72

# Start every run from empty output folders so pickles written by a previous
# run are not left behind. Pure-Python equivalent of `rm -rf <dir>` followed by
# `mkdir <dir>`, so it does not depend on a POSIX shell being available.
for output_dir in (Path('predictions2'), Path('oofs2')):
    shutil.rmtree(output_dir, ignore_errors=True)
    output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the cleaned train/test tables; the first CSV column becomes the index.
train_df = pd.read_csv(
    'data/cleaned_train.csv',
    index_col=0,
)
test_df = pd.read_csv(
    'data/cleaned_test.csv',
    index_col=0,
)

In [4]:
# Feature columns = every training column except the two label columns.
label_columns = ['class4', 'class2']
features = train_df.columns.drop(label_columns).tolist()

In [5]:
# Keep only the training rows whose binary label (class2) is 'event'.
is_event = train_df['class2'].eq('event')
event_train = train_df.loc[is_event]
event_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,class2,class4
4,-0.780036,-0.67099,-0.669236,0.527349,0.005534,-1.001033,-0.53222,0.292039,-0.292112,-0.471555,...,-0.119527,-0.304354,0.505167,-0.416789,-0.84378,-0.626883,-0.814273,-1.054749,event,Ib
5,-0.599112,-0.595942,-0.63461,1.022967,0.470993,-1.245619,-0.6027,0.55394,0.035291,0.082399,...,-2.465953,2.575293,0.513758,-0.411206,-0.874503,-0.427167,-0.739697,-0.313856,event,II
6,-0.704235,-0.663633,-0.660192,1.430529,0.88599,-0.927109,-0.758143,0.977237,-0.251718,-0.36797,...,0.633667,0.07398,0.543975,-0.434971,-0.24428,-0.36523,-0.503018,-0.685542,event,II
7,-0.521596,-0.666978,-0.635634,1.423584,0.818817,-0.733433,-0.225165,1.213665,-0.168807,-0.189078,...,0.346103,0.742595,0.536949,-0.403696,-0.449119,0.068332,-0.011604,-0.607945,event,Ib
9,-0.768058,-0.726051,-0.727538,1.074819,1.629404,-1.048424,-0.368766,0.47414,-0.360203,-0.411925,...,-0.53214,-0.456377,0.497521,-0.449575,-0.342285,-0.3517,-0.988119,-0.84409,event,Ia


In [6]:
# Positional row indexes into test_df, loaded from a pickle.
# NOTE(review): presumably the test rows an earlier stage predicted as
# 'event' - confirm against the notebook that writes this file.
# pickle.load executes arbitrary code; only use it on files you produced.
with open('data/stage2_indexes.pkl', 'rb') as index_file:
    stage2_indexes = pickle.load(index_file)

event_test = test_df.iloc[stage2_indexes]
(len(event_test), len(test_df))

(452, 965)

In [7]:
# Feature matrices for the event-only train rows and the selected test rows.
X = event_train[features]
X_test = event_test[features]

# Integer-encode the class4 labels of the event rows.
class4_codes = {'Ia': 0, 'Ib': 1, 'II': 2}
y4 = event_train['class4'].map(class4_codes)

In [8]:
def mapper(x):
    """Collapse a value to a binary flag: 0 stays 0, anything else becomes 1."""
    return 1 if x != 0 else 0
    
def process(x):
    """Apply ``mapper`` to every element of the iterable ``x``.

    Returns a new list, in the same order as ``x``, holding the 0/1 flag
    produced by ``mapper`` for each element.
    """
    return [mapper(value) for value in x]


def run_cv(get_model, stratified=True, name='dummy', test=False, select=None):
    """Run 5-fold cross-validation of a class4 classifier and save its outputs.

    Reads the notebook-level ``X``, ``y4``, ``X_test`` and ``random_state``.
    For every fold a fresh estimator from ``get_model()`` is fitted on the
    training part and used to predict the held-out part.

    Parameters
    ----------
    get_model : callable
        Zero-argument factory returning an unfitted estimator with ``fit`` and
        ``predict`` (and ``predict_proba`` unless ``test`` is true).
    stratified : bool, default True
        Split with ``StratifiedKFold`` on ``y4`` when true, plain ``KFold``
        otherwise. Both shuffle and are seeded with ``random_state``.
    name : str, default 'dummy'
        Prefix of the pickle files written to ``predictions2/`` and ``oofs2/``.
    test : bool, default False
        Dry run. When true only the accuracy is computed and nothing is
        written to disk, so a quick experiment cannot overwrite a previously
        saved out-of-fold file with placeholder values.
    select : int or None, default None
        When given, it is passed to ``PCA(n_components=...)`` and the models
        are trained on the projected features instead of the raw ones.

    Returns
    -------
    float
        Accuracy of the out-of-fold class predictions against ``y4``.

    Side effects (only when ``test`` is false)
    ------------------------------------------
    * ``predictions2/{name}_{fold}.pkl``: ``predict_proba`` on the test
      features from the model of each fold.
    * ``oofs2/{name}.pkl``: list with one out-of-fold probability row per
      training sample, in the row order of ``y4``.
    """
    if select is not None:
        # NOTE(review): the PCA is fitted on train and test rows stacked
        # together, so the projection also depends on the test features.
        n_train = len(X)
        # Seeded so the projection is reproducible when a randomized SVD
        # solver is selected.
        pca = PCA(n_components=select, random_state=random_state)
        projected = pca.fit_transform(pd.concat([X, X_test]))
        current_X = pd.DataFrame(projected[:n_train])
        current_X_test = pd.DataFrame(projected[n_train:])
    else:
        current_X = X.copy()
        current_X_test = X_test.copy()

    splitter_cls = StratifiedKFold if stratified else KFold
    kf = splitter_cls(n_splits=5, shuffle=True, random_state=random_state)

    oof4 = np.zeros(len(y4))
    # Kept as a plain list (one entry per sample) so the pickled object has
    # the same type as before.
    oof_probas = [0 for _ in range(len(y4))]

    for fold, (train_index, val_index) in enumerate(kf.split(current_X, y4)):
        X_train, X_val = current_X.iloc[train_index], current_X.iloc[val_index]
        y_train = y4.iloc[train_index]

        model = get_model()
        model.fit(X_train, y_train)
        oof4[val_index] = model.predict(X_val)

        if test:
            continue

        # Test-set probabilities from this fold's model.
        preds = model.predict_proba(current_X_test)
        with open(f'predictions2/{name}_{fold}.pkl', 'wb') as f:
            pickle.dump(preds, f)

        # Out-of-fold probabilities, stored at each sample's original position.
        oof_proba = model.predict_proba(X_val)
        for position, proba_row in zip(val_index, oof_proba):
            oof_probas[position] = proba_row

    if not test:
        with open(f'oofs2/{name}.pkl', 'wb') as f:
            pickle.dump(oof_probas, f)

    return accuracy_score(y4, oof4)

In [9]:
# hyperparameter tuning for the random forest (rf); this search is kept commented out
# max_depths = [10 * i for i in range(1,11)]
# max_depths.append(None)

# rf = RandomForestClassifier()
# rf_params = {'n_estimators':[50 * i for i in range(1,21)],
#             'max_depth':max_depths}
# rf_random = RandomizedSearchCV(estimator=rf,param_distributions=rf_params,random_state=random_state)
# rf_random.fit(X, y4)
# print(rf_random.best_params_)

In [10]:
def get_rf_model():
    """Return a fresh, unfitted random forest seeded with ``random_state``."""
    return RandomForestClassifier(
        random_state=random_state,
        n_estimators=450,
        max_depth=50,
    )


run_cv(get_rf_model, name='rf')

0.5458515283842795

In [11]:
# # hyperparameter tuning for gradient boosting (gb); this search is kept commented out
# max_depths = [10 * i for i in range(1,21)]
# max_depths.append(None)

# gb = GradientBoostingClassifier()
# gb_params = {'n_estimators':[50 * i for i in range(1,21)],
#             'max_depth':max_depths}
# gb_random = RandomizedSearchCV(estimator=gb,param_distributions=gb_params,random_state=random_state)
# gb_random.fit(X, y4)
# print(gb_random.best_params_)

In [12]:
def get_gb_model():
    """Return a fresh, unfitted gradient-boosting classifier seeded with ``random_state``."""
    return GradientBoostingClassifier(random_state=random_state)


run_cv(get_gb_model, name='gb')

0.5414847161572053

In [13]:
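# # hyperparameter tuning for logistic regression (lr); kept commented out - LogisticRegression is not imported in this notebook's import cell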
# lr = LogisticRegression()
# lr_params = {'max_iter':[100 * i for i in range(15,36)]}
# lr_random = RandomizedSearchCV(estimator=lr,param_distributions=lr_params,random_state=random_state)
# lr_random.fit(X, y4)
# print(lr_random.best_params_)

In [14]:
def get_svm_model():
    """Return a fresh, unfitted SVC with probability estimates enabled."""
    return SVC(random_state=random_state, probability=True)


# select=20 makes run_cv train on 20 PCA components instead of the raw features.
run_cv(get_svm_model, name='svm', select=20)

0.537117903930131