In [1]:
import pickle
import shutil
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
# ignore warnings
warnings.simplefilter('ignore')

In [2]:
# Single seed shared by the CV splitters and every model below.
random_state = 72

# Recreate the output folders from scratch so pickles left over from an
# earlier run are never mixed with the ones written by this run.
# Pure Python instead of `!rm` / `!mkdir` so the cell also works on
# platforms without a POSIX shell and when exported as a script.
output_dirs = [Path('predictions1'), Path('oofs1')]
for output_dir in output_dirs:
    shutil.rmtree(output_dir, ignore_errors=True)
    output_dir.mkdir()

In [3]:
# Load the train / test tables; the first CSV column is used as the row index.
train_path = 'data/cleaned_train.csv'
test_path = 'data/cleaned_test.csv'
train_df = pd.read_csv(train_path, index_col=0)
test_df = pd.read_csv(test_path, index_col=0)

In [4]:
# Preview the first five training rows (features plus the class2 / class4 labels).
train_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,class2,class4
0,0.23771,-0.309011,-0.44704,-1.323625,-1.34572,-1.253581,-1.051472,-1.19394,-0.045588,-0.21501,...,2.052707,0.927877,0.324054,-0.018559,-1.744173,-1.915203,0.116689,0.138621,nonevent,nonevent
1,-0.622351,-0.878554,-0.80768,-1.229485,-1.210446,-1.036006,-0.906886,-1.318914,0.084745,0.266698,...,0.088352,-0.256184,-2.775113,3.25475,-1.321825,-1.521228,-0.750197,-0.558482,nonevent,nonevent
2,-0.746998,-0.947298,-0.908132,-1.239856,-1.228877,-1.133512,-0.628584,-1.266641,-0.306451,-0.570413,...,-0.1069,-0.264368,0.624072,-0.228494,-1.61313,-1.51376,-0.576771,-1.12515,nonevent,nonevent
3,-0.288672,-0.415869,-0.294635,-1.287518,-1.300299,-0.365133,-0.991164,-1.144773,0.565953,1.151443,...,0.08067,-0.217254,-0.638016,1.102341,-0.408554,-0.929189,-0.650076,-0.274089,nonevent,nonevent
4,-0.780036,-0.67099,-0.669236,0.527349,0.005534,-1.001033,-0.53222,0.292039,-0.292112,-0.471555,...,-0.119527,-0.304354,0.505167,-0.416789,-0.84378,-0.626883,-0.814273,-1.054749,event,Ib


In [5]:
# Preview the first five test rows (features only, no label columns).
test_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,-0.467394,-0.844612,-0.831325,-1.181806,-1.215169,-0.715284,-0.954526,-1.395449,-0.218031,-0.50602,...,0.987728,-1.189677,0.148885,-0.307758,0.561095,-0.409494,-0.569197,-1.037969,-0.722484,-1.11807
1,0.791149,-0.866857,-0.844977,-0.927701,-0.835214,-1.533052,-1.078775,-1.296271,2.130504,1.992439,...,-0.742018,0.699835,4.548434,1.872111,0.473505,-0.486268,-2.36532,-2.388677,0.994464,-0.147984
2,-0.586822,-0.624548,-0.630906,0.717856,0.673655,-0.662732,-0.033306,1.02329,-0.287586,-0.35052,...,-0.728472,0.411539,-0.505118,0.271496,0.599053,-0.33813,-0.173767,-0.15775,-0.513011,-0.606934
3,0.074288,-0.023355,-0.154381,0.975136,1.58563,-0.299773,0.293422,0.309753,-0.305471,-0.50006,...,-0.579328,0.296396,-0.169522,-0.46624,0.502173,-0.498616,-0.278577,0.499394,-0.133755,-0.286908
4,-0.028912,0.331237,0.084871,-0.591826,-0.357407,1.472421,3.580687,-0.194309,-0.157671,-0.275366,...,-0.08018,-0.616217,-1.386771,1.346765,-3.012171,4.245149,0.571922,0.675382,-0.64018,4.412803


In [6]:
# Every training column except the two label columns is a model feature.
# list.remove raises ValueError if a label column is unexpectedly absent.
features = list(train_df.columns)
for label_column in ('class4', 'class2'):
    features.remove(label_column)

In [7]:
# Binary target: 'nonevent' -> 0, 'event' -> 1 (any other label becomes NaN).
class2_encoding = {'nonevent': 0, 'event': 1}
y2 = train_df['class2'].map(class2_encoding)

# Feature matrices for training and for the final test predictions.
X = train_df[features]
X_test = test_df[features]

In [8]:
def mapper(x):
    """Binarise a value: return 1 when ``x`` differs from 0, otherwise 0."""
    return 1 if x != 0 else 0
    
def process(x):
    """Apply ``mapper`` to each element of the iterable ``x``; return the results as a list."""
    return [mapper(value) for value in x]


def run_cv(get_model, stratified=True, name='dummy', test=False, select=None):
    """Run 5-fold cross-validation of a classifier on the notebook's data.

    Reads the notebook-level ``X``, ``y2``, ``X_test`` and ``random_state``.

    Parameters
    ----------
    get_model : callable
        Zero-argument factory returning a new, unfitted estimator. It is
        called once per fold so no fitted state leaks between folds.
    stratified : bool, default True
        Use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.
    name : str, default 'dummy'
        Stem of the pickle files written to ``predictions1/`` and ``oofs1/``.
    test : bool, default False
        When true, nothing is written to disk; only the accuracy is computed.
    select : optional, default None
        When not None, forwarded as ``n_components`` to a PCA and the models
        are trained on the projected features instead of the raw ones.

    Returns
    -------
    float
        Accuracy of the out-of-fold class predictions against ``y2``.

    Side effects (only when ``test`` is false)
    -----------------------------------------
    * ``predictions1/{name}_{fold}.pkl`` - ``predict_proba`` on the test
      features for the model of each fold.
    * ``oofs1/{name}.pkl`` - list with one out-of-fold ``predict_proba`` row
      per training sample, written once after all folds are done.
    """
    if select is not None:
        # NOTE(review): the PCA is fitted on train and test rows stacked
        # together, so the projection depends on the test features.
        # Seeded so the projection is reproducible when a randomized SVD
        # solver is selected.
        pca = PCA(n_components=select, random_state=random_state)
        projected = pca.fit_transform(pd.concat([X, X_test]))
        n_train = len(X)
        current_X = pd.DataFrame(projected[:n_train])
        current_X_test = pd.DataFrame(projected[n_train:])
    else:
        current_X = X.copy()
        current_X_test = X_test.copy()

    splitter_cls = StratifiedKFold if stratified else KFold
    kf = splitter_cls(n_splits=5, shuffle=True, random_state=random_state)

    oof2 = np.zeros(len(y2))                  # out-of-fold class predictions
    oof_probas = [0 for _ in range(len(y2))]  # out-of-fold probability rows
    for fold, (train_index, val_index) in enumerate(kf.split(current_X, y2)):
        # Indices from the splitter are positional, hence .iloc.
        X_train, X_val = current_X.iloc[train_index], current_X.iloc[val_index]
        y_train = y2.iloc[train_index]

        model = get_model()
        model.fit(X_train, y_train)
        oof2[val_index] = model.predict(X_val)

        if not test:
            # Test-set probabilities of this fold's model.
            preds = model.predict_proba(current_X_test)
            with open(f'predictions1/{name}_{fold}.pkl', 'wb') as f:
                pickle.dump(preds, f)
            # Out-of-fold probabilities, stored at each sample's position.
            oof_proba = model.predict_proba(X_val)
            for row, proba in zip(val_index, oof_proba):
                oof_probas[row] = proba

    if not test:
        # Written once, after every sample has received its out-of-fold row,
        # instead of rewriting a partially filled list on every fold.
        with open(f'oofs1/{name}.pkl', 'wb') as f:
            pickle.dump(oof_probas, f)

    return accuracy_score(y2, oof2)

In [9]:
# # Hyper-parameter tuning for the random forest (disabled, kept for reference)
# max_depths = [10 * i for i in range(1,21)]
# max_depths.append(None)

# rf = RandomForestClassifier()
# rf_params = {'n_estimators':[50 * i for i in range(1,21)],
#             'max_depth':max_depths}
# rf_random = RandomizedSearchCV(estimator=rf,param_distributions=rf_params,random_state=random_state)
# rf_random.fit(X, y2)
# print(rf_random.best_params_)

In [10]:
def get_rf_model():
    """Return a new, unfitted random forest seeded with ``random_state``."""
    return RandomForestClassifier(
        n_estimators=100,
        max_depth=50,
        random_state=random_state,
    )

# Cross-validated accuracy of the random forest on the raw features.
run_cv(get_rf_model, name='rf')

0.8646288209606987

In [11]:
# # Hyper-parameter tuning for logistic regression (disabled, kept for reference)
# lr = LogisticRegression()
# lr_params = {'max_iter':[100 * i for i in range(5,16)]}
# lr_random = RandomizedSearchCV(estimator=lr,param_distributions=lr_params,random_state=random_state)
# lr_random.fit(X, y2)
# print(lr_random.best_params_)

In [12]:
def get_lr_model():
    """Return a new, unfitted logistic regression seeded with ``random_state``."""
    return LogisticRegression(max_iter=2000, random_state=random_state)

# Cross-validated accuracy of logistic regression on 20 PCA components.
run_cv(get_lr_model, name='lr', select=20)

0.8602620087336245

In [13]:
# # Hyper-parameter tuning for LightGBM (disabled, kept for reference)
# lgb = LGBMClassifier()
# lgb_params = {'num_leaves':[100 * i for i in range(5,16)]}
# lgb_random = RandomizedSearchCV(estimator=lgb,param_distributions=lgb_params,random_state=random_state)
# lgb_random.fit(X, y2)
# print(lgb_random.best_params_)

In [14]:
def get_lgb_model():
    """Return a new, unfitted LightGBM classifier seeded with ``random_state``."""
    return LGBMClassifier(random_state=random_state)

# Cross-validated accuracy of LightGBM on the raw features.
run_cv(get_lgb_model, name='lgb')

0.8930131004366813