In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [4]:
def sigmoid(predict):
    return(1/(1 + np.exp(-predict)))

def loss(y, yhat):
    return -np.mean(y*np.log(yhat)) + (1-y)*np.log(1-yhat)

In [5]:
class Scaler():
    def fit(self, xss, dummy_columns, numeric_columns):
        
        self.numeric_columns = numeric_columns
        self.dummy_columns = dummy_columns
        
        ## process numeric matrix
        numeric_vector = xss.iloc[:,numeric_columns]
        self.mean = np.mean(numeric_vector)
        self.std = np.std(numeric_vector)
        
        ## process dummy matrix
        dummy_vector = xss.iloc[:,dummy_columns]
        self.proportion = np.mean(dummy_vector)
        
    def transform_dummy(self,xs,proportion):
        trans_xs = [1 - proportion if x == 1 else proportion for x in xs]
        return trans_xs
        
    def transform_numeric(self,xs, mean, std):
        xs = (xs-mean)/std
        return(xs)

    def transform(self, xss):
        row, col = xss.shape
        df = []
        for c in range(col):
            xs = xss.iloc[:,c]
            if c in self.dummy_columns:
                dff = xs
                
                idx = self.dummy_columns.index(c)
                proportion = self.proportion[idx]
                dff = self.transform_dummy(xs,proportion)
            else:
                idx = self.numeric_columns.index(c)
                mean = self.mean[idx]
                std = self.std[idx]
                dff = self.transform_numeric(xs,mean,std)
                
            df.append(dff)
        df = (np.column_stack(df))
        return(pd.DataFrame(df))
        

In [6]:
def get_dummy_id(X_train):
    df = X_train.describe().T
    row, col = df.shape
    dummy_columns = []
    numeric_columns = []
    for r in range(row):
        if df['max'][r] <= 1 and df['min'][r] >= 0:
            dummy_columns.append(r)
        else:
            numeric_columns.append(r)
    
    return({'dummy_columns': dummy_columns,
            'numeric_columns': numeric_columns})

In [10]:
def getData():
    X_train = pd.read_csv('data/X_train')
    X_test = pd.read_csv('data/X_test')
    Y_train = pd.read_csv('data/Y_train')
    
    return {'X_train':X_train, 'Y_train':Y_train, 'X_test': X_test}

## read data

In [11]:
res = getData()
X_train = res['X_train']
Y_train = res['Y_train']
X_test = res['X_test']

res = get_dummy_id(X_train)
dummy_columns = res['dummy_columns']
numeric_columns = res['numeric_columns']

scaler = Scaler()
scaler.fit(X_train, dummy_columns, numeric_columns)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [127]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.25, random_state=1)

## tuning the hyper parameters

In [128]:
hyper = range(80,106,2)
his_train = []
his_test = []

for h in hyper:
    #clf = LogisticRegression(random_state=0, solver='newton-cg')
    regr = RandomForestRegressor(max_depth=10, random_state=0,
                            n_estimators=h)
    
    regr.fit(x_train, y_train.values.ravel())
    y_pred = regr.predict(x_train)
    value =[ 0 if i <= 0.5 else 1 for i in y_pred]
    acc = np.mean(value == y_train)
    his_train.append(acc)
    
    y_pred = regr.predict(x_test)
    value =[ 0 if i <= 0.5 else 1 for i in y_pred]
    acc = np.mean(value == y_test)
    his_test.append(acc)
    
    clear_output()
    print(his_test)
    print(his_train)

[label    0.864513
dtype: float64, label    0.864636
dtype: float64, label    0.864513
dtype: float64, label    0.864513
dtype: float64, label    0.86439
dtype: float64, label    0.863899
dtype: float64, label    0.864267
dtype: float64, label    0.864144
dtype: float64, label    0.864144
dtype: float64, label    0.864022
dtype: float64, label    0.863899
dtype: float64, label    0.863776
dtype: float64, label    0.864022
dtype: float64]
[label    0.875471
dtype: float64, label    0.875676
dtype: float64, label    0.875266
dtype: float64, label    0.875184
dtype: float64, label    0.875266
dtype: float64, label    0.875143
dtype: float64, label    0.87498
dtype: float64, label    0.874898
dtype: float64, label    0.874816
dtype: float64, label    0.874898
dtype: float64, label    0.875143
dtype: float64, label    0.875307
dtype: float64, label    0.875512
dtype: float64]


## implement random forest 

In [130]:
regr = RandomForestRegressor(max_depth=10, random_state=0,
                            n_estimators=82)

regr.fit(X_train,Y_train.values.ravel())
y_pred = regr.predict(X_test)

## implement logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='newton-cg').fit(X_train, Y_train.values.ravel())
y_pred = clf.predict(X_test)

## implement pytorch

## submit answer

In [131]:
outfile = 'random-forest-10-82'
row , col = X_test.shape
ans = []
for i in range((row)):
    val = int(y_pred[i] > 0.5)
    ans.append([i+1 ,val])

ans = pd.DataFrame(ans,columns=['id', 'label'])
ans.to_csv('data/'+ outfile +'.csv', index=False)

In [136]:
lab1 = (pd.read_csv('data/random-forest-10-80.csv')['label'])
lab2 = (pd.read_csv('data/random-forest-10-82.csv')['label'])
lab3 = (pd.read_csv('data/random.csv')['label'])

np.mean(lab1==lab2), np.mean(lab2==lab3),np.mean(lab1==lab3)

(0.9991401019593391, 0.9834776733615871, 0.9833548307843498)