In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import pickle
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [3]:
df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,0,0,0,1,0,1,0,0,0,0,...,0,0,21,0,0,0,0,0,0,Class_2
1,1,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,2,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,13,2,0,Class_1
3,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,1,0,Class_4
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,Class_2


In [4]:
test_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,100000,0,0,0,0,0,0,4,4,0,...,0,0,0,0,0,0,0,0,0,0
1,100001,0,0,1,0,0,0,2,0,7,...,3,0,1,0,0,0,1,0,2,1
2,100002,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,6,0
3,100003,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,6,9,14,3
4,100004,0,0,0,0,0,0,1,0,4,...,1,0,0,0,0,0,0,0,0,0


In [5]:
target = df.pop('target')
df.pop('id')
test_df.pop('id')

0        100000
1        100001
2        100002
3        100003
4        100004
          ...  
49995    149995
49996    149996
49997    149997
49998    149998
49999    149999
Name: id, Length: 50000, dtype: int64

In [6]:
def scale(x):
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    return x

In [7]:
df = scale(df)
test_df = scale(test_df)

In [8]:
def pca(x):
    pca = PCA(n_components=50)
    x = pca.fit_transform(x)
    print(sum(pca.explained_variance_ratio_))
    return x

In [9]:
df = pca(df)
test_df = pca(test_df)

0.9999999999999997
0.9999999999999999


In [10]:
df.shape

(100000, 50)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.3, random_state=42)

In [12]:
svm = SVC(verbose=True,probability=True)
svm.fit(X_train, y_train)

[LibSVM]

SVC(probability=True, verbose=True)

In [13]:
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [14]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
gbc.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0)

In [34]:
svm_preds =  svm.predict_proba(test_df)

In [35]:
rfc_preds = rfc.predict_proba(test_df)

In [36]:
gbc_preds = gbc.predict_proba(test_df)

In [37]:
ensemble = (svm_preds + rfc_preds + gbc_preds)/3

In [39]:
svm_preds = pd.DataFrame(data = svm_preds, columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4'])
rfc_preds = pd.DataFrame(data = rfc_preds, columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4'])
gbc_preds = pd.DataFrame(data = gbc_preds, columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4'])
ensemble = pd.DataFrame(data = ensemble, columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4'])

In [40]:
index = []
index = [i for i in range(100000,150000)]
index = pd.DataFrame(data = index, columns = ['id'])

In [41]:
svm_preds = pd.concat((index,svm_preds), axis = 1)
rfc_preds = pd.concat((index,rfc_preds), axis = 1)
gbc_preds = pd.concat((index,gbc_preds), axis = 1)
ensemble = pd.concat((index,ensemble), axis = 1)

In [42]:
svm_preds = svm_preds.set_index('id')
rfc_preds = rfc_preds.set_index('id')
gbc_preds = gbc_preds.set_index('id')
ensemble = ensemble.set_index('id')

In [43]:
svm_preds.to_csv('./svm_preds.csv')
rfc_preds.to_csv('./rfc_preds.csv')
gbc_preds.to_csv('./gbc_preds.csv')
ensemble.to_csv('./ensemble.csv')