In [1]:
import pickle

import matplotlib
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/train.csv')


In [3]:
# Split each Cabin value into its leading letter and the remainder after it.
# A missing cabin keeps the placeholder letter 'n' (the first character of
# str(nan), as before) and the number 0.
cabin_loc_let = []
cabin_loc_num = []

for cabin in df['Cabin']:
    if pd.isna(cabin):
        # Test for a missing value explicitly. Matching on the 'a' in 'nan'
        # would also treat a real cabin whose second character is 'a' as missing.
        cabin_loc_let.append('n')
        cabin_loc_num.append(0)
        continue

    text = str(cabin)
    cabin_loc_let.append(text[0])
    # A cabin that is only a letter has nothing after it; indexing text[1]
    # would raise IndexError, so fall back to 0 like the missing case.
    # The remainder is kept as the original string slice so downstream cells
    # see the same values as before.
    cabin_loc_num.append(text[1:] if len(text) > 1 else 0)

df['cabin_loc_let'] = cabin_loc_let
df['cabin_loc_num'] = cabin_loc_num

In [4]:
# Integer-encode the four categorical columns. Each column gets its own
# LabelEncoder so the fitted classes can be mapped back to labels later.
cll_le, sex_le, pclass_le, embarked_le = (
    preprocessing.LabelEncoder() for _ in range(4)
)

cll_enc = cll_le.fit_transform(df['cabin_loc_let'])
sex_enc = sex_le.fit_transform(df['Sex'])
pclass_enc = pclass_le.fit_transform(df['Pclass'])
embarked_enc = embarked_le.fit_transform(df['Embarked'])

In [5]:
# One-hot encode the label codes for cabin letter, sex, Pclass and Embarked.
# The encoder is built with its defaults: output is sparse by default, and the
# old `sparse=` keyword is no longer accepted by current scikit-learn releases.
enc = OneHotEncoder()

# OneHotEncoder expects 2-D input, so turn each 1-D code array into a column.
cll_enc = cll_enc.reshape(-1,1)
sex_enc = sex_enc.reshape(-1,1)
pclass_enc = pclass_enc.reshape(-1,1)
embarked_enc = embarked_enc.reshape(-1,1)


# The single encoder is refit for every feature, so `categories_` is captured
# right after each fit, before the next fit replaces it.
cll_onehot = enc.fit_transform(cll_enc).toarray()
cll_cat = enc.categories_
sex_onehot = enc.fit_transform(sex_enc).toarray()
sex_cat = enc.categories_
pclass_onehot = enc.fit_transform(pclass_enc).toarray()
pclass_cat = enc.categories_
embarked_onehot = enc.fit_transform(embarked_enc).toarray()
embarked_cat = enc.categories_


In [6]:
# get column names
# Each *_cat is the encoder's `categories_` list for a single input column.
# Convert it to an array and reshape it into one column of label codes.
cll_cat = np.asarray(cll_cat).reshape(-1,1)
sex_cat = np.asarray(sex_cat).reshape(-1,1)
pclass_cat = np.asarray(pclass_cat).reshape(-1,1)
embarked_cat = np.asarray(embarked_cat).reshape(-1,1)

# Map the codes back to the original labels. LabelEncoder works on 1-D input,
# so a flattened view is passed; handing it the column vector itself makes
# scikit-learn emit a DataConversionWarning before flattening it anyway.
cll_inv = cll_le.inverse_transform(cll_cat.ravel())
sex_inv = sex_le.inverse_transform(sex_cat.ravel())
pclass_inv = pclass_le.inverse_transform(pclass_cat.ravel())
embarked_inv = embarked_le.inverse_transform(embarked_cat.ravel())


  return f(*args, **kwargs)


In [7]:
# Wrap each one-hot matrix in a frame whose column labels are the decoded
# category values.
# NOTE(review): labels are the raw category values, so two features could
# share a label (e.g. a cabin letter and an embarkation code) and the Pclass
# labels may be non-string -- confirm against the data.
cll_df = pd.DataFrame(cll_onehot, columns=cll_inv)
sex_df = pd.DataFrame(sex_onehot, columns=sex_inv)
pclass_df = pd.DataFrame(pclass_onehot, columns=pclass_inv)
embarked_df = pd.DataFrame(embarked_onehot, columns=embarked_inv)

# Append the indicator columns to the right of the original frame.
df2 = pd.concat(
    objs=[df, cll_df, sex_df, pclass_df, embarked_df],
    axis='columns',
)


In [8]:
# Remove the raw categorical/text columns now that their one-hot versions
# exist (Ticket and Name are dropped without being encoded).
# NOTE(review): any identifier column present in the CSV is not dropped here
# and would stay in df2 -- confirm that is intended.
df2 = df2.drop(
    columns=[
        'Ticket',
        'Sex',
        'Cabin',
        'Embarked',
        'Pclass',
        'Name',
        'cabin_loc_let',
    ]
)

In [9]:
# Impute missing Age and Fare values with the mean of the respective column.
mean_age = df2['Age'].mean()
mean_fare = df2['Fare'].mean()
for column, fill_value in (('Age', mean_age), ('Fare', mean_fare)):
    df2[column] = df2[column].fillna(fill_value)

# Display the columns that still contain missing values.
df2.columns[df2.isnull().any(axis=0)].tolist()



[]

In [23]:
# gbm model, iterate for params
# Features are every column of df2 except the target; the target is 'Survived'.
X_train = df2.drop(['Survived'], axis = 1)
y_train = df2['Survived']

In [24]:
# Load the test CSV; the path is relative to the notebook's working directory.
# NOTE(review): df3 is not used by the training and saving cells that follow;
# presumably it is scored further down -- confirm.
df3 = pd.read_csv('./data/test.csv')

In [25]:
def _fit_gbm(X, y, n_estimators):
    """Fit a gradient-boosting classifier with ``n_estimators`` boosting stages.

    Every other hyper-parameter is fixed: learning rate 0.1, trees of depth 1
    and ``random_state=7`` so repeated runs give the same model.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.
    n_estimators : int
        Number of boosting stages.

    Returns
    -------
    The fitted ``GradientBoostingClassifier``.
    """
    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=.1,
        max_depth=1,
        random_state=7,
        # verbose must be an integer; a float such as 0.1 is rejected by the
        # parameter validation in recent scikit-learn releases. 1 reports
        # progress at a decreasing frequency instead of flooding the output.
        verbose=1,
    )
    return model.fit(X, y)


# Train the same configuration at four ensemble sizes for comparison.
clf10 = _fit_gbm(X_train, y_train, 10)
clf100 = _fit_gbm(X_train, y_train, 100)
clf500 = _fit_gbm(X_train, y_train, 500)
clf1000 = _fit_gbm(X_train, y_train, 1000)



      Iter       Train Loss   Remaining Time 
         1           1.3164            0.82s
         2           1.2769            0.79s
         3           1.2447            0.73s
         4           1.2184            0.62s
         5           1.1968            0.51s
         6           1.1791            0.41s
         7           1.1630            0.31s
         8           1.1494            0.21s
         9           1.1370            0.10s
        10           1.1260            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3164           11.48s
         2           1.2769           11.56s
         3           1.2447           11.15s
         4           1.2184           11.04s
         5           1.1968           10.72s
         6           1.1791           10.36s
         7           1.1630           10.04s
         8           1.1494            9.84s
         9           1.1370            9.65s
        10           1.1260            9.53s
        

In [27]:
# save model as file
# Each fitted classifier is pickled to '<name>.sav' in the working directory.
for filename, model in (
    ('clf10.sav', clf10),
    ('clf100.sav', clf100),
    ('clf500.sav', clf500),
    ('clf1000.sav', clf1000),
):
    # The context manager closes the handle as soon as the dump finishes, so
    # the data is flushed to disk and no file descriptor is left open.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

In [12]:
# use model for test