In [1]:
# Kaggle: Titanic competition

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Importing the dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Dropping unecessary features
train = train.drop(['PassengerId','Ticket','Cabin','Name','Embarked'], axis=1)
PassengerId = test['PassengerId']
test = test.drop(['Name','Ticket','Cabin','Name','Embarked','PassengerId'], axis=1)

# Filling missing values in the training set, with the most occurency
test['Fare'].fillna(test['Fare'].median(), inplace = True)
train['Age'].fillna(train['Age'].mean(), inplace = True)
test['Age'].fillna(test['Age'].mean(), inplace = True)

# Getting childs 
def get_person(passenger):
    age, sex = passenger
    return 'child' if age < 16 else sex

train['Person'] = train[['Age','Sex']].apply(get_person, axis=1)
test['Person'] = test[['Age','Sex']].apply(get_person, axis=1)

train.drop(['Sex'],axis = 1, inplace = True)
test.drop(['Sex'],axis = 1, inplace = True)

# create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
person_dummies_titanic  = pd.get_dummies(train['Person'])
person_dummies_titanic.columns = ['Child','Female','Male']
person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

person_dummies_test  = pd.get_dummies(test['Person'])
person_dummies_test.columns = ['Child','Female','Male']
person_dummies_test.drop(['Male'], axis=1, inplace=True)

train = train.join(person_dummies_titanic)
test = test.join(person_dummies_test)

train.drop(['Person'], axis = 1, inplace = True)
test.drop(['Person'], axis = 1, inplace = True)

# create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
sns.factorplot('Pclass','Survived',order=[1,2,3], data=train,size=5)
pclass_dummies = pd.get_dummies(train['Pclass'])
pclass_dummies.columns = ['class1','class2','class3']
pclass_dummies.drop(['class3'], axis=1, inplace = True)

pclass_dummies_train = pd.get_dummies(train['Pclass'])
pclass_dummies_train.columns = ['class1','class2','class3']
pclass_dummies_train.drop(['class3'], axis=1, inplace = True)

pclass_dummies_test = pd.get_dummies(test['Pclass'])
pclass_dummies_test.columns = ['class1','class2','class3']
pclass_dummies_test.drop(['class3'], axis=1, inplace = True)

train.drop(['Pclass'], axis=1, inplace = True)
test.drop(['Pclass'], axis=1, inplace = True)

# Handling family members
train['Family'] = train['SibSp'] + train['Parch']
train['Family'].loc[train['Family'] > 0] = 1
train['Family'].loc[train['Family'] == 0] = 0

test['Family'] = test['SibSp'] + test['Parch']
test['Family'].loc[test['Family'] > 0] = 1
test['Family'].loc[test['Family'] == 0] = 0

train.drop(['SibSp','Parch'],axis=1, inplace=True)
test.drop(['SibSp','Parch'],axis=1, inplace=True)

# Getting the training data and target variable
y_train = train['Survived']
X_train = train.drop(['Survived'], axis=1)
X_test = test

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

  stat_data = remove_na(group_data)
The minimum supported version is 2.4.6

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [2]:
from hyperopt import fmin, rand, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

In [3]:
space_base = {'n_estimators': 200,
            'criterion': 'entropy',
            'max_depth': 2,
            'min_samples_split': 2}
            #'min_samples_leaf': 2}

space_hp = {'n_estimators': hp.choice('estm', [100,200,300,400,500]),
            'criterion': hp.choice('crit', ['gini', 'entropy']),
            'max_depth': hp.quniform('dep', 1,10,1),
            'min_samples_split': hp.choice('samp', [0.1, 0.3, 0.5])}
            #'min_samples_leaf': hp.quniform('leaf',2,10,1)}

In [4]:
def optim_target(X_train, y_train, n_estimators, criterion, max_depth, min_samples_split, random_state = 0):
    np.random.seed(42)
    
    classifier = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, random_state = 0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_train)
    
    acc = accuracy_score(y_train, y_pred)
    
    params = {'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split}
            #'min_samples_leaf': min_samples_leaf}
    
    return acc, params 

In [5]:
acc, params = optim_target(X_train, y_train, **space_base)

print('{}'.format(acc))

0.792368125701459


In [6]:
# wrapper for hyperopt #

# hyperopt minimizes functions, so our target value is set
class hp_wrapper:
    
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def target(self, space):
        
        acc, params = optim_target(self.X_train, self.y_train, **space)
    
        # a dict with 'loss' and 'status' is required
        return {'loss': 1-acc,
                'status': STATUS_OK,
                'parameters': params}

In [7]:
opt_task = hp_wrapper(X_train, y_train)
opt_task.target(space_base)['loss']

0.20763187429854102

In [8]:
# random search #

# initializing wrapper - hyperopt
opt_task_hp = hp_wrapper(X_train, y_train)

# trials object stores the evaluations
trials_rand = Trials()

# using the fmin function from hyperopt
best = fmin(fn=opt_task_hp.target, algo=rand.suggest, space=space_hp, max_evals=50, trials=trials_rand)

# storing the results
#experiment_dict['well_sep']['rand'] = [trials_rand.trials[i]['result'] for i in range(len(trials_rand.trials))]

In [9]:
print(best)

{'crit': 1, 'samp': 0, 'estm': 0, 'dep': 9.0}


In [10]:
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=7.0, min_samples_split=0.1, random_state = 0)
classifier.fit(X_train, y_train)
classifier.score(X_train, y_train)

0.8002244668911336