In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [36]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [37]:
combineTrainTest = train.append(test)

In [38]:
print train.shape
print test.shape
print combineTrainTest.shape

(891, 12)
(418, 11)
(1309, 12)


In [40]:
def preprocessing(data):
    data = data.fillna(0)
    
    # Create title feature
    data['Title'] = data['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split('.')[0])
    data.drop(['Name','Ticket','Cabin'], axis=1, inplace=True)
    # Remove Name, Ticket, Cabin Column
    
    categoricalFeatures = ['Sex','Embarked', 'Title']
    data = pd.get_dummies(data,columns = categoricalFeatures) # Create categorical columns
    
    return data

In [41]:
combineTrainTest = preprocessing(combineTrainTest)

In [42]:
train = combineTrainTest[0:891]
test = combineTrainTest[891:]

In [60]:
train.head()

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived,Sex_female,Sex_male,Embarked_0,...,Title_ Master,Title_ Miss,Title_ Mlle,Title_ Mme,Title_ Mr,Title_ Mrs,Title_ Ms,Title_ Rev,Title_ Sir,Title_ the Countess
0,22.0,7.25,0,1,3,1,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,38.0,71.2833,0,2,1,1,1.0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,26.0,7.925,0,3,3,0,1.0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,35.0,53.1,0,4,1,1,1.0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,35.0,8.05,0,5,3,0,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [43]:
featuresList = list(train.columns.values)
featuresList.remove('Survived')

In [45]:
#split features and labels
features = train[featuresList].values
labels = train['Survived'].values

In [46]:
# Cross validation
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.30, random_state=42)

In [47]:
# Naive Bayes
clf = GaussianNB()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)

print accuracy_score(pred,labels_test)

0.727611940299


In [48]:
testFeatures = test[featuresList].values
test['Survived'] = clf.predict(testFeatures)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [56]:
clf2 = DecisionTreeClassifier(min_samples_split = 100, random_state = 1122)
clf2.fit(features_train,labels_train)
pred2 = clf2.predict(features_test)

print accuracy_score(pred2,labels_test)

0.813432835821


In [57]:
clf2.feature_importances_

array([ 0.03170431,  0.13231203,  0.        ,  0.02992318,  0.1844149 ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.02507888,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.55249456,
        0.        ,  0.        ,  0.04407214,  0.        ,  0.        ])

In [58]:
test['Survived'] = clf2.predict(testFeatures)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [59]:
final = test[['PassengerId','Survived']]
final.to_csv('naive_prediction.csv', index=False)