In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from statistics import mean 
from sklearn.model_selection import train_test_split

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [9]:
import warnings
warnings.filterwarnings("ignore")

#taking data

train_data= pd.read_csv("data/train.csv")
test_data=pd.read_csv("data/test.csv")

In [35]:
#Preprocessing the data


###################### train_data is preprocessed ########################

#missing Age with mean value 
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
#missing Fare with back fill value 
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)

#missing Cabin with Unique value 
train_data['Cabin'].fillna('Missing', inplace=True)

#missing Embarked with most embarked value using Histogram 
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
    
#missing every thing else with front fill value 
train_data.fillna(method='ffill', inplace=True)
    
###################  TEST DATA IS PREPROCESSED ###########################
    
#Impute Age with mean value 
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)

#Impute Fare with back fill value 
test_data['Fare'].fillna(test_data["Fare"].mean(), inplace=True)

#Impute Cabin with Unique value 
test_data['Cabin'].fillna('Missing', inplace=True)

#Impute Embarked with most embarked value using Histogram 
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)
    
#Impute every thing else with front fill value 
test_data.fillna(method='ffill', inplace=True)

In [36]:
#Create feautures and build the subset out of the input data 
features = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
train_X = train_data[features]
train_y = train_data.Survived


#Split input data 
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, random_state = 1)


#Building test data 
test_X = test_data[features]

In [37]:
### label encode the categorical values and convert them to numbers 
le = LabelEncoder()
le.fit(train_X['Sex'].astype(str))
train_X['Sex'] = le.transform(train_X['Sex'].astype(str))
test_X['Sex'] = le.transform(test_X['Sex'].astype(str))



In [38]:
#Create Model
#titanic_model = RandomForestRegressor(random_state = 1)
titanic_model = DecisionTreeRegressor(random_state=1)

#Train Model
titanic_model.fit(train_X,train_y)


#Validate Model



#Predict Model
titanic_prediction = titanic_model.predict(test_X)

# print(titanic_prediction)


#Create the data for submission

submission = pd.DataFrame({"PassengerId": test_X["PassengerId"], "Survived": titanic_prediction}) 


#Save the data file in to local disk
filename = 'data/submission.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)
print(submission)

Saved file: data/submission.csv
     PassengerId  Survived
0            892       0.0
1            893       0.0
2            894       0.0
3            895       0.0
4            896       1.0
5            897       0.0
6            898       0.0
7            899       0.0
8            900       0.0
9            901       0.0
10           902       0.0
11           903       1.0
12           904       1.0
13           905       0.0
14           906       1.0
15           907       0.0
16           908       0.0
17           909       0.0
18           910       1.0
19           911       0.0
20           912       0.0
21           913       1.0
22           914       1.0
23           915       0.0
24           916       1.0
25           917       0.0
26           918       1.0
27           919       0.0
28           920       1.0
29           921       0.0
..           ...       ...
388         1280       0.0
389         1281       0.0
390         1282       0.0
391         1283       