In [20]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# one-hot encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Import Data

In [2]:
import os
os.getcwd()

train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')

# Exploratory Analysis

In [3]:
print("Training examples are",len(train_df))
print("Test data is",len(test_df))
print("The columns are",train_df.columns.values)

Training examples are 891
Test data is 418
The columns are ['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [4]:
# preview the first 5 rows
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
#Data cleaning

combined_data = train_df.append(test_df)

combined_data.Age.fillna(value=combined_data.Age.mean(), inplace=True)
combined_data.Fare.fillna(value=combined_data.Fare.mean(), inplace=True)
combined_data.Embarked.fillna(value=(combined_data.Embarked.value_counts().idxmax()), inplace=True)
combined_data.Survived.fillna(value=-1, inplace=True) 

# drop columns that are not needed
combined_data.drop('Name', axis=1, inplace=True)
combined_data.drop('Cabin', axis=1, inplace=True)
combined_data.drop('Ticket', axis=1, inplace=True)


In [10]:
# Write cleaned data out

train = combined_data[combined_data['Survived']!=-1]
# train.to_csv("./Data/train-clean.csv")

test = combined_data[combined_data['Survived']==-1]
test.drop('Survived', axis=1, inplace=True)
# test.to_csv("./Data/test-clean.csv")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [53]:
# One-hot encoding
train_encoded = pd.get_dummies(train, columns = ['Embarked', 'Pclass', 'Sex'])
test_encoded = pd.get_dummies(test, columns = ['Embarked', 'Pclass', 'Sex'])

# Rearrange columns
list_of_features = ['Age','Embarked_C','Embarked_Q','Embarked_S','Fare','Parch','Pclass_1',\
                    'Pclass_2','Pclass_3','Sex_female','Sex_male','SibSp']
list_of_columns = list_of_features + ['Survived']
train_encoded = train_encoded[list_of_columns]
test_encoded = test_encoded[list_of_features]

# Transform training data into np arrays
features_train = train_encoded[list_of_features].values
features_test = train_encoded[list_of_features].values
labels_train = train_encoded['Survived'].values
