In [179]:
import os
import pandas
import numpy
from sklearn import preprocessing
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# Load Files

In [180]:
train = pandas.read_csv("../data/train.csv",sep=",")
print(train.shape)
train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Exploration

In [181]:
#Get the type of each columns
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [182]:
# Number of unique values
train.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [183]:
# Ratio of empty columns
train.isnull().sum()/len(train)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

## label encoder 

In [184]:
def transform_categorical_values(column):
    le = preprocessing.LabelEncoder()
    return le.fit_transform(column)


## Variable Dummification

In [185]:
def get_dummies(dataset):
    df = pandas.DataFrame(index=dataset.index)
    for i in dataset.columns:
        df = pandas.concat([df, pandas.get_dummies(dataset[i])],axis = 1)
    return df

In [186]:
# get categorical values
train.columns[train.dtypes == numpy.object]

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [187]:
get_dummies(train[['Embarked','Sex']]).head()

Unnamed: 0,C,Q,S,female,male
0,0,0,1,0,1
1,1,0,0,1,0
2,0,0,1,1,0
3,0,0,1,1,0
4,0,0,1,0,1


# Feature engineering

## Embarked 

In [188]:
train.Embarked  = train.Embarked.fillna("S")


## Age 

In [189]:
train.Age = train.Age.copy()
age = numpy.mean(train.Age[(numpy.isnan(train.Age)==False)])
meanAge = round(numpy.mean(train.Age[(numpy.isnan(train.Age)==False)]),0)
print(meanAge)
train.Age = train.Age.fillna(meanAge)
train.Age = train.Age.astype(int)

30.0


## Transform age into categorical values

In [190]:
train.Age.describe()
train.Age = pandas.qcut(train.Age, [0, .25, .5, .75, 1],labels=["Age young",
                                                                "Age mid",
                                                                "Age old",
                                                                "Age Very old"])
train.Age.head()

0       Age young
1    Age Very old
2         Age mid
3         Age old
4         Age old
Name: Age, dtype: category
Categories (4, object): [Age young < Age mid < Age old < Age Very old]

In [191]:
train.Age.value_counts()

Age mid         357
Age young       231
Age Very old    217
Age old          86
Name: Age, dtype: int64

## Cabin 

In [192]:
tr = train.Cabin.fillna(value=0)
print("Number of non-zero = ",numpy.count_nonzero(tr))

Number of non-zero =  204


## SiibSpouse

In [193]:
train.SibSp.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

## Parch 

In [194]:
train.Parch.describe()
train.Parch.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

## Fare 

In [195]:
print(train.Fare.describe())
train.Fare = train.Fare.astype('int')
train.Fare = pandas.qcut(train.Fare, [0, .25, .5, .75, 1],labels=["Fare Very Low",
                                                                "Fare Low",
                                                                "Fare Mid",
                                                                "Fare Expensive"])
train.Fare.head()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64


0     Fare Very Low
1    Fare Expensive
2     Fare Very Low
3    Fare Expensive
4          Fare Low
Name: Fare, dtype: category
Categories (4, object): [Fare Very Low < Fare Low < Fare Mid < Fare Expensive]

In [196]:
train.Fare.value_counts()

Fare Very Low     241
Fare Mid          223
Fare Low          216
Fare Expensive    211
Name: Fare, dtype: int64

## Name Occurence

In [197]:
names = train.Name.str.split(",").str.get(0)
families = names.value_counts()
nameList = families.index.to_series()
nameOccurences = pandas.DataFrame(nameList)
nameOccurences['numbers'] = families
nameOccurences.columns = ['Name','Occs']
train['JustNames'] = names
print(train.head())
print(nameOccurences.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex           Age  \
0                            Braund, Mr. Owen Harris    male     Age young   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  Age Very old   
2                             Heikkinen, Miss. Laina  female       Age mid   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female       Age old   
4                           Allen, Mr. William Henry    male       Age old   

   SibSp  Parch            Ticket            Fare Cabin Embarked  JustNames  
0      1      0         A/5 21171   Fare Very Low   NaN        S     Braund  
1      1      0          PC 17599  Fare Expensive   C85        C    Cumings  
2      0      0  STON/O2. 3101282   Fare Very Low   NaN        S  Heikkinen  
3 

In [198]:
nameOccurences['Survived'] = 0
for row in nameOccurences.Name:
    nameOccurences.ix[nameOccurences.Name==row,'Survived'] = sum(train.Survived[train.JustNames == row])
nameOccurences['SurvivingRation'] = nameOccurences.Survived/nameOccurences.Occs
print(nameOccurences.head(10))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


                Name  Occs  Survived  SurvivingRation
Andersson  Andersson     9         2         0.222222
Sage            Sage     7         0         0.000000
Goodwin      Goodwin     6         0         0.000000
Panula        Panula     6         0         0.000000
Skoog          Skoog     6         0         0.000000
Johnson      Johnson     6         3         0.500000
Carter        Carter     6         4         0.666667
Rice            Rice     5         0         0.000000
Harris        Harris     4         2         0.500000
Ford            Ford     4         0         0.000000


# Build dataset 

In [199]:
df = pandas.DataFrame(index = train.index)
df = pandas.concat([train[train.columns[train.dtypes==numpy.int]], get_dummies(train[['Embarked','Sex','Age','Fare']])],axis = 1)
print(df.shape)
df.head()

(891, 18)


Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,C,Q,S,female,male,Age young,Age mid,Age old,Age Very old,Fare Very Low,Fare Low,Fare Mid,Fare Expensive
0,1,0,3,1,0,0,0,1,0,1,1,0,0,0,1,0,0,0
1,2,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1
2,3,1,3,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0
3,4,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1
4,5,0,3,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0


In [200]:
train[train.columns[train.dtypes==numpy.int]].head()

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch
0,1,0,3,1,0
1,2,1,1,1,0
2,3,1,3,0,0
3,4,1,1,1,0
4,5,0,3,0,0


## Remove passengerID 

In [201]:
df.drop('PassengerId',axis = 1,inplace = True)

## Display final dataset 

In [202]:
print(df.shape)
df.head()

(891, 17)


Unnamed: 0,Survived,Pclass,SibSp,Parch,C,Q,S,female,male,Age young,Age mid,Age old,Age Very old,Fare Very Low,Fare Low,Fare Mid,Fare Expensive
0,0,3,1,0,0,0,1,0,1,1,0,0,0,1,0,0,0
1,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1
2,1,3,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0
3,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1
4,0,3,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0


# Save Data 

In [204]:
df.to_excel("../data/clean_train_dataset.xlsx")