In [1]:
#import libraries
import pandas as pd
import numpy as np
import scipy as sp
from IPython import display

In [2]:
#import model helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

In [3]:
#import visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# scatter_matrix lives in pandas.plotting; the pandas.tools.plotting path is
# gone from current pandas releases. Try the current location first and fall
# back to the legacy module so old installs keep working.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

In [4]:
#import data
# data_raw <- ./train.csv, data_val <- ./test.csv
data_raw, data_val = map(pd.read_csv, ('./train.csv', './test.csv'))

#create a data copy
# data1 is an independent (deep) copy, so data_raw stays as loaded.
data1 = data_raw.copy()

# The list holds references to the two frames themselves (not copies), so
# code that loops over data_cleaner modifies data1 and data_val directly.
data_cleaner = [data1, data_val]

In [5]:
#preview data
# info() prints the column/dtype/non-null summary; the random 10-row sample
# is the cell's displayed value (rows differ between runs: no seed is set).
data_raw.info()
data_raw.sample(n=10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
728,729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25.0,1,0,236853,26.0,,S
130,131,0,3,"Drazenoic, Mr. Jozef",male,33.0,0,0,349241,7.8958,,C
38,39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0,,S
543,544,1,2,"Beane, Mr. Edward",male,32.0,1,0,2908,26.0,,S
482,483,0,3,"Rouse, Mr. Richard Henry",male,50.0,0,0,A/5 3594,8.05,,S
477,478,0,3,"Braund, Mr. Lewis Richard",male,29.0,1,0,3460,7.0458,,S
196,197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q
280,281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q
358,359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q
231,232,0,3,"Larsson, Mr. Bengt Edvin",male,29.0,0,0,347067,7.775,,S


In [6]:
# Per-column count of missing values in the working train copy and the test set.
train_null_counts = data1.isnull().sum()
test_null_counts = data_val.isnull().sum()

print('Train columns with null values:\n', train_null_counts)
print("-" * 50)
print('Test columns with null values:\n', test_null_counts)

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
--------------------------------------------------
Test columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [7]:
#Complete missing values
# Each dataset is imputed with its own statistics: median for Age and Fare,
# most frequent value for Embarked.
#
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on dataset['col']. That chained in-place form operates
# on an intermediate Series: recent pandas warns about it, and with
# Copy-on-Write enabled it leaves the frame unchanged.
for dataset in data_cleaner:
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

In [8]:
#Delete train columns that I don't need
# The drop is done in place on purpose: data_cleaner holds a reference to this
# same data1 object, so rebinding data1 to a new frame would leave the list
# pointing at the old one. Only data1 is changed here; data_val is not touched.
dropcolumns = ['PassengerId', 'Ticket', 'Cabin']
data1.drop(labels=dropcolumns, axis='columns', inplace=True)

In [9]:
# Re-check the per-column missing-value counts after imputation and the drop.
train_null_counts = data1.isnull().sum()
test_null_counts = data_val.isnull().sum()

print('Train columns with null values:\n', train_null_counts)
print("-" * 50)
print('Test columns with null values:\n', test_null_counts)

Train columns with null values:
 Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
--------------------------------------------------
Test columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [10]:
#Convert objects to category using Label Encoder for train and test dataset
# For each text column, fit ONE encoder on the values from every dataset and
# then transform each dataset with it. Refitting the encoder separately per
# dataset only yields matching integer codes when both datasets happen to
# contain exactly the same set of categories; fitting on the combined values
# guarantees the same label -> code mapping everywhere (and gives identical
# codes to the per-dataset fit when the category sets already match).
for source_col, code_col in (('Sex', 'Sex_code'), ('Embarked', 'Embarked_code')):
    label = LabelEncoder()
    label.fit(pd.concat([frame[source_col] for frame in data_cleaner], ignore_index=True))
    for data in data_cleaner:
        data[code_col] = label.transform(data[source_col])

# Spot-check a few random rows of the train copy with the new *_code columns.
data1.sample(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_code,Embarked_code
562,0,2,"Norman, Mr. Robert Douglas",male,28.0,0,0,13.5,S,1,2
631,0,3,"Lundahl, Mr. Johan Svensson",male,51.0,0,0,7.0542,S,1,2
792,0,3,"Sage, Miss. Stella Anna",female,28.0,8,2,69.55,S,0,2
841,0,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,10.5,S,1,2
362,0,3,"Barbara, Mrs. (Catherine David)",female,45.0,0,1,14.4542,C,0,0


In [11]:
#Define y variable (target column)
Target = ['Survived']

#Define x variables
# Human-readable column names, used for charts
data1_x = [
    'Sex',
    'Pclass',
    'Embarked',
    'SibSp',
    'Parch',
    'Age',
    'Fare',
]
# Same features for calculation: Sex and Embarked are replaced by their
# label-encoded *_code columns
data1_x_calc = [
    'Sex_code',
    'Pclass',
    'Embarked_code',
    'SibSp',
    'Parch',
    'Age',
    'Fare',
]

# Target first, followed by the readable feature names
data1_xy = [*Target, *data1_x]
print('Original X Y: ', data1_xy, '\n')

Original X Y:  ['Survived', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch', 'Age', 'Fare'] 

