In [40]:
# Encoding categorical features using scikit-learn

In [41]:
import pandas as pd

In [42]:
# Titanic training data, read from the justmarkham/pandas-videos repository.
TITANIC_URL = 'https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_train.csv'
tit = pd.read_csv(TITANIC_URL)

In [43]:
# Count the missing values in each column.
tit.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [44]:
# Keep only the target and the three features used below, and drop the
# rows whose Embarked value is missing. (Age and Cabin, the other columns
# with nulls, are not among the selected columns.)
kept_columns = ['Survived', 'Pclass', 'Sex', 'Embarked']
has_embarked = tit['Embarked'].notna()
tit = tit.loc[has_embarked, kept_columns]

# Confirm that no missing values remain.
tit.isna().sum()

Survived    0
Pclass      0
Sex         0
Embarked    0
dtype: int64

In [45]:
# Preview the first five rows of the trimmed frame.
tit.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [46]:
# Baseline feature matrix: Pclass alone, kept as a one-column DataFrame.
X = tit[['Pclass']]
# Target: the Survived column as a Series.
y = tit['Survived']

In [47]:
# X is two-dimensional: one row per passenger, one feature column.
X.shape

(889, 1)

In [48]:
# y is one-dimensional, with one label for each row of X.
y.shape

(889,)

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [52]:
# Cross-validate a logistic regression that uses a single feature, Pclass (passenger class).
# Mean accuracy over the 5 folds: about 0.678.

logreg = LogisticRegression()
baseline_scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='accuracy', cv=5)
round(baseline_scores.mean(), 4)

0.6783

In [54]:
# About 62% of the passengers did not survive, so a model that always
# predicts "did not survive" would already be about 62% accurate. Read the
# logistic regression accuracy against that baseline.

class_share = y.value_counts(normalize=True).round(2)
print(class_share * 100)

0    62.0
1    38.0
Name: Survived, dtype: float64


In [55]:
# Motivating question: how do I add more features to the model and cross-validate it?
# Answer: with a pipeline, but first encode the Sex and Embarked columns.

In [56]:
# Sex and Embarked are categorical (string) columns.
tit.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [57]:
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns a dense NumPy array instead of a sparse matrix.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old name was removed in 1.4, so try the current spelling first and
# fall back to the legacy one on older installations.
try:
    one = OneHotEncoder(sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 does not accept `sparse_output`.
    one = OneHotEncoder(sparse=False)

In [58]:
# Learning exercise: one-hot encode Sex into a NumPy array with two
# columns, the first for female and the second for male.

sex_encoded = one.fit_transform(tit.loc[:, ['Sex']])
# The fitted categories, listed in the column order of the encoded array.
one.categories_

[array(['female', 'male'], dtype=object)]

In [59]:
# Learning exercise: repeat the encoding for Embarked. Refitting the same
# encoder replaces the categories it learned from Sex.
embarked_encoded = one.fit_transform(tit.loc[:, ['Embarked']])
one.categories_

[array(['C', 'Q', 'S'], dtype=object)]

In [60]:
# Feature matrix: every column except the target, Survived (already stored in y).
X = tit.drop(columns=['Survived'])

In [61]:
# Preprocess the categorical columns: one-hot encode Sex and Embarked, and
# pass every remaining column through unchanged.

from sklearn.compose import make_column_transformer

categorical_columns = ['Sex', 'Embarked']
column_trans = make_column_transformer(
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough',
)

# Preview the transformed matrix: the one-hot columns come first and the
# passed-through Pclass column comes last.
column_trans.fit_transform(X)

array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

In [62]:
from sklearn.pipeline import make_pipeline

In [63]:
# Chain the preprocessing step (the column transformer) and the logistic
# regression into a single pipeline, so both are fitted together.

pipeline_steps = (column_trans, logreg)
pipe = make_pipeline(*pipeline_steps)

In [64]:
# Success! Mean accuracy rose by roughly 9.5 percentage points
# (0.6783 with Pclass alone, 0.7728 once Sex and Embarked are added).

pipe_scores = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5)
round(pipe_scores.mean(), 4)

0.7728

In [65]:
# Draw 5 rows to demonstrate prediction. They are sampled from X itself,
# so they stand in for new passengers rather than being unseen data.

X_new = X.sample(n=5, random_state=99)

In [66]:
# Fit the pipeline on all of X and y, then predict survival for the sampled rows.
pipe.fit(X, y).predict(X_new)

array([1, 0, 1, 1, 0], dtype=int64)