# PIPELINES

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

In [10]:
df = pd.read_csv('titanic.csv')

In [33]:
df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
df.drop(['PassengerId','Cabin','Name','Ticket'],axis=1,inplace=True)

In [12]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [14]:
X_train,X_test,y_train,y_test = train_test_split(df.drop('Survived',axis=1),df['Survived'],test_size=0.2)

In [15]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
733,2,male,23.0,0,0,13.0000,S
247,2,female,24.0,0,2,14.5000,S
317,2,male,54.0,0,0,14.0000,S
14,3,female,14.0,0,0,7.8542,S
455,3,male,29.0,0,0,7.8958,C
...,...,...,...,...,...,...,...
612,3,female,,1,0,15.5000,Q
126,3,male,,0,0,7.7500,Q
714,2,male,52.0,0,0,13.0000,S
815,1,male,,0,0,0.0000,S


In [16]:
y_train

733    0
247    1
317    0
14     0
455    1
      ..
612    1
126    0
714    0
815    0
209    1
Name: Survived, Length: 712, dtype: int64

In [17]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [18]:
si1 = SimpleImputer()
si2 = SimpleImputer(strategy='most_frequent')

In [37]:
X_train_age = si1.fit_transform(X_train[['Age']])
X_train_embarked = si2.transform(X_train[['Embarked']])

X_test_age = si1.fit_transform(X_test[['Age']])
X_test_embarked = si2.transform(X_test[['Embarked']])

In [38]:
ohe1 = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe2 = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [39]:
X_train_sex = ohe1.fit_transform(X_train[['Sex']])
X_train_embarked = ohe2.fit_transform(X_train_embarked)

X_test_sex = ohe1.transform(X_test[['Sex']])
X_test_embarked = ohe2.transform(X_test_embarked)



In [41]:
X_train_embarked

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [42]:
X_train_rem = X_train.drop(['Sex','Embarked','Age'],axis=1)

In [43]:
X_test_rem = X_test.drop(['Sex','Embarked','Age'],axis=1)

In [45]:
X_train_transformed = np.concatenate((X_train_rem,X_train_sex,X_train_embarked,X_train_age),axis=1)
X_test_transformed = np.concatenate((X_test_rem,X_test_sex,X_test_embarked,X_test_age),axis=1)

In [48]:
X_train_transformed.shape

(712, 10)

In [50]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

In [53]:
y_pred = clf.predict(X_test_transformed)

In [55]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7988826815642458

In [56]:
import pickle

In [67]:
pickle.dump(ohe1,open('model/ohe_sex.pickle','wb'))
pickle.dump(ohe2,open('model/ohe_embarked.pickle','wb'))
pickle.dump(clf,open('model/dec-tree.pickle','wb'))

In [64]:
pwd

'C:\\Users\\Katta\\data_analysis\\ML for fun'