In [451]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.svm import SVC
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [452]:
# Load the Titanic train and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("titanic_train.csv", "titanic_test.csv")
)

In [453]:
# Disabled diagnostic: uncomment to draw a seaborn heatmap of test_data's null mask.
#sns.heatmap(data=test_data.isnull())

In [454]:
# Boolean masks that are True where a passenger's Age is recorded.
filt1 = train_data['Age'].notna()
filt2 = test_data['Age'].notna()

# Keep only the rows with a known Age from each split.
train_with_age = train_data[filt1]
test_with_age = test_data[filt2]

In [455]:
# Preview the first five training rows that have a recorded Age.
train_with_age.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [456]:
# Preview the first five test rows that have a recorded Age.
test_with_age.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


###### Combining Train and Test into a single DataFrame, which is used to predict the missing Age values in the Train data

In [457]:
# Stack the Age-complete rows of both splits into one frame for fitting the age model.
# ignore_index=True renumbers the rows 0..n-1: the train and test frames each
# carry their own index starting at 0, so a plain concat would leave duplicate
# index labels and make any later label-based lookup or alignment ambiguous.
# Row order is unchanged, so position-based steps downstream see the same data.
df_for_age = pd.concat([train_with_age, test_with_age], axis=0, ignore_index=True)

###### Dropping Null Values

In [460]:
# Keep only rows where both Embarked and Fare are present (this dropped the 3 rows with NaN).
has_port_and_fare = df_for_age[['Embarked', 'Fare']].notna().all(axis=1)
df_for_age = df_for_age.loc[has_port_and_fare]

###### Creating X and y

In [461]:
# Predictor columns used to model Age; Sex and Embarked are still text at this point.
age_feature_cols = ['Pclass', 'Sex', 'SibSp', 'Fare', 'Embarked']
X_age = df_for_age.loc[:, age_feature_cols]

In [462]:
# Regression target: the recorded Age of each retained passenger.
y_age = df_for_age.loc[:, 'Age']

#### Encode the Data

In [463]:
# One-hot encode Embarked; drop_first=True omits the first category's indicator column.
temp_x1 = pd.get_dummies(X_age['Embarked'], drop_first=True)

# Append the indicator columns, then remove the original text column.
X_age = pd.concat([X_age, temp_x1], axis=1).drop(columns='Embarked')

In [464]:
# One-hot encode Sex; drop_first=True omits the first category's indicator column.
temp_x2 = pd.get_dummies(X_age['Sex'], drop_first=True)

# Append the indicator column, then remove the original text column.
X_age = pd.concat([X_age, temp_x2], axis=1).drop(columns='Sex')

#### Split Train and Test

In [465]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state=0 pins the shuffle.
X_age_train, X_age_test, y_age_train, y_age_test = train_test_split(
    X_age,
    y_age,
    test_size=0.2,
    random_state=0,
)

#### Normalization

In [466]:
# Sanity check: feature matrix and target must have the same number of rows.
(X_age.shape, y_age.shape)

((1043, 6), (1043,))

In [467]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min and max from the training split only, then apply
# that same scaling to both the training and the held-out split.
myscaler = MinMaxScaler().fit(X_age_train)
X_age_train = myscaler.transform(X_age_train)
X_age_test = myscaler.transform(X_age_test)

### Model To Predict Age

#### Linear

In [468]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of Age on the scaled training features.
reg_lin = LinearRegression()
reg_lin.fit(X_age_train, y_age_train);  # trailing ';' keeps the cell output empty


In [469]:
# Predict Age for the split the model was fitted on and for the held-out split.
train_pred_reg, test_pred_reg = (
    reg_lin.predict(features) for features in (X_age_train, X_age_test)
)

In [499]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the Age predictions: training split first, held-out split second.
mae_train = mean_absolute_error(y_age_train, train_pred_reg)
mae_test = mean_absolute_error(y_age_test, test_pred_reg)
print(mae_train, mae_test)

9.633411425971316 10.527559238341224


## Prediction

In [443]:
# Select the training rows whose Age is missing; these are the rows to be filled.
# Note: this rebinds filt2, which earlier held the test-set "Age present" mask.
filt2 = train_data['Age'].isna()
data_predcit_age = train_data[filt2]

In [445]:
# Columns pulled from the missing-Age rows.
# NOTE(review): 'Survived' is not one of the columns selected for X_age when the
# age model was fitted — confirm whether it belongs in this feature set.
predict_cols = ['Pclass', 'Sex', 'SibSp', 'Fare', 'Embarked', 'Survived']
X_predict_age = data_predcit_age.loc[:, predict_cols]

#### Encode the Data Used to Predict Age (Rows with a Missing Age Value)

In [446]:
# One-hot encode Embarked for the missing-Age rows (first category's column omitted).
temp_x4 = pd.get_dummies(X_predict_age['Embarked'], drop_first=True)

# Append the indicator columns, then remove the original text column.
X_predict_age = pd.concat([X_predict_age, temp_x4], axis=1).drop(columns='Embarked')

In [447]:
# One-hot encode Sex for the missing-Age rows (first category's column omitted).
temp_x5 = pd.get_dummies(X_predict_age['Sex'], drop_first=True)

# Append the indicator column, then remove the original text column.
X_predict_age = pd.concat([X_predict_age, temp_x5], axis=1).drop(columns='Sex')

In [448]:
# One-hot encode Survived for the missing-Age rows (first category's column omitted).
temp_x6 = pd.get_dummies(X_predict_age['Survived'], drop_first=True)

# Append the indicator column, then remove the original Survived column.
X_predict_age = pd.concat([X_predict_age, temp_x6], axis=1).drop(columns='Survived')

### Predicting Age

In [493]:
# Bring the prediction frame into the exact layout the model was fitted on:
#   1. Reindex to the training design-matrix columns (X_age). This drops columns
#      the model never saw (the Survived indicator) and zero-fills any dummy
#      column whose category does not occur among the missing-Age rows.
#   2. Apply the MinMaxScaler that was fitted on the training split, because
#      reg_lin was trained on scaled features; feeding raw values would put the
#      inputs on a different scale from the fitted coefficients.
X_predict_age_aligned = X_predict_age.reindex(columns=X_age.columns, fill_value=0)
Nul_Age_pred_reg = reg_lin.predict(myscaler.transform(X_predict_age_aligned))

#### Saving the model with joblib so it can be reused on the original Titanic dataset to fill in the missing Age values

In [497]:
from joblib import dump, load

# reg_lin was fitted on MinMax-scaled features, so it is only usable together
# with the fitted scaler. Persist the scaler next to the model so another
# notebook can reproduce the same preprocessing before calling predict().
dump(myscaler, 'Age_scaler.joblib')

# Save the regression model itself. This stays the last expression so the cell
# still displays the model's filename.
dump(reg_lin, 'Age_prediction.joblib')  #save the model as filename.joblib

['Age_prediction.joblib']