In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV
# loaded later in this notebook is read from under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# for regression problems
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# for classification problems
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# to split and standardize the dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# to evaluate regression models
from sklearn.metrics import mean_squared_error

# to evaluate classification models
from sklearn.metrics import roc_auc_score

import warnings

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides pandas/sklearn deprecation and copy warnings.
warnings.filterwarnings('ignore')

In [51]:
# Load the Titanic dataset, keeping only the few columns used in this demonstration.
TITANIC_CSV = '/content/drive/MyDrive/Feature Engineering/titanic_train.csv'
DEMO_COLUMNS = ['Age', 'SibSp', 'Survived']

data = pd.read_csv(TITANIC_CSV, usecols=DEMO_COLUMNS)
data.head()

Unnamed: 0,Survived,Age,SibSp
0,1,,2
1,0,,0
2,0,0.33,1
3,0,19.0,0
4,1,25.0,0


In [52]:
# Inspect dtypes and non-null counts to see which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Survived  100000 non-null  int64  
 1   Age       96708 non-null   float64
 2   SibSp     100000 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [53]:
# number of missing values (NA) per column -- a count, not a percentage
data.isnull().sum()

Survived       0
Age         3292
SibSp          0
dtype: int64

## 1. Separate the null Values
Separate the rows where the 'Age' column is null and treat them as the test data.

In [59]:
# Rows whose 'Age' is missing: these are the rows whose age will be predicted.
# isnull() already yields a boolean mask, so no `== True` comparison is needed.
data_null = data[data["Age"].isnull()]

# Later cells refer to this subset as `test_data`. Bind that name here so the
# notebook runs top-to-bottom on a fresh kernel instead of raising NameError.
# .copy() gives an independent frame, so writing the predicted ages into it
# later neither triggers SettingWithCopyWarning nor alters `data`/`data_null`.
test_data = data_null.copy()

data

Unnamed: 0,Survived,Age,SibSp
0,1,,2
1,0,,0
2,0,0.33,1
3,0,19.00,0
4,1,25.00,0
...,...,...,...
99995,1,62.00,0
99996,0,66.00,0
99997,0,37.00,0
99998,0,51.00,0


In [61]:
# NOTE(review): only a cell's last expression is rendered, so this len() value
# is computed but never shown.
len(data_null)
# Second row (position 1) of the missing-'Age' subset.
data_null.iloc[1, :]

Survived    0.0
Age         NaN
SibSp       0.0
Name: 1, dtype: float64

## 2. Drop the null values from the DataFrame & consider as train data

In [62]:
# Keep only the complete rows; these become the training data.
data = data.dropna()

In [63]:
# Inspect the frame after the rows with missing values were dropped.
data

Unnamed: 0,Survived,Age,SibSp
2,0,0.33,1
3,0,19.00,0
4,1,25.00,0
5,0,35.00,0
6,0,7.00,0
...,...,...,...
99995,1,62.00,0
99996,0,66.00,0
99997,0,37.00,0
99998,0,51.00,0


In [64]:
# Number of rows left for training.
len(data)

96708

**train data = rows from DataFrame where column "Age" does not have missing values**

In [65]:
# Confirm that no missing values remain in any column.
data.isnull().sum()

Survived    0
Age         0
SibSp       0
dtype: int64

## 3. Create the X_train, y_train from DataFrame

In [68]:
# y_train is the regression target: the observed 'Age' values
# (rows with a missing 'Age' were dropped from `data` above).
y_train = data["Age"]

In [69]:
# Inspect the regression target.
y_train

2         0.33
3        19.00
4        25.00
5        35.00
6         7.00
         ...  
99995    62.00
99996    66.00
99997    37.00
99998    51.00
99999    55.00
Name: Age, Length: 96708, dtype: float64

In [71]:
# Predictors: every column of the training frame except the target 'Age'.
X_train = data.drop(columns=["Age"])

In [72]:
# Inspect the predictor columns.
X_train

Unnamed: 0,Survived,SibSp
2,0,1
3,0,0
4,1,0
5,0,0
6,0,0
...,...,...
99995,1,0
99996,0,0
99997,0,0
99998,0,0


## 4. Build the model

In [73]:
# NOTE(review): LinearRegression is already imported in the imports cell at the
# top of the notebook; this re-import is redundant but harmless.
from sklearn.linear_model import LinearRegression
# Linear regressor with default settings, used below to predict 'Age'.
lr = LinearRegression()

In [75]:
# Fit the regressor on the training set: learn 'Age' from the predictor columns.
lr.fit(X=X_train, y=y_train)

LinearRegression()

## 5. Create the X_test from the test data 

In [76]:
# Predictors for the rows whose 'Age' is to be predicted: same columns as X_train.
X_test = test_data.drop(columns=["Age"])

## 6. Apply the model on X_test and predict the missing values

In [77]:
# Predict an 'Age' for each test row from its remaining columns.
y_pred = lr.predict(X_test)

In [78]:
# One predicted age per row of X_test.
y_pred.shape

(3292,)

In [79]:
# Inspect the predicted ages.
y_pred

array([36.7432619 , 37.66477502, 41.47954635, ..., 37.66477502,
       37.66477502, 37.66477502])

In [82]:
## 7. Replace the missing values with the predicted values
# Select the rows whose 'Age' is still NaN and write the predictions into them.
age_is_missing = test_data["Age"].isnull()
test_data.loc[age_is_missing, "Age"] = y_pred

In [83]:
# Inspect the test rows with 'Age' now filled in by the model's predictions.
test_data

Unnamed: 0,Survived,Age,SibSp
0,1,36.743262,2
1,0,37.664775,0
34,1,41.479546,0
35,0,37.664775,0
36,1,41.479546,0
...,...,...,...
99772,0,37.664775,0
99876,1,41.479546,0
99971,0,37.664775,0
99972,0,37.664775,0


In [91]:
# Stack the training rows (observed ages) and the imputed rows back into one frame.
# Each part keeps its original index labels; rows are not re-sorted.
parts = [data, test_data]
df = pd.concat(parts)
df

Unnamed: 0,Survived,Age,SibSp
2,0,0.330000,1
3,0,19.000000,0
4,1,25.000000,0
5,0,35.000000,0
6,0,7.000000,0
...,...,...,...
99772,0,37.664775,0
99876,1,41.479546,0
99971,0,37.664775,0
99972,0,37.664775,0


In [92]:
# Total rows after recombining the training rows with the imputed rows.
len(df)

100000