In [4]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# for regression problems
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# for classification problems
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# to split and standardize the dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# to evaluate regression models
from sklearn.metrics import mean_squared_error

# to evaluate classification models
from sklearn.metrics import roc_auc_score

import warnings

# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides warnings raised by pandas / scikit-learn; remove while debugging.
warnings.filterwarnings('ignore')

In [6]:
# Load the Titanic training data, keeping only the few columns needed for this demonstration.
data = pd.read_csv(
    '/content/drive/MyDrive/Feature Engineering/titanic_train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
data.head()

Unnamed: 0,Survived,Age,Fare
0,1,,27.14
1,0,,13.35
2,0,0.33,71.29
3,0,19.0,13.04
4,1,25.0,7.76


In [None]:
# Fraction of missing values in each column (mean of the boolean NA mask).
data.isna().mean()

## Important: how to impute without leaking information

Imputation has to be learned on the training set and then propagated to the test set. This means that the mean/median used to fill missing values in both the train and the test set should be computed from the train set only.
#### This keeps test-set information out of the training process and so helps to avoid overfitting.

In [None]:
# Split into train and test sets (30% test, fixed random_state for reproducibility).
# The full frame, including 'Survived', is passed as X: the models below select their
# feature columns explicitly, and a later cell reads X_train.Survived directly.
X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

In [None]:
def impute_na(df, variable, median):
    """Add two imputed copies of a column to ``df`` in place.

    Creates ``<variable>_median`` (missing values replaced by ``median``) and
    ``<variable>_zero`` (missing values replaced by 0). The original column is
    left untouched.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    variable : name of the column to impute.
    median : value used to fill missing entries in the ``_median`` column.

    Returns
    -------
    None
    """
    for suffix, fill_value in (('_median', median), ('_zero', 0)):
        df[variable + suffix] = df[variable].fillna(fill_value)

In [None]:
# Medians are computed on the training split only, so nothing from the test split leaks in.
median = X_train['Age'].median()
median_f = X_train['Fare'].median()
print(median)
print(median_f)


In [None]:
# Add the '_median' and '_zero' imputed columns for Age and Fare to the training set,
# using the medians computed from the training set.
impute_na(X_train, 'Age', median)
impute_na(X_train, 'Fare', median_f)
# Only the last expression of a cell is displayed, so preview the frame once, at the end.
X_train.head(15)

In [None]:
# Impute the test set with the medians learned on the TRAINING set (median, median_f).
impute_na(X_test, 'Age', median)
impute_na(X_test, 'Fare', median_f)
# Only the last expression of a cell is displayed, so preview the frame once, at the end.
X_test.head(15)

In [None]:
# .std() returns the standard deviation (not the variance), so the labels say so.
# Filling missing values with the median is expected to shrink the spread of the variable.
print('Original standard deviation: ', X_train['Age'].std())
print('Standard deviation after median imputation: ', X_train['Age_median'].std())

In [None]:
# Overlay the density of the original Age with the density after median imputation (red).
fig, ax = plt.subplots()
X_train.loc[X_train['Age'] != 0, 'Age'].plot(kind='kde', ax=ax)
X_train['Age_median'].plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

In [None]:
# Overlay the density of the original Age with the density after zero imputation (red).
fig, ax = plt.subplots()
X_train.loc[X_train['Age'] != 0, 'Age'].plot(kind='kde', ax=ax)
X_train['Age_zero'].plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

## Logistic Regression

In [None]:
# Compare zero vs. median imputation of Age with a logistic regression.
# 'Fare_median' is used in both runs, so only the Age imputation strategy differs.
# (The original first fit call was a syntax error: the DataFrame indexing was malformed.)
for strategy in ('zero', 'median'):
    features = ['Age_' + strategy, 'Fare_median']

    logit = LogisticRegression(random_state=44, C=1000)  # c big to avoid regularisation
    logit.fit(X_train[features], y_train)

    # Report ROC-AUC on the train split first, then on the test split.
    for split_name, X_split, y_split in (('Train', X_train, y_train),
                                         ('Test', X_test, y_test)):
        print('{} set {} imputation'.format(split_name, strategy))
        pred = logit.predict_proba(X_split[features])
        print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))
    print()

In [None]:
# Mean survival among passengers with age below 20, using the raw Age and each imputed variant.
print('Average real survival of children: ', X_train.loc[X_train['Age'] < 20, 'Survived'].mean())
print('Average survival of children when using Age imputed with zeroes: ', X_train.loc[X_train['Age_zero'] < 20, 'Survived'].mean())
print('Average survival of children when using Age imputed with median: ', X_train.loc[X_train['Age_median'] < 20, 'Survived'].mean())

## Support Vector Machine

In [None]:
# NOTE(review): this whole cell is disabled (commented out), so no SVM is actually fitted.
# If it is re-enabled, be aware that (a) the second half fits LogisticRegression, not SVC,
# and (b) every print label still says 'Logistic Regression roc-auc'.
#SVM_model = SVC(random_state=44, probability=True, max_iter=-1, kernel='linear')
#SVM_model.fit(X_train[['Age_zero', 'Fare_median']], y_train)
#print('Train set zero imputation')
#pred = SVM_model.predict_proba(X_train[['Age_zero', 'Fare_median']])
#print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
#print('Test set zero imputation')
#pred = SVM_model.predict_proba(X_test[['Age_zero', 'Fare_median']])
#print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
#print()

#logit = LogisticRegression(random_state=44, C=1000)  # c big to avoid regularisation
#logit.fit(X_train[['Age_median', 'Fare_median']], y_train)
#print('Train set median imputation')
#pred = logit.predict_proba(X_train[['Age_median', 'Fare_median']])
#print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
#print('Test set median imputation')
#pred = logit.predict_proba(X_test[['Age_median', 'Fare_median']])
#print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
#print()

## Random Forests

In [None]:
# Compare zero vs. median imputation of Age with a shallow random forest.
# 'Fare_median' is used in both runs, so only the Age imputation strategy differs.
# The printed label now names the model that is actually being evaluated
# (it previously said 'Logistic Regression').
for strategy in ('zero', 'median'):
    features = ['Age_' + strategy, 'Fare_median']

    rf = RandomForestClassifier(n_estimators=100, random_state=39, max_depth=3)
    rf.fit(X_train[features], y_train)

    # Report ROC-AUC on the train split first, then on the test split.
    for split_name, X_split, y_split in (('Train', X_train, y_train),
                                         ('Test', X_test, y_test)):
        print('{} set {} imputation'.format(split_name, strategy))
        pred = rf.predict_proba(X_split[features])
        print('Random Forests roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))
    print()

# House Price data set

In [None]:
# Columns read from the house-price file: numeric predictors plus the target (SalePrice).
cols_to_use = [
    'OverallQual',
    'TotalBsmtSF',
    '1stFlrSF',
    'GrLivArea',
    'WoodDeckSF',
    'BsmtUnfSF',
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
    'SalePrice',
]

In [None]:
# Load the house-price training data restricted to cols_to_use; this rebinds `data`,
# replacing the Titanic frame loaded earlier.
# NOTE(review): this is a relative path, unlike the Drive path used for the Titanic file —
# confirm where the CSV actually lives.
data = pd.read_csv('../house_price_train.csv', usecols=cols_to_use)
print(data.shape)
data.head()

In [None]:
# Fraction of missing values in each selected column.
data.loc[:, cols_to_use].isna().mean()

In [None]:
# Split the house-price data into train and test sets (30% test, fixed random_state).
# This rebinds X_train / X_test / y_train / y_test from the Titanic section.
# The full frame (including 'SalePrice') is passed as X; the feature lists built below
# drop 'SalePrice' before any model is fitted.
X_train, X_test, y_train, y_test = train_test_split(data, data.SalePrice, test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

In [None]:
# LotFrontage: the median comes from the training split only and is applied to both splits.
median = X_train['LotFrontage'].median()
for split in (X_train, X_test):
    impute_na(split, 'LotFrontage', median)

In [None]:
# MasVnrArea: the median comes from the training split only and is applied to both splits.
median = X_train['MasVnrArea'].median()
for split in (X_train, X_test):
    impute_na(split, 'MasVnrArea', median)

In [None]:
# GarageYrBlt: the median comes from the training split only and is applied to both splits.
median = X_train['GarageYrBlt'].median()
for split in (X_train, X_test):
    impute_na(split, 'GarageYrBlt', median)

In [None]:
# Drop the target from the feature list.
# Rebuilding the list (instead of list.remove) keeps this cell idempotent: running it a
# second time is a no-op rather than a ValueError.
cols_to_use = [col for col in cols_to_use if col != 'SalePrice']
cols_to_use

In [None]:
# Build the two feature lists: columns that were imputed get the matching suffix,
# all other columns keep their original name.
imputed_cols = ('LotFrontage', 'MasVnrArea', 'GarageYrBlt')
cols_zero = [(col + '_zero') if col in imputed_cols else col for col in cols_to_use]
cols_median = [(col + '_median') if col in imputed_cols else col for col in cols_to_use]

In [None]:
# Display the feature list that uses the median-imputed columns.
cols_median

# Linear Regression

In [None]:
# Compare zero vs. median imputation with a linear regression on SalePrice.
# Both runs use the SAME model type so that only the imputation strategy differs.
# (The median run previously fitted a RandomForestClassifier on the continuous target,
# while still reporting the result as a linear-regression MSE.)
for strategy, columns in (('zero', cols_zero), ('median', cols_median)):
    linreg = LinearRegression()
    linreg.fit(X_train[columns], y_train)

    # Report the mean squared error on the train split first, then on the test split.
    for split_name, X_split, y_split in (('Train', X_train, y_train),
                                         ('Test', X_test, y_test)):
        print('{} set {} imputation'.format(split_name, strategy))
        pred = linreg.predict(X_split[columns])
        print('Linear Regression mse: {}'.format(mean_squared_error(y_split, pred)))
    print()