Mean Encoding
====

### Reference: https://github.com/Sangarshanan/Implementing-basic-concepts-and-algorithms/blob/master/Mean%20Encodings/Mean%20Encodings.ipynb

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic training data from a path relative to the notebook's working directory
train = pd.read_csv('data/titanic_train.csv')
# Preview the first rows to check which columns were read
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Remove the Cabin, Name and Ticket columns; they are not used as features below
train = train.drop(columns=['Cabin', 'Name', 'Ticket'])

In [3]:
# impute missing ages
def impute_age(cols):
    """Return the passenger's age, substituting a class-based default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in positional order: Age first, then Pclass. This is the
        row shape produced by ``train[['Age', 'Pclass']].apply(impute_age, axis=1)``.

    Returns
    -------
    The original Age when it is not null; otherwise 37 for Pclass 1,
    29 for Pclass 2, and 24 for any other class.
    """
    # A row Series is labelled 'Age'/'Pclass', so integer keys such as cols[0]
    # rely on pandas' deprecated positional fallback. Use .iloc when it exists;
    # plain lists/tuples keep ordinary indexing.
    if hasattr(cols, 'iloc'):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # NOTE(review): the fallback ages are hard-coded; presumably typical ages
    # per passenger class — confirm where they come from.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

# Run impute_age once per row (axis=1); each row supplies its Age and Pclass values in that order
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)

In [6]:
# Replace the categorical columns with integer labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['Sex'] = le.fit(train['Sex']).transform(train['Sex'])

# Drop any rows that still contain missing values before Embarked is encoded
train = train.dropna()

# The same encoder object is refitted here, so it no longer holds the Sex mapping
train['Embarked'] = le.fit(train['Embarked']).transform(train['Embarked'])

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [9]:
# Baseline: plain logistic regression on the label-encoded features
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop('Survived', axis=1),
    train['Survived'],
    test_size=0.30,
    random_state=101,
)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, but keeping the documented order avoids a silent inversion if
# the metric is later swapped for an asymmetric one (precision, recall, ...).
print('The accuracy of the Logistic Regression is', accuracy_score(y_test, y_pred))

The accuracy of the Logistic Regression is 0.8202247191011236




In [11]:
# Prepare for mean encoding: reload the raw CSV (Sex and Embarked are strings
# again), drop the unused columns, impute missing ages, and discard rows that
# still contain missing values.
train = pd.read_csv('data/titanic_train.csv').drop(columns=['Cabin', 'Name', 'Ticket'])
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)
train = train.dropna()

In [15]:
# Mean (target) encoding of Sex with an expanding-mean scheme: each row is
# encoded with the survival rate of the earlier rows in the same category,
# so a row's own target never contributes to its own encoding.
cumsum = train.groupby('Sex')['Survived'].cumsum() - train['Survived']
cumcnt = train.groupby('Sex').cumcount()
# The first row of each category has no history (0 / 0 -> NaN) and falls back
# to 0.3343.
# NOTE(review): 0.3343 is a hard-coded prior — presumably an overall survival
# rate; confirm its origin.
# The filled Series is assigned back explicitly. `train['Sex'].fillna(...,
# inplace=True)` is chained assignment with an in-place method, which pandas
# deprecates and which leaves `train` unchanged under Copy-on-Write.
train['Sex'] = (cumsum / cumcnt).fillna(0.3343)

In [27]:
# Expanding-mean target encoding of Embarked: each row gets the survival rate
# of the earlier rows that share its port of embarkation.
cumsum = train.groupby('Embarked')['Survived'].cumsum() - train['Survived']
cumcnt = train.groupby('Embarked').cumcount()
# The first row of each category has no history (0 / 0 -> NaN) and falls back
# to 0.3343.
# NOTE(review): 0.3343 is a hard-coded prior — presumably an overall survival
# rate; confirm its origin.
# Assign the filled Series back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas and does
# not modify `train` under Copy-on-Write.
train['Embarked'] = (cumsum / cumcnt).fillna(0.3343)

In [28]:
# Preview the mean-encoded frame; the first row seen for each category carries the 0.3343 fallback
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0.3343,22.0,1,0,7.25,0.3343
1,2,1,1,0.3343,38.0,1,0,71.2833,0.3343
2,3,1,3,1.0,26.0,0,0,7.925,0.0
3,4,1,1,1.0,35.0,1,0,53.1,0.5
4,5,0,3,0.0,35.0,0,0,8.05,0.666667


In [30]:
from sklearn.linear_model import LogisticRegressionCV
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any convergence warnings from the solver — consider narrowing it.
warnings.filterwarnings('ignore')

# train_test_split and accuracy_score come from the imports in the baseline
# cell earlier in the notebook. Same 30% hold-out and random_state as there.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop('Survived', axis=1),
    train['Survived'],
    test_size=0.30,
    random_state=101,
)

# A stray `LogisticRegressionCV(Cs=10)` expression used to sit above the split;
# it built an estimator and discarded it, so it has been removed. The model
# that is actually fitted uses the default settings.
# NOTE(review): the baseline cell fits plain LogisticRegression, so the
# accuracy difference reflects the change of estimator as well as the encoding.
model = LogisticRegressionCV()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); keep that order.
print('The accuracy of the Logistic Regression(Mean Encoding applied) is', accuracy_score(y_test, y_pred))

The accuracy of the Logistic Regression(Mean Encoding applied) is 0.8314606741573034
