In [9]:
import pandas as pd
from sklearn import model_selection


In [5]:
# US adult census data
df = pd.read_csv("../data/adult.csv")

In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


Columns:

* age
* workclass
* fnlwgt
* education
* education.num
* marital.status
* occupation
* relationship
* race
* sex
* capital.gain
* capital.loss
* hours.per.week
* native.country
* income


In [8]:
df.income.value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [54]:
# US adult census data
df = pd.read_csv("../data/adult.csv")

# For simplicity we drop numerical columns
num_cols = ["fnlwgt", "age", "capital.gain", "capital.loss", "hours.per.week"]

df  = df.drop(num_cols, axis = 1)

# map targets to 0s and 1s
target_mapping = { "<=50K": 0, ">50K": 1}
df.loc[:, "income"] = df.income.map(target_mapping)

# all columns are features except income and kfold columns
features = [ f for f in df.columns if f not in ("kfold", "income")]

# fill all Nan values with None. Note that NONE is string here.
# it does not matter as all are categories
for col in features:
    df.loc[:, col] = df[col].astype(str).fillna("NONE")
    
# we create a new column called kfold and fill it with -1
df["kfold"] = -1

# the next step is to randomize the rows of the data
df = df.sample(frac=1).reset_index(drop=True)

# fetch labels
y = df.income.values

# initiate the kfold class from model_selection module
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f


In [None]:
#### We do first logistic with one hot encoding

In [39]:
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model

def run_logistic_ohe(fold, data):
    df = data.copy()

    print(f"df shape {df.shape}")
    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # fit ohe on training + validation features
    full_data = pd.concat([df_train[features], df_valid[features]], axis = 0)
    ohe.fit(full_data[features])

    # transform to get training data
    x_train = ohe.transform(df_train[features])
    
    # transform to get validation data
    x_valid = ohe.transform(df_valid[features])   
    
    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()

    # fit model on training data
    model.fit(x_train, df_train.income.values)

    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    train_preds = model.predict_proba(x_train)[:, 1]
    auc_train = metrics.roc_auc_score(df_train.income.values, train_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC valid = {auc} auc_train = {auc_train}")

In [40]:
for fold in range(5):
    run_logistic_ohe(fold, df)

df shape (32561, 11)
Fold = 0, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 1, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 2, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 3, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 4, AUC valid = 1.0 auc_train = 1.0


AUC is too good to be true :) How?

In [43]:
# Let me see one host encoding again
expt_df = pd.DataFrame({"a": [1 , 2, 3], "b": [0,0,1]})
expt_df

Unnamed: 0,a,b
0,1,0
1,2,0
2,3,1


In [45]:
ohe = preprocessing.OneHotEncoder()
ohe.fit(expt_df)

OneHotEncoder()

In [47]:
expt_df_trans = ohe.transform(expt_df)

In [50]:
expt_df_trans.todense()

matrix([[1., 0., 0., 1., 0.],
        [0., 1., 0., 1., 0.],
        [0., 0., 1., 0., 1.]])

We can see that output has three rows as number of rows in expt_df. 
* First column **a** has got three distinct values, so first three columns of expt_df_trans correspond to those three one hot encoded values. 
* Second column **b** has got 2 distinct values, so last 2 columns of expt_df_trans are one hot encodings of **b**
Therefore total number of columns are 3 + 2 = 5

###  Let us try XGBoost

In [56]:

def run_xgb_labelencoding(fold, data):
    df = data.copy()

    print(f"df shape {df.shape}")
    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # label encode the features
    for col in features:
        # initialize LabelEncoder for each feature column
        lbl = preprocessing.LabelEncoder()
        # fit label encoder on all data
        lbl.fit(data[col])
        # transform all the data
        df.loc[:, col] = lbl.transform(df[col])
        
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # transform to get training data
    x_train = df_train[features]
    
    # transform to get validation data
    x_valid = df_valid[features]
    
    # initialize xgboost model
    model = xgb.XGBClassifier(n_jobs=-1)

    # fit model on training data
    model.fit(x_train, df_train.income.values)

    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    train_preds = model.predict_proba(x_train)[:, 1]
    auc_train = metrics.roc_auc_score(df_train.income.values, train_preds)
    
    # print auc
    print(f"Fold = {fold}, AUC valid = {auc} auc_train = {auc_train}")

In [57]:
for fold in range(5):
    run_xgb_labelencoding(fold, df)

df shape (32561, 11)
Fold = 0, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 1, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 2, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 3, AUC valid = 1.0 auc_train = 1.0
df shape (32561, 11)
Fold = 4, AUC valid = 1.0 auc_train = 1.0


again even with XGBoost , then AUC is too good to be true :) How?

In [63]:
### Let me revist Label encoder
expt_df = pd.DataFrame({"a": ["A", "C", "B"], "b": ["AA","BB", "AA"]})
expt_df

Unnamed: 0,a,b
0,A,AA
1,C,BB
2,B,AA


In [64]:
lbl = preprocessing.LabelEncoder()
lbl.fit(expt_df["a"])
lbl.transform(expt_df["a"])

array([0, 2, 1])

In [66]:
lbl.fit(expt_df["b"])
lbl.transform(expt_df["b"])

array([0, 1, 0])