In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [6]:
col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'martial_status', 'occupation',
             'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
             'hours_per_week', 'naive_country', 'y']
data1 = pd.read_csv('Data/adult1.csv', sep= ',', names=col_names)
data2 = pd.read_csv('Data/adult2.csv', sep=',', names=col_names)
data = pd.concat([data1, data2], ignore_index=True)
data.to_csv('full_data.csv', index=False)

In [7]:
data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


# Col names description
Age  
Workclass  
Fnlwgt (Final weight)  
Education
Education numerical  
Martial status  
Occupation  
Relationship  
Race  
Sex  
Capital gain  
Capital loss  


In [8]:
col_list = data.columns.tolist()
features = col_list.copy()
features.remove('y')
num_cols = [col for col in features if data[col].dtype=='int64']
object_cols = [col for col in features if data[col].dtype=='object']
data['y'].replace({' <=50K' : 0, ' <=50K.': 0, ' >50K': 1, ' >50K.': 1}, inplace=True)

In [9]:
label_encoder = LabelEncoder()
data_le = data.copy()
for object_col in object_cols:
    data_le[object_col] = label_encoder.fit_transform(data_le[object_col])
data_le.to_csv('encoded_data.csv', index=False)

In [10]:
data_oh = data.copy()
OHencoded = pd.get_dummies(data_oh[object_cols], drop_first=True)
df_oh = pd.concat([data_oh[num_cols], OHencoded, data['y']], axis=1)
df_oh.to_csv('OHencoded_data.csv', index=False)

In [None]:
data_std = data_oh.copy()
for num_col in num_cols:
    scaler = StandardScaler()
    data_std[num_col] = scaler.fit_transform(data_std[num_col].values.reshape(-1, 1))
data_std.to_csv('stand_scaled_data.csv', index=False)

In [None]:
data_mm = data_oh.copy()
for num_col in num_cols:
    scaler = MinMaxScaler()
    data_mm[num_col] = scaler.fit_transform(data_mm[num_col].values.reshape(-1, 1))
data_mm.to_csv('min_max_scaled_data.csv', index=False)

In [24]:
# Prepare basic model 
def CVlinearReg(df, n_splits=5,  rando_state=2021, features=features, if_print=True, 
                  *args, **kwargs):
    # Prepare KFOLD or StratifiedKFOLD
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rando_state)
    # Make copy of data
    data = df.copy()
    
    # Prepare empty lists
    train_results = []
    test_results = []
    preds = []
    
    # Prepare int to count fold s
    fold_number = 1
       
    for train, test in kf.split(data_le.index.values, data['y']):
        # Prepare linear model
        model = LinearRegression()
        model.fit(data.loc[train, features], data.loc[train, 'y'])
        
        # Make predictions
        train_preds = model.predict(data.loc[train, features])
        test_preds = model.predict(data.loc[test, features])
        preds.append(test_preds)
        
        # Prepare ROC_AUC score
        train_roc = metrics.roc_auc_score(data.loc[train, 'y'], train_preds)
        test_roc = metrics.roc_auc_score(data.loc[test, 'y'], test_preds)
        
        # Add ROC_AUC to lis
        train_results.append(train_roc)
        test_results.append(test_roc)
        
        if if_print:
            print(f'FOLD NUMBER: {fold_number}')
            print(f'ROC_AUC ON TRAIN SCORE {train_roc}')
            print(f'ROC_AUC ON TEST SCORE {test_roc}')
                  
        fold_number += 1 
        
    return train_results, test_results, preds 

In [25]:
train_results, test_results, preds = CVlinearReg(data_le)

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.8409916207178565
ROC_AUC ON TEST SCORE 0.8360843340137879
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.8405033082265352
ROC_AUC ON TEST SCORE 0.8385415569460883
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.839919239175099
ROC_AUC ON TEST SCORE 0.8399344429455599
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.8410937181787237
ROC_AUC ON TEST SCORE 0.8365386603104288
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.8378880909357174
ROC_AUC ON TEST SCORE 0.8476773651785559


In [58]:
np.mean(test_results)

0.8397261126521787

In [19]:
def CVlogisticReg(df, n_splits=5,  rando_state=2021, features=features, if_print=True, 
                  *args, **kwargs):
    # Prepare StratifiedKFOLD
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rando_state)
    
    # Make copy of data
    data = df.copy()
    
    # Prepare empty lists
    train_results = []
    test_results = []
    preds = []
    
    # Prepare int to count fold s
    fold_number = 1
       
    for train, test in kf.split(data_le.index.values, data['y']):
        # Prepare linear model
        model = LogisticRegression(**kwargs)
        model.fit(data.loc[train, features], data.loc[train, 'y'])
        
        # Make predictions
        train_preds = model.predict(data.loc[train, features])
        test_preds = model.predict(data.loc[test, features])
        preds.append(test_preds)
        
        # Prepare ROC_AUC score
        train_roc = metrics.roc_auc_score(data.loc[train, 'y'], train_preds)
        test_roc = metrics.roc_auc_score(data.loc[test, 'y'], test_preds)
        
        # Add ROC_AUC to lis
        train_results.append(train_roc)
        test_results.append(test_roc)
        
        if if_print:
            print(f'FOLD NUMBER: {fold_number}')
            print(f'ROC_AUC ON TRAIN SCORE {train_roc}')
            print(f'ROC_AUC ON TEST SCORE {test_roc}')
                  
        fold_number += 1 
        
    return train_results, test_results, preds 

In [20]:
CVlogisticReg(data_le, max_iter=1000)

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.6558496689168114
ROC_AUC ON TEST SCORE 0.6433115659217351
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.6169082096450392
ROC_AUC ON TEST SCORE 0.6190289701466782
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.6155127062018699
ROC_AUC ON TEST SCORE 0.6216649745912286
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.6164403816358268
ROC_AUC ON TEST SCORE 0.6118822046006831
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.6170989214858696
ROC_AUC ON TEST SCORE 0.6181444384615743


([0.6558496689168114,
  0.6169082096450392,
  0.6155127062018699,
  0.6164403816358268,
  0.6170989214858696],
 [0.6433115659217351,
  0.6190289701466782,
  0.6216649745912286,
  0.6118822046006831,
  0.6181444384615743],
 [array([0, 0, 1, ..., 0, 0, 0]),
  array([0, 0, 0, ..., 0, 0, 0]),
  array([0, 0, 0, ..., 0, 0, 0]),
  array([0, 0, 0, ..., 0, 0, 1]),
  array([0, 0, 0, ..., 0, 0, 0])])

In [13]:
roc_plitter = PlotRoc()

No handles with labels found to put in legend.


In [18]:
data_le

Unnamed: 0,age,workclass,fnlwgt,education,education_num,martial_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,naive_country,y
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,4,215419,9,13,0,10,1,4,0,0,0,36,39,0
48838,64,0,321403,11,9,6,0,2,2,1,0,0,40,39,0
48839,38,4,374983,9,13,2,10,0,4,1,0,0,50,39,0
48840,44,4,83891,9,13,0,1,3,1,1,5455,0,40,39,0
