In [61]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [10]:
# Load the raw data file. It has no header row, so the column names are
# assigned explicitly; the last column, "y", is the label used for modelling.
# String values keep the leading space that follows each comma in the file
# (e.g. " Male"), which later cells rely on when matching values.
df = pd.read_csv("adult/adult.data", header=None)
df.columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "y",
]
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [21]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(32561, 15)

In [142]:
# Number of rows for each distinct value of the "sex" column.
df["sex"].value_counts()

 Male      21790
 Female    10771
Name: sex, dtype: int64

In [97]:
# Name of the label column.
ycol = "y"

# Feature columns grouped by type. "sex" appears in neither list; the
# train/test split cell appends it separately.
fields_numeric = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
fields_categorical = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "native-country"]

# Every feature column. The train/test split cell selects
# `fields_all + ["sex"]`, so this name must exist before that cell runs;
# without it a fresh kernel fails with NameError.
fields_all = fields_numeric + fields_categorical

# Feature subset 1 (hand-picked) and subset 2 (everything else), per type.
fields_numeric1 = ["age", "capital-gain", "hours-per-week"]
fields_numeric2 = [f for f in fields_numeric if f not in fields_numeric1]

fields_categorical1 = ["workclass", "education", "relationship", "race"]
fields_categorical2 = [f for f in fields_categorical if f not in fields_categorical1]

# Column lists handed to the two classifiers: numeric columns first, then
# categorical ones. Together the two lists cover `fields_all` with no overlap.
fields_all1 = fields_numeric1 + fields_categorical1
fields_all2 = fields_numeric2 + fields_categorical2

In [98]:
# Select the model features plus "sex". "sex" is in neither feature subset
# passed to the classifiers below; it is carried along so that every held-out
# row keeps its population (M or F) for the later per-group comparison.
X = df.loc[:, fields_all + ["sex"]]
y = df.loc[:, ycol]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def obtain_classifier(X_train, X_test, y_train, y_test, fields_categorical, classifier,
                      remainder="drop"):
    """Fit a one-hot-encoding + classifier pipeline and report its test performance.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Training and test features. Must contain every column named in
        ``fields_categorical``.
    y_train, y_test : array-like
        Training and test labels.
    fields_categorical : list of str
        Columns to one-hot encode.
    classifier : estimator
        Unfitted scikit-learn classifier used as the final pipeline step.
    remainder : {"drop", "passthrough"} or estimator, default "drop"
        Forwarded to ``ColumnTransformer``. With the default, every column
        that is NOT listed in ``fields_categorical`` (e.g. numeric features)
        is discarded before the classifier sees the data. Pass
        ``"passthrough"`` to keep those columns unchanged.

    Returns
    -------
    (pipeline, y_pred)
        The fitted pipeline and its predictions for ``X_test``.

    Side effects
    ------------
    Prints the classification report for the test set.
    """
    # Reference: https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py
    # handle_unknown="ignore": a category that only occurs in the test split
    # is encoded as all zeros instead of raising at predict time. Results are
    # unchanged whenever every test category was seen during fitting.
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, fields_categorical),
        ],
        remainder=remainder,
    )
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))

    return pipeline, y_pred


# NOTE(review): obtain_classifier lists only the categorical columns in its
# ColumnTransformer, so the numeric columns in these subsets appear to be
# dropped before fitting — confirm this is intended.

# Classifier 1: logistic regression on feature subset 1.
clf1, y_pred1 = obtain_classifier(
    X_train[fields_all1],
    X_test[fields_all1],
    y_train,
    y_test,
    fields_categorical1,
    LogisticRegression(),
)

# Classifier 2: random forest (fixed seed) on feature subset 2.
clf2, y_pred2 = obtain_classifier(
    X_train[fields_all2],
    X_test[fields_all2],
    y_train,
    y_test,
    fields_categorical2,
    RandomForestClassifier(random_state=0),
)

              precision    recall  f1-score   support

       <=50K       0.84      0.94      0.89      4942
        >50K       0.71      0.45      0.55      1571

    accuracy                           0.82      6513
   macro avg       0.78      0.69      0.72      6513
weighted avg       0.81      0.82      0.81      6513

              precision    recall  f1-score   support

       <=50K       0.86      0.91      0.88      4942
        >50K       0.64      0.52      0.57      1571

    accuracy                           0.81      6513
   macro avg       0.75      0.71      0.73      6513
weighted avg       0.80      0.81      0.81      6513



In [99]:
# Build the homework table: one row per held-out example, indexed by the
# original row label.
# Layout: id (index) | sex (class) | Actual (y) | Test1 | Test2
# `y_test` is aligned on the index; the prediction arrays are assigned
# positionally, in the same row order as `X_test`.
df_hw = X_test[["sex"]].assign(
    Actual=y_test,
    Test1=y_pred1,
    Test2=y_pred2,
)

print(df_hw.shape)
df_hw.head()

(6513, 4)


Unnamed: 0,sex,Actual,Test1,Test2
14160,Female,<=50K,<=50K,<=50K
27048,Female,<=50K,<=50K,>50K
28868,Male,>50K,>50K,>50K
5667,Female,<=50K,<=50K,<=50K
7827,Male,<=50K,<=50K,<=50K


In [109]:
# Encode the string values as integers across all columns:
#   sex:    " Female" -> 1, " Male" -> 2
#   income: " <=50K"  -> 0, " >50K" -> 1
# The keys keep the leading space that the values carry in the data.
df_hw = df_hw.replace(
    {
        " Female": 1,
        " Male": 2,
        " <=50K": 0,
        " >50K": 1,
    }
)

In [110]:
# Write the homework table to disk; the row index is written as the first column.
df_hw.to_csv("HW_adult.csv", index=True)

In [135]:
# Confusion-matrix rates for a binary classifier

def calculate_rates(y_test, y_pred):
    """Return ``(tpr, tnr, fpr, fnr)`` for actual labels vs. predictions.

    The rates are read from ``confusion_matrix(y_test, y_pred)``, whose rows
    are actual classes and whose columns are predicted classes. The class at
    position 0 is treated as negative and the class at position 1 as positive.
    """
    cm = confusion_matrix(y_test, y_pred)

    # Row 0: actual negatives; row 1: actual positives.
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    false_neg, true_pos = cm[1, 0], cm[1, 1]

    actual_pos = true_pos + false_neg
    actual_neg = true_neg + false_pos

    return (
        true_pos / actual_pos,   # true positive rate
        true_neg / actual_neg,   # true negative rate
        false_pos / actual_neg,  # false positive rate
        false_neg / actual_pos,  # false negative rate
    )

# One line per classifier: TPR & TNR & FPR & FNR, rounded to 3 decimals and
# joined with " & ".
print(*(round(rate, 3) for rate in calculate_rates(df_hw.Actual, df_hw.Test1)), sep=" & ")
print(*(round(rate, 3) for rate in calculate_rates(df_hw.Actual, df_hw.Test2)), sep=" & ")

def calculate_base_rates(df_slice):
    """Return the share of rows with ``Actual == 1`` for each sex group.

    Parameters
    ----------
    df_slice : DataFrame
        Must have a ``sex`` column coded 1 / 2 and an ``Actual`` column in
        which 1 marks the positive outcome.

    Returns
    -------
    (base_rate_1, base_rate_2)
        Fraction of positive rows among ``sex == 1`` and among ``sex == 2``.
        A group with no positive rows yields 0.0; a group with no rows at
        all yields NaN.
    """
    def _positive_share(group_code):
        # Share of positive outcomes within one sex group.
        outcomes = df_slice.loc[df_slice["sex"] == group_code, "Actual"]
        # The mean of a boolean mask is the fraction of True values. Unlike
        # value_counts(normalize=True)[1], it does not raise KeyError when
        # the group contains no positive rows.
        return (outcomes == 1).mean()

    return _positive_share(1), _positive_share(2)

# Share of rows with Actual == 1 within each sex group (sex == 1, then sex == 2).
print(calculate_base_rates(df_hw))

0.446 & 0.942 & 0.058 & 0.554
0.523 & 0.906 & 0.094 & 0.477
(0.10959548447789276, 0.30499202188283564)


In [117]:
# Class counts of the held-out labels.
y_test.value_counts()

 <=50K    4942
 >50K     1571
Name: y, dtype: int64