In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import  GaussianNB
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

In [3]:
print(X.head())

# getting rid from '.' as it's only there in test data and messing with the model
y['income'] = y['income'].apply(lambda x: x.strip('.'))

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country  
0          2174             0              40  United-States  
1             0             0              13  United-St

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income'] = y['income'].apply(lambda x: x.strip('.'))


In [4]:
# features(columns) with messing values
messing = X.columns[X.isnull().sum() > 0]
print("Columns with missing values:", messing)
    
# replacing messing values
for i in messing:    
    print('\n' ,X[i].unique())
    # print(X[i].value_counts())\n",
    X[i].replace('?', np.NaN, inplace=True)
    X.fillna(X[i].mode()[0],inplace= True)
    # print(X[i].value_counts())
    
print(X.columns[X.isnull().sum() > 0])

Columns with missing values: Index(['workclass', 'occupation', 'native-country'], dtype='object')

 ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked' nan]

 ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv' 'Private']

 ['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador'
 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands'
 'Private']
Index([], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[i].replace('?', np.NaN, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X[i].mode()[0],inplace= True)


In [5]:
# Select categorical columns
# categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
   
categorical_cols = X.select_dtypes(include=['object']).columns 
print("Columns with categorical values:", categorical_cols)
    
# Use pandas get_dummies to encode categorical variables
X_encoded_df = pd.get_dummies(X[categorical_cols]).astype(int)

X = pd.concat([X, X_encoded_df], axis = 1)
X = X.drop(categorical_cols, axis= 1)
print(X.head())

Columns with categorical values: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')
   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   workclass_Federal-gov  workclass_Local-gov  workclass_Never-worked  \
0                      0                    0                       0   
1                      0                    0                       0   
2                      0                    0                       0   
3                      0                    0                       

In [6]:
# Split the dataset into training and testing sets (two-one third method)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

print(X_train.shape)
print(X_test.shape)

# model training
model = GaussianNB()
model.fit(X_train,y_train)

(32561, 107)
(16281, 107)


  y = column_or_1d(y, warn=True)


In [7]:

# Calculate predictions
y_pred = model.predict(X_test)
    
# Calculate metrics
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))

TN, FP, FN, TP = confusion_matrix.ravel()

# Compute sensitivity (recall) and specificity
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)

print("Sensitivity (Recall):", sensitivity)
print("Specificity:", specificity)

posterior_probabilities = TP / (TP + FP)
print("\nPosterior Probability of making > 50k a year:", posterior_probabilities)

              precision    recall  f1-score   support

       <=50K       0.81      0.95      0.88     12312
        >50K       0.67      0.32      0.43      3969

    accuracy                           0.80     16281
   macro avg       0.74      0.63      0.65     16281
weighted avg       0.78      0.80      0.77     16281

Sensitivity (Recall): 0.3179642227261275
Specificity: 0.9496426250812215

Posterior Probability of making > 50k a year: 0.6705632306057385
