## **Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.**

#### Approach 1

In [68]:
#Import Libraries
import pandas as pd

In [69]:
# Import data from local system
file_path = r'C:\NumpyNinja\ML classes -Project\ML Assignments\adult\adult.data'

df = pd.read_csv(file_path, delimiter=",", names=["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","income"])

df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


#### **The number of rows here are 32561 and it is not matching with the number of instances given in dataset link. We decided to move forward with approach 2**

#### Approach 2

In [3]:
#Installing ucimlrepo package
!pip install ucimlrepo



In [4]:
#Import Libraries
import pandas as pd
import numpy as np
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')

from ucimlrepo import fetch_ucirepo, list_available_datasets


from IPython.display import Markdown, display
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np
from sklearn import metrics


def printmd(string):
    display(Markdown(string))
    
    
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import OneHotEncoder

In [5]:
#List of UCI repositories with ID
list_available_datasets()

-------------------------------------
The following datasets are available:
-------------------------------------
Dataset Name                                                                            ID    
------------                                                                            --    
Abalone                                                                                 1     
Adult                                                                                   2     
Annealing                                                                               3     
Audiology (Standardized)                                                                8     
Auto MPG                                                                                9     
Automobile                                                                              10    
Balance Scale                                                                           12    
Balloons                       

In [6]:
# Fetch the 'adult' dataset from UCI repository where id for dataset is 2
adultDataset = fetch_ucirepo(id=2)

In [7]:
#Creating dataframe by joining features and target
Xdata = pd.DataFrame(adultDataset.data.features)
Ydata = pd.DataFrame(adultDataset.data.targets, columns=['income'])
adultDataFrame = pd.concat([Xdata,Ydata], axis = 1)
adultDataFrame

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [8]:
# Displaying first 10 rows of dataset
adultDataFrame.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [9]:
# Displaying 5 rows using sample function
adultDataFrame.sample(n=5,random_state = 42)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
7762,18,Private,423024,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,20,United-States,<=50K
23881,17,Private,178953,12th,8,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,<=50K
30507,25,Local-gov,348986,HS-grad,9,Never-married,Handlers-cleaners,Other-relative,Black,Male,0,0,40,United-States,<=50K
28911,20,Private,218215,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,30,United-States,<=50K
19484,47,Private,244025,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,Amer-Indian-Eskimo,Male,0,0,56,Puerto-Rico,<=50K


In [10]:
# Examining columns using info
adultDataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [11]:
# Checking for missing values using isnull
adultDataFrame.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64

In [12]:
# Summary statistics for numerical columns using describe 
adultDataFrame.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [13]:
# For checking summary statistics for categorical columns
adultDataFrame.describe(include='object')

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
count,47879,48842,48842,47876,48842,48842,48842,48568,48842
unique,9,16,7,15,6,5,2,42,4
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,33906,15784,22379,6172,19716,41762,32650,43832,24720


**Data Cleaning:
Handle missing values by either removing rows or filling them with appropriate values (mean, median, mode, etc.). Correct data types if needed. Address any outliers or anomalies**

In [14]:
#checking for null values in the columns
adultDataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [15]:
adultDataFrame.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64

In [16]:
(adultDataFrame == '?').sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [17]:

adult_df_cleaned = adultDataFrame.replace('?', np.nan)
adult_df_cleaned.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

In [18]:
adult_df_cleaned['workclass'].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', nan, 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [19]:
adult_df_cleaned['workclass']

0               State-gov
1        Self-emp-not-inc
2                 Private
3                 Private
4                 Private
               ...       
48837             Private
48838                 NaN
48839             Private
48840             Private
48841        Self-emp-inc
Name: workclass, Length: 48842, dtype: object

In [20]:
for column in adult_df_cleaned.columns:
    print(f"--------------------{column.title()}-------------------------")
    print(adult_df_cleaned[column].value_counts())

--------------------Age-------------------------
age
36    1348
35    1337
33    1335
23    1329
31    1325
      ... 
88       6
85       5
87       3
89       2
86       1
Name: count, Length: 74, dtype: int64
--------------------Workclass-------------------------
workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64
--------------------Fnlwgt-------------------------
fnlwgt
203488    21
120277    19
190290    19
125892    18
126569    18
          ..
286983     1
185942     1
234220     1
214706     1
350977     1
Name: count, Length: 28523, dtype: int64
--------------------Education-------------------------
education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8

In [21]:
row_withnullval = adult_df_cleaned[adult_df_cleaned['workclass'].isnull()]
row_withnullval.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
27,54,,180211,Some-college,10,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,,293936,7th-8th,4,Married-spouse-absent,,Not-in-family,White,Male,0,0,40,,<=50K
69,25,,200681,Some-college,10,Never-married,,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,,212759,10th,6,Married-civ-spouse,,Husband,White,Male,0,0,2,United-States,<=50K
106,17,,304873,10th,6,Never-married,,Own-child,White,Female,34095,0,32,United-States,<=50K
128,35,,129305,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,<=50K
149,43,,174662,Some-college,10,Divorced,,Not-in-family,White,Female,0,0,40,United-States,<=50K
154,52,,252903,HS-grad,9,Divorced,,Not-in-family,White,Male,0,0,45,United-States,>50K
160,68,,38317,1st-4th,2,Divorced,,Not-in-family,White,Female,0,0,20,United-States,<=50K
187,53,,135105,Bachelors,13,Divorced,,Not-in-family,White,Female,0,0,50,United-States,<=50K


In [22]:
adult_df_mode = adult_df_cleaned['workclass'].fillna(adult_df_cleaned['workclass'].mode().iloc[0])
adult_df_mode.value_counts()


workclass
Private             36705
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [23]:
adult_df_cleaned['workclass'].value_counts()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [24]:
adult_df_cleaned['income'].value_counts()

income
<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: count, dtype: int64

In [25]:
adult_df_cleaned['income'].replace('<=50K.', '<=50K',inplace = True)
adult_df_cleaned['income'].replace('>50K.', '>50K',inplace = True)
adult_df_cleaned['income'].value_counts()

income
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [26]:
adult_df_cleaned['occupation'].value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [202]:
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [49]:
label_encoder = LabelEncoder()

# Apply LabelEncoder to multiple columns
for column in ['workclass','education', 'marital-status','occupation','relationship','race','sex',
               'native-country','income']:
    adult_df_cleaned[column + '_encoded'] = label_encoder.fit_transform(adult_df_cleaned[column])

print(adult_df_cleaned)


NameError: name 'LabelEncoder' is not defined

In [112]:
adult_df_cleaned = adult_df_cleaned.drop(columns =['workclass','education', 'marital-status','occupation','relationship','race','sex',
               'native-country','income'])

adult_df_cleaned.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_encoded,education_encoded,marital-status_encoded,occupation_encoded,relationship_encoded,race_encoded,sex_encoded,native-country_encoded,income_encoded
0,39,77516,13,2174,0,40,6,9,4,0,1,4,1,38,0
1,50,83311,13,0,0,13,5,9,2,3,0,4,1,38,0
2,38,215646,9,0,0,40,3,11,0,5,1,4,1,38,0
3,53,234721,7,0,0,40,3,1,2,5,0,2,1,38,0
4,28,338409,13,0,0,40,3,9,2,9,5,2,0,4,0


In [113]:
impute_it = IterativeImputer()
impute_it.fit_transform(adult_df_cleaned)

array([[3.90000e+01, 7.75160e+04, 1.30000e+01, ..., 1.00000e+00,
        3.80000e+01, 0.00000e+00],
       [5.00000e+01, 8.33110e+04, 1.30000e+01, ..., 1.00000e+00,
        3.80000e+01, 0.00000e+00],
       [3.80000e+01, 2.15646e+05, 9.00000e+00, ..., 1.00000e+00,
        3.80000e+01, 0.00000e+00],
       ...,
       [3.80000e+01, 3.74983e+05, 1.30000e+01, ..., 1.00000e+00,
        3.80000e+01, 0.00000e+00],
       [4.40000e+01, 8.38910e+04, 1.30000e+01, ..., 1.00000e+00,
        3.80000e+01, 0.00000e+00],
       [3.50000e+01, 1.82148e+05, 1.30000e+01, ..., 1.00000e+00,
        3.80000e+01, 1.00000e+00]])

In [119]:
adult_df_cleaned['occupation_encoded'].isnull().sum()

np.int64(0)

In [120]:
adultDataFrame['occupation'].isnull().sum()

np.int64(966)

In [136]:
adult_df_cleaned['native-country_encoded'].value_counts()

native-country_encoded
38    43832
25      951
41      857
29      295
10      206
32      184
1       182
7       155
18      151
4       138
8       127
2       122
34      115
22      106
21      105
5       103
23       92
12       88
30       87
39       86
3        85
13       75
31       67
35       65
19       59
11       49
26       49
28       46
6        45
9        38
20       37
16       30
36       30
0        28
37       27
24       23
40       23
27       23
33       21
15       20
17       19
14        1
Name: count, dtype: int64

In [123]:
adultDataFrame['workclass'].value_counts()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
?                    1836
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [128]:
adult_df_cleaned[adult_df_cleaned['workclass_encoded']== '?']

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_encoded,education_encoded,marital-status_encoded,occupation_encoded,relationship_encoded,race_encoded,sex_encoded,native-country_encoded,income_encoded


In [131]:
adult_df_cleaned.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_encoded,education_encoded,marital-status_encoded,occupation_encoded,relationship_encoded,race_encoded,sex_encoded,native-country_encoded,income_encoded
0,39,77516,13,2174,0,40,6,9,4,0,1,4,1,38,0
1,50,83311,13,0,0,13,5,9,2,3,0,4,1,38,0
2,38,215646,9,0,0,40,3,11,0,5,1,4,1,38,0
3,53,234721,7,0,0,40,3,1,2,5,0,2,1,38,0
4,28,338409,13,0,0,40,3,9,2,9,5,2,0,4,0


In [132]:
adultDataFrame[adultDataFrame['workclass'] == '?']

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,?,293936,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
69,25,?,200681,Some-college,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
106,17,?,304873,10th,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32531,30,?,33811,Bachelors,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32541,41,?,202822,HS-grad,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [135]:
adult_df_cleaned.iloc[61]

age                           32
fnlwgt                    293936
education-num                  4
capital-gain                   0
capital-loss                   0
hours-per-week                40
workclass_encoded              8
education_encoded              5
marital-status_encoded         3
occupation_encoded            14
relationship_encoded           1
race_encoded                   4
sex_encoded                    1
native-country_encoded        41
income_encoded                 0
Name: 61, dtype: int64

In [143]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

X = adult_df_cleaned.drop(columns = ['income_encoded'])
y = adult_df_cleaned['income_encoded']
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print(metrics.accuracy_score(pred,y_test))


0.789258172387907


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [144]:
adultDataFrame['occupation'].value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
?                    1843
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [32]:
test_data = adult_df_cleaned[(adult_df_cleaned['workclass'].isnull())].copy()
test_label = test_data.workclass

train_data = adult_df_cleaned[(adult_df_cleaned['workclass'].notnull())].copy()
train_label = train_data.workclass

test_data.drop(columns = ['workclass'], inplace = True)
train_data.drop(columns = ['workclass'], inplace = True)

In [33]:
# Initialize OneHotEncoder with sparse_output instead of sparse
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit the encoder on the training data
train_encoded = encoder.fit_transform(train_data)

# Apply the same encoding to the test data
test_encoded = encoder.transform(test_data)

# Convert to DataFrame for easier handling
train_data_encoded = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out())
test_data_encoded = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out())

In [None]:
#Train and predict the model using logistic regression
log_reg = LogisticRegression()
log_reg.fit(train_data_encoded, train_label)
log_reg_pred = log_reg.predict(test_data_encoded)

In [None]:
#Train and predict the model using decision tree classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data_encoded, train_label)
clf_pred = clf.predict(test_data_encoded)

In [None]:
#Train and predict the model using random forest classifier
r_forest = RandomForestClassifier(n_estimators=10)
r_forest.fit(train_data_encoded, train_label)
r_forest_pred = r_forest.predict(test_data_encoded)

In [None]:
# Determine the majority class for 'workclass'
majority_class = adult_df_cleaned.workclass.value_counts().index[0]

# Create DataFrame for predictions from different models
pred_df = pd.DataFrame({'RFor': r_forest_pred, 'DTree': clf_pred, 'LogReg': log_reg_pred})
#pred_df

#Determine the overall prediction using majority voting
overall_pred = pred_df.apply(lambda x: x.value_counts().index[0] if x.value_counts()[0] > 1 else majority_class, axis=1)
#overall_pred

# Ensure the 'overall_pred' has the same index as the rows we want to update
mask = adult_df_cleaned['workclass'].isnull()
adult_df_cleaned.loc[mask, 'workclass'] = overall_pred.values

# Verify the results
print(adult_df_cleaned.workclass.value_counts())
print(adult_df_cleaned.workclass.unique())