# Iman Kianian - 5'th Assignment of Machine Learning course - Winter 2022

In [1]:
import warnings
warnings.filterwarnings('ignore')

#### Import Necessary Packages

In [2]:
import pandas as pd
from pandas.api.types import is_string_dtype
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_validate
from sklearn import naive_bayes
from mixed_naive_bayes import MixedNB
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
#from sklearn import preprocessing

#### Import Dataset

In [3]:
df = pd.read_csv (r'HW5.csv')
df

Unnamed: 0,Age,WorkClass,FinancialWeight,Education,Education-num,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HourPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Part 1

#### Columns that are numerical

In [4]:
df_numerics_only = df.select_dtypes(include=np.number)
df_numerics_only.columns

Index(['Age', 'FinancialWeight', 'Education-num', 'CapitalGain', 'CapitalLoss',
       'HourPerWeek'],
      dtype='object')

#### Columns that are strings

In [5]:
df_nonnumerics_only = df.drop(list(df_numerics_only.columns), axis = 1)
df_nonnumerics_only.columns

Index(['WorkClass', 'Education', 'MaritalStatus', 'Occupation', 'Relationship',
       'Race', 'Sex', 'NativeCountry', 'Income'],
      dtype='object')

#### An Example of NaN data (Occupation , WorkClass)

In [6]:
df.iloc[4553]

Age                            22
WorkClass                     NaN
FinancialWeight            291407
Education                    12th
Education-num                   8
MaritalStatus       Never-married
Occupation                    NaN
Relationship            Own-child
Race                        Black
Sex                          Male
CapitalGain                     0
CapitalLoss                     0
HourPerWeek                    40
NativeCountry       United-States
Income                      <=50K
Name: 4553, dtype: object

#### Count of NaNs in each column

In [7]:
df.isna().sum()

Age                   0
WorkClass          1836
FinancialWeight       0
Education             0
Education-num         0
MaritalStatus         0
Occupation         1843
Relationship          0
Race                  0
Sex                   0
CapitalGain           0
CapitalLoss           0
HourPerWeek           0
NativeCountry       583
Income                0
dtype: int64

#### Clean data with Impute missing values for categorical variable

In [8]:
temp = df.isna().sum()
for ind,col in enumerate(df.columns):
    if temp[ind]>0:
        df.loc[df[col].isnull(),col] = df[col].mode()[0]

In [9]:
df.isna().sum()

Age                0
WorkClass          0
FinancialWeight    0
Education          0
Education-num      0
MaritalStatus      0
Occupation         0
Relationship       0
Race               0
Sex                0
CapitalGain        0
CapitalLoss        0
HourPerWeek        0
NativeCountry      0
Income             0
dtype: int64

#### The data which shown before, Repaired

In [10]:
df.iloc[4553]

Age                             22
WorkClass                  Private
FinancialWeight             291407
Education                     12th
Education-num                    8
MaritalStatus        Never-married
Occupation          Prof-specialty
Relationship             Own-child
Race                         Black
Sex                           Male
CapitalGain                      0
CapitalLoss                      0
HourPerWeek                     40
NativeCountry        United-States
Income                       <=50K
Name: 4553, dtype: object

#### Accuracy When we only predict <=50k for income. (BaseLine)

In [11]:
df[df['Income'] == ' <=50K'].shape[0] / df.shape[0]

0.7591904425539756

## Part 2

In [54]:
df_LR = df.copy()

#### Encode Categorical Data and Remove 'Education-num' feature from DataFrame beacuse it is duplicate.

In [55]:
del df_LR['Education-num']
df_numerics_only = df_LR.select_dtypes(include=np.number)
df_nonnumerics_only = df_LR.drop(list(df_numerics_only.columns), axis = 1)
df_nonmetrics = [ind for ind,lbl in enumerate(df_LR.columns) if lbl in df_nonnumerics_only.columns]
df_nonmetrics

[1, 3, 4, 5, 6, 7, 8, 12, 13]

In [56]:
df_LR

Unnamed: 0,Age,WorkClass,FinancialWeight,Education,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HourPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [57]:
for ind,col in zip(df_nonmetrics,df_nonnumerics_only.columns):
    #label_encoder = preprocessing.LabelEncoder()
    label_encoder = LabelEncoder()
    label_encoder.fit(df_LR[col])
    df_LR[col] = label_encoder.transform(df_LR[col])    

In [58]:
df

Unnamed: 0,Age,WorkClass,FinancialWeight,Education,Education-num,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HourPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [59]:
df_LR

Unnamed: 0,Age,WorkClass,FinancialWeight,Education,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HourPerWeek,NativeCountry,Income
0,39,6,77516,9,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,2,9,5,2,0,0,0,40,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,3,257302,7,2,12,5,4,0,0,0,38,38,0
32557,40,3,154374,11,2,6,0,4,1,0,0,40,38,1
32558,58,3,151910,11,6,0,4,4,0,0,0,40,38,0
32559,22,3,201490,11,4,0,3,4,1,0,0,20,38,0


#### Preparing X , y for training models.

In [60]:
X = df_LR.iloc[:,:df_LR.shape[1]-1]
y = df_LR.iloc[:,-1]
y=y.astype('int')

In [61]:
X

Unnamed: 0,Age,WorkClass,FinancialWeight,Education,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HourPerWeek,NativeCountry
0,39,6,77516,9,4,0,1,4,1,2174,0,40,38
1,50,5,83311,9,2,3,0,4,1,0,0,13,38
2,38,3,215646,11,0,5,1,4,1,0,0,40,38
3,53,3,234721,1,2,5,0,2,1,0,0,40,38
4,28,3,338409,9,2,9,5,2,0,0,0,40,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,3,257302,7,2,12,5,4,0,0,0,38,38
32557,40,3,154374,11,2,6,0,4,1,0,0,40,38
32558,58,3,151910,11,6,0,4,4,0,0,0,40,38
32559,22,3,201490,11,4,0,3,4,1,0,0,20,38


#### CV on LogisticRegression model.

In [62]:
clf = LogisticRegressionCV(cv=10, random_state=0).fit(X, y)

In [63]:
clf.score(X, y)

0.7991155062805196

## Part 3

#### Gaussian Naive Bayes

In [64]:
gnb = naive_bayes.GaussianNB()

In [65]:
np.mean(cross_validate(gnb, X, y, cv=10)['test_score'])

0.7948774007071521

#### Multinomial Naive Bayes

In [66]:
mnb = naive_bayes.MultinomialNB()

In [67]:
np.mean(cross_validate(mnb, X, y, cv=10)['test_score'])

0.782592680742819

#### Bernoulli Naive Bayes

In [68]:
bnb = naive_bayes.BernoulliNB()

In [69]:
np.mean(cross_validate(bnb, X, y, cv=10)['test_score'])

0.7284171155832193

#### Complement Naive Bayes

In [70]:
cnb = naive_bayes.ComplementNB()

In [71]:
np.mean(cross_validate(cnb, X, y, cv=10)['test_score'])

0.782592680742819

#### Naive Bayes with Gaussian for numerical features and Multinomial(categorical) distribution for Categorical data

In [72]:
del df_nonmetrics[-1]

In [73]:
mixnb = MixedNB(categorical_features=df_nonmetrics)

In [74]:
np.nanmean(cross_validate(mixnb, X, y, cv=10)['test_score'])

[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]
[ 8 16  7 14  6  5  2 41]


0.8222490989607968

# END.