In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from category_encoders import CountEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

In [2]:
train_df = pd.read_csv('train.csv', na_values = '?')
train_df.isnull().sum()

age                  0
workclass         1268
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1273
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     427
income               0
dtype: int64

In [3]:
test_df = pd.read_csv('test.csv', na_values = '?')
test_df.isnull().sum()

age                 0
workclass         568
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation        570
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    156
income              0
dtype: int64

In [4]:
train_df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,58,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
1,62,Private,109463,Some-college,10,Separated,Sales,Unmarried,White,Female,0,1617,33,United-States,<=50K
2,33,Private,137088,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,40,Ecuador,<=50K
3,24,Private,117767,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,<=50K
4,67,Self-emp-not-inc,431426,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,2,United-States,<=50K


In [5]:
X_train = train_df.iloc[:, :-1].values
y_train = train_df.iloc[:, -1].values

X_test = test_df.iloc[:, :-1].values
y_test = test_df.iloc[:, -1].values

## Missing Value

In [6]:
def MissingValue(X, y):
    workclass_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
    X[:, 1] = workclass_imputer.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    
    occupation_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
    X[:, 6] = occupation_imputer.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    
    country_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
    X[:, -1] = country_imputer.fit_transform(X[:, -1].reshape(-1, 1)).ravel()
    return X, y



In [7]:
X_train, y_train = MissingValue(X_train, y_train)
X_test, y_test = MissingValue(X_train, y_train)

In [8]:
train_dataset = pd.DataFrame.from_records(X_train)
train_dataset.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

In [9]:
test_dataset = pd.DataFrame.from_records(X_test)
test_dataset.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

## Chuẩn hóa dữ liệu

In [10]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,58,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
1,62,Private,109463,Some-college,10,Separated,Sales,Unmarried,White,Female,0,1617,33,United-States,<=50K
2,33,Private,137088,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,40,Ecuador,<=50K
3,24,Private,117767,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,<=50K
4,67,Self-emp-not-inc,431426,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,2,United-States,<=50K


In [11]:
# count_encoder = CountEncoder()
# s= count_encoder.fit_transform(X_train[:, 1].reshape(-1, 1))

In [12]:
def Normalize(X, y):
    label_encoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder()
    min_max_encoder = MinMaxScaler()
    count_encoder = CountEncoder()
    X[:, 0] = min_max_encoder.fit_transform(X[:, 0].reshape(-1, 1)).ravel()

#     X[:, 1] = label_encoder.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    X[:, 1] = count_encoder.fit_transform(X[:, 1].reshape(-1, 1)).values.ravel()
    X[:, 1] = min_max_encoder.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    
    X[:, 2] = min_max_encoder.fit_transform(X[:, 2].reshape(-1, 1)).ravel()
    
#     X[:, 3] = label_encoder.fit_transform(X[:, 3].reshape(-1, 1)).ravel()
    X[:, 3] = count_encoder.fit_transform(X[:, 3].reshape(-1, 1)).values.ravel()
    X[:, 3] = min_max_encoder.fit_transform(X[:, 3].reshape(-1, 1)).ravel()
    
    
    X[:, 5] = label_encoder.fit_transform(X[:, 5].reshape(-1, 1)).ravel()
    
    X[:, 6] = label_encoder.fit_transform(X[:, 6].reshape(-1, 1)).ravel()
    X[:, 6] = count_encoder.fit_transform(X[:, 6].reshape(-1, 1)).values.ravel()
    X[:, 6] = min_max_encoder.fit_transform(X[:, 6].reshape(-1, 1)).ravel()
    
    
    X[:, 7] = label_encoder.fit_transform(X[:, 6].reshape(-1, 1)).ravel()
    X[:, 8] = label_encoder.fit_transform(X[:, 8].reshape(-1, 1)).ravel()
    X[:, 9] = label_encoder.fit_transform(X[:, 8].reshape(-1, 1)).ravel()
#     X[:, 10] = min_max_encoder.fit_transform(X[:, 10].reshape(-1, 1)).ravel()
#     X[:, 11] = min_max_encoder.fit_transform(X[:, 11].reshape(-1, 1)).ravel()
    
    X[:, 13] = count_encoder.fit_transform(X[:, 13].reshape(-1, 1)).values.ravel()
    X[:, 13] = min_max_encoder.fit_transform(X[:, 13].reshape(-1, 1)).ravel()
#     X[:, 13] = label_encoder.fit_transform(X[:, 13].reshape(-1, 1))
    

    
    y = label_encoder.fit_transform(y)
    return X, y

In [13]:
X_train, y_train = Normalize(X_train, y_train)
X_test, y_test = Normalize(X_train, y_train)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(22792, 14) (22792,) (22792, 14) (22792,)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [14]:
clf = GaussianNB()
# clf = MultinomialNB()
# clf = BernoulliNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [15]:
print("Accuracy score:", accuracy_score(y_test, y_pred))

Accuracy score: 0.8054580554580555


In [16]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,58,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
1,62,Private,109463,Some-college,10,Separated,Sales,Unmarried,White,Female,0,1617,33,United-States,<=50K
2,33,Private,137088,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,40,Ecuador,<=50K
3,24,Private,117767,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,<=50K
4,67,Self-emp-not-inc,431426,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,2,United-States,<=50K


In [17]:
train_df['race'].value_counts().count()

5