In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from category_encoders import CountEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

In [4]:
train_df = pd.read_csv('train.csv', na_values = '?')
train_df.isnull().sum()
train_df.loc[ train_df['age'] <= 32, 'new_age'] = 0
train_df.loc[(train_df['age'] > 16) & (train_df['age'] <= 47), 'new_age'] = 1
train_df.loc[(train_df['age'] > 47) & (train_df['age'] <= 61), 'new_age'] = 2
train_df.loc[(train_df['age'] > 61) & (train_df['age'] <= 76), 'new_age'] = 3
train_df.loc[ train_df['age'] > 76, 'new_age'] = 4

In [5]:
test_df = pd.read_csv('test.csv', na_values = '?')
test_df.isnull().sum()
test_df.shape
test_df.loc[ test_df['age'] <= 32, 'new_age'] = 0
test_df.loc[(test_df['age'] > 16) & (test_df['age'] <= 47), 'new_age'] = 1
test_df.loc[(test_df['age'] > 47) & (test_df['age'] <= 61), 'new_age'] = 2
test_df.loc[(test_df['age'] > 61) & (test_df['age'] <= 76), 'new_age'] = 3
test_df.loc[ test_df['age'] > 76, 'new_age'] = 4

In [6]:
train_df['capital'] = train_df['capital.gain'] - train_df['capital.loss']
test_df['capital'] = test_df['capital.gain'] - test_df['capital.loss']

In [7]:
train_df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,new_age,capital
0,58,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,2.0,0
1,62,Private,109463,Some-college,10,Separated,Sales,Unmarried,White,Female,0,1617,33,United-States,<=50K,3.0,-1617
2,33,Private,137088,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,40,Ecuador,<=50K,1.0,0
3,24,Private,117767,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,<=50K,1.0,0
4,67,Self-emp-not-inc,431426,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,2,United-States,<=50K,3.0,0


In [8]:
train_df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income', 'new_age', 'capital'],
      dtype='object')

In [7]:
feature_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'capital']
label_columns = 'income'

In [8]:
X_train = train_df.loc[:, feature_columns].values
y_train = train_df.loc[:, label_columns].values

X_test = test_df.loc[:, feature_columns].values
y_test = test_df.loc[:, label_columns].values

## Missing Value

In [9]:
def MissingValue(X, y):
    workclass_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
    X[:, 1] = workclass_imputer.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    
    occupation_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
    X[:, 6] = occupation_imputer.fit_transform(X[:, 1].reshape(-1, 1)).ravel()
    
    country_imputer = SimpleImputer(np.nan, strategy = 'most_frequent')
    X[:, 13] = country_imputer.fit_transform(X[:, 13].reshape(-1, 1)).ravel()
    return X, y



In [10]:
X_train, y_train = MissingValue(X_train, y_train)
X_test, y_test = MissingValue(X_test, y_test)

In [11]:
train_dataset = pd.DataFrame.from_records(X_train)
train_dataset.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
dtype: int64

In [12]:
test_dataset = pd.DataFrame.from_records(X_test)
test_dataset.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
dtype: int64

## Chuẩn hóa dữ liệu

In [13]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,capital
0,58,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,0
1,62,Private,109463,Some-college,10,Separated,Sales,Unmarried,White,Female,0,1617,33,United-States,<=50K,-1617
2,33,Private,137088,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,40,Ecuador,<=50K,0
3,24,Private,117767,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,<=50K,0
4,67,Self-emp-not-inc,431426,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,2,United-States,<=50K,0


In [14]:
def Normalize(X, y):
    label_encoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder()
    min_max_encoder = MinMaxScaler()
    standard_encoder = StandardScaler()
    count_encoder = CountEncoder()
    X[:, 0] = min_max_encoder.fit_transform(X[:, 0].reshape(-1, 1)).ravel() #age

    X[:, 1] = label_encoder.fit_transform(X[:, 1].reshape(-1, 1)).ravel() #workclass
    
    X[:, 2] = min_max_encoder.fit_transform(X[:, 2].reshape(-1, 1)).ravel() #fnlwgt
    
#     X[:, 3] = label_encoder.fit_transform(X[:, 3].reshape(-1, 1)).ravel()
    X[:, 3] = count_encoder.fit_transform(X[:, 3].reshape(-1, 1)).values.ravel() #education
    X[:, 3] = min_max_encoder.fit_transform(X[:, 3].reshape(-1, 1)).ravel()
    
    X[:, 5] = label_encoder.fit_transform(X[:, 5].reshape(-1, 1)).ravel() #marital.status
    
#     X[:, 6] = label_encoder.fit_transform(X[:, 6].reshape(-1, 1)).ravel() #occupation
    X[:, 6] = count_encoder.fit_transform(X[:, 6].reshape(-1, 1)).values.ravel()
    X[:, 6] = min_max_encoder.fit_transform(X[:, 6].reshape(-1, 1)).ravel()
    
    
    X[:, 7] = label_encoder.fit_transform(X[:, 6].reshape(-1, 1)).ravel() #relationship
    
    X[:, 8] = label_encoder.fit_transform(X[:, 8].reshape(-1, 1)).ravel() #race
    
    X[:, 9] = label_encoder.fit_transform(X[:, 8].reshape(-1, 1)).ravel() #sex
#     X[:, 10] = min_max_encoder.fit_transform(X[:, 10].reshape(-1, 1)).ravel() #capital.gain
#     X[:, 11] = min_max_encoder.fit_transform(X[:, 11].reshape(-1, 1)).ravel() #capital.loss

    X[:, 12] = min_max_encoder.fit_transform(X[:, 12].reshape(-1, 1)).ravel() #hours.per.week
    
    X[:, 13] = count_encoder.fit_transform(X[:, 13].reshape(-1, 1)).values.ravel() #native.country
    X[:, 13] = min_max_encoder.fit_transform(X[:, 13].reshape(-1, 1)).ravel()
    
    X[:, 14] = min_max_encoder.fit_transform(X[:, 14].reshape(-1, 1)).ravel()
    
#     race_one_hot = one_hot_encoder.fit_transform(X[:, 1].reshape(-1, 1)).toarray()
#     X = np.delete(X, 1, axis = 1)
#     X = np.concatenate((X[:, 0:1], race_one_hot, X[:, 1:]), axis = 1)
    
    X = np.delete(X, 11, axis = 1)
    X = np.delete(X, 10, axis = 1)
    y = label_encoder.fit_transform(y) #income
    return X, y

In [15]:
X_train, y_train = Normalize(X_train, y_train)
X_test, y_test = Normalize(X_test, y_test)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(22792, 13) (22792,) (9769, 13) (9769,)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [16]:
clf = GaussianNB()
# clf = MultinomialNB()
# clf = BernoulliNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [17]:
print("Accuracy score:", accuracy_score(y_test, y_pred))

Accuracy score: 0.8158460436073293


In [18]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,capital
0,58,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,0
1,62,Private,109463,Some-college,10,Separated,Sales,Unmarried,White,Female,0,1617,33,United-States,<=50K,-1617
2,33,Private,137088,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,40,Ecuador,<=50K,0
3,24,Private,117767,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,<=50K,0
4,67,Self-emp-not-inc,431426,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,2,United-States,<=50K,0


In [19]:
train_df.describe(include='O')

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
count,21524,22792,22792,21519,22792,22792,22792,22365,22792
unique,8,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,15843,7318,10525,2910,9254,19457,15244,20407,17291


In [20]:
test_df.describe(include='O')

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
count,9201,9769,9769,9199,9769,9769,9769,9613,9769
unique,8,16,7,14,6,5,2,40,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,6853,3183,4451,1230,3939,8359,6546,8763,7429


In [21]:
print(pd.cut(train_df['age'], 5))

0          (46.2, 60.8]
1          (60.8, 75.4]
2          (31.6, 46.2]
3        (16.927, 31.6]
4          (60.8, 75.4]
              ...      
22787    (16.927, 31.6]
22788      (46.2, 60.8]
22789      (46.2, 60.8]
22790      (31.6, 46.2]
22791    (16.927, 31.6]
Name: age, Length: 22792, dtype: category
Categories (5, interval[float64]): [(16.927, 31.6] < (31.6, 46.2] < (46.2, 60.8] < (60.8, 75.4] < (75.4, 90.0]]


In [26]:
train_df.loc[ train_df['age'] <= 32, 'new_age'] = 0
train_df.loc[(train_df['age'] > 16) & (train_df['age'] <= 47), 'new_age'] = 1
train_df.loc[(train_df['age'] > 47) & (train_df['age'] <= 61), 'new_age'] = 2
train_df.loc[(train_df['age'] > 61) & (train_df['age'] <= 76), 'new_age'] = 3
train_df.loc[ train_df['age'] > 76, 'new_age'] = 4

In [27]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,capital,new_age
0,58,Private,290661,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,0,2.0
1,62,Private,109463,Some-college,10,Separated,Sales,Unmarried,White,Female,0,1617,33,United-States,<=50K,-1617,3.0
2,33,Private,137088,Bachelors,13,Married-civ-spouse,Machine-op-inspct,Husband,Other,Male,0,0,40,Ecuador,<=50K,0,1.0
3,24,Private,117767,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,20,United-States,<=50K,0,1.0
4,67,Self-emp-not-inc,431426,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,2,United-States,<=50K,0,3.0
