# Classification

Prediction task is to determine whether a person makes over 50K a year.

In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Read and prepare the data

In [3]:
df = pd.read_csv('../src/data/census.csv')

In [4]:
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:

df.isna().sum()

age               0
workclass         0
final-weight      0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loos      0
hour-per-week     0
native-country    0
income            0
dtype: int64

In [6]:
# Full Sample target variable and independent variables
X = df.drop(['income'], axis = 1)
y = df['income']

In [7]:
cat_vars = ['workclass',
            'education',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'sex',
            'native-country'
]

In [8]:
# Full Sample target variable and independent variables
X = df.drop(['income'], axis = 1)
y = df['income']

In [9]:
# transform target 
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [10]:
# transform input 
X.shape

(32561, 14)

In [11]:
X = pd.get_dummies(X, columns = cat_vars , drop_first=False)

In [12]:
X.shape

(32561, 108)

In [13]:
#dividir em base de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 4)

In [14]:
## validation
def val_metrics(y_ori, y_pred):
    print('Accuracy :', accuracy_score(y_ori, y_pred))
    print('Matrix :', confusion_matrix(y_ori, y_pred))
    

# 1) Naive Bayes

## 1.1) GaussianNB

In [15]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [16]:
val_metrics(y_train, y_pred)

Accuracy : 0.7947964197964198
Matrix : [[16416   903]
 [ 3774  1699]]


In [17]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.7942471081994062
Matrix : [[7013  388]
 [1622  746]]


## 1.2) MultinomialNB

In [18]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [19]:
val_metrics(y_train, y_pred)

Accuracy : 0.7830379080379081
Matrix : [[16563   756]
 [ 4189  1284]]


In [20]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.781553894973897
Matrix : [[7053  348]
 [1786  582]]


## 1.3) BernoulliNB

In [21]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [22]:
val_metrics(y_train, y_pred)

Accuracy : 0.7598280098280098
Matrix : [[12982  4337]
 [ 1137  4336]]


In [23]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.7556556454089467
Matrix : [[5520 1881]
 [ 506 1862]]


# 2) Regression logistic

In [24]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=16)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [25]:
val_metrics(y_train, y_pred)

Accuracy : 0.7975166725166725
Matrix : [[16735   584]
 [ 4031  1442]]


In [26]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.7970109530146381
Matrix : [[7153  248]
 [1735  633]]


# 3) Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [28]:
val_metrics(y_train, y_pred)

Accuracy : 1.0
Matrix : [[17319     0]
 [    0  5473]]


In [29]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.8148223973794656
Matrix : [[6483  918]
 [ 891 1477]]


# 4) Random Forests

In [30]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [31]:
val_metrics(y_train, y_pred)

Accuracy : 0.9993418743418744
Matrix : [[17318     1]
 [   14  5459]]


In [32]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.847169618179957
Matrix : [[6821  580]
 [ 913 1455]]


# 5) KNN

In [33]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [34]:
val_metrics(y_train, y_pred)

Accuracy : 0.8346788346788346
Matrix : [[16570   749]
 [ 3019  2454]]


In [35]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.7776640393080152
Matrix : [[6857  544]
 [1628  740]]


# 6) Redes neurais

In [36]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(verbose = True, max_iter=1000, tol=0.000010)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

Iteration 1, loss = 11.98901234
Iteration 2, loss = 10.32077895
Iteration 3, loss = 10.57321528
Iteration 4, loss = 10.27841364
Iteration 5, loss = 10.85215688
Iteration 6, loss = 10.78782365
Iteration 7, loss = 10.26434083
Iteration 8, loss = 9.53426067
Iteration 9, loss = 10.38993681
Iteration 10, loss = 10.21418046
Iteration 11, loss = 10.49622028
Iteration 12, loss = 10.29244879
Iteration 13, loss = 10.60205569
Iteration 14, loss = 10.13818465
Iteration 15, loss = 9.74109460
Iteration 16, loss = 9.86660823
Iteration 17, loss = 9.71736898
Iteration 18, loss = 10.84355074
Iteration 19, loss = 10.38686715
Training loss did not improve more than tol=0.000010 for 10 consecutive epochs. Stopping.


In [37]:
val_metrics(y_train, y_pred)

Accuracy : 0.7732537732537732
Matrix : [[15402  1917]
 [ 3251  2222]]


In [38]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.768655952502815
Matrix : [[6552  849]
 [1411  957]]


# 7) SVM

In [38]:
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [None]:
val_metrics(y_train, y_pred)

Accuracy : 0.9993418743418744
Matrix : [[17318     1]
 [   14  5459]]


In [39]:
#Test 
y_pred_test = model.predict(X_test)
val_metrics(y_test, y_pred_test)

Accuracy : 0.768655952502815
Matrix : [[6552  849]
 [1411  957]]
