In [1]:
import pandas as pd
import numpy as np 
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import metrics

In [2]:
data = pd.read_csv('adult.data.txt', header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
dtype: int64

In [5]:
# data[1] == " Private"
data[6].value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: 6, dtype: int64

In [6]:
def preprocess(df):
    new_df = pd.DataFrame()
    new_df['age'] = df[0]
    new_df['fnlwgt'] = df[2]
    in_us = {' United-States': 0}
    new_df['inside_us'] = pd.Series(1 if s == ' United-States' else 0 for s in df[13])
    new_df['outside_us'] = pd.Series(0 if s == ' United-States' else 1 for s in df[13])
    new_df['is_white'] = pd.Series(1 if s == ' White' else 0 for s in df[8])
    new_df['is_not_white'] = pd.Series(0 if s == ' White' else 1 for s in df[8])
    new_df = pd.concat([new_df, pd.get_dummies(data[9])], axis=1)
    new_df = pd.concat([new_df, pd.get_dummies(data[7])], axis=1)
    
    target = pd.Series(1 if s == ' <=50K' else 0 for s in df[14])

    return new_df, target
    # as features 10, 11 foram excluidas por terem muitos valores repetidos

In [7]:
kf = KFold(n_splits=5)


In [8]:
X, y = preprocess(data)

In [11]:
accuracy_LR = []
accuracy_knn = []
accuracy_nb = []
accuracy_svm = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler.fit(X_train)
    
    X_train_std = min_max_scaler.transform(X_train)
    X_test_std = min_max_scaler.transform(X_test)

    logreg = linear_model.LogisticRegression()
    logreg.fit(X_train_std, y_train)
    y_pred_logreg = logreg.predict(X_test)
    accuracy_LR.append(metrics.accuracy_score(y_test, y_pred_logreg))
    
    knn = KNeighborsClassifier()
    knn.fit(X_train_std, y_train)
    y_pred_knn = knn.predict(X_test)
    accuracy_knn.append(metrics.accuracy_score(y_test, y_pred_knn))
    
    nb = GaussianNB()
    nb.fit(X_train_std, y_train)
    y_pred_nb = nb.predict(X_test)
    accuracy_nb.append(metrics.accuracy_score(y_test, y_pred_nb))
    
    svm = SVC()
    svm.fit(X_train_std, y_train)
    y_pred_svm = svm.predict(X_test_std)
    accuracy_svm.append(metrics.accuracy_score(y_test, y_pred_svm))

print(np.mean(accuracy_LR))
print(np.mean(accuracy_knn))
print(np.mean(accuracy_nb))
print(np.mean(accuracy_svm))

0.24080954043528893
0.7591904595647111
0.7591904595647111
0.7585762089504604


In [13]:
knn = KNeighborsClassifier()
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')