In [1]:
import pandas as pd

# Read the adult census CSV (relative path) into a DataFrame.
adult_census = pd.read_csv(filepath_or_buffer="../data/adult-census.csv")

In [2]:
import numpy as np

# Label column name, plus the names of every numeric column other than the label.
target_col = "class"
feature_col = (
    adult_census
    .drop(columns=[target_col])
    .select_dtypes(include=np.number)
    .columns
    .values
)

In [3]:
# Pull the label column out as a Series and display it.
target = adult_census.loc[:, target_col]
target

0         <=50K
1         <=50K
2          >50K
3          >50K
4         <=50K
          ...  
48837     <=50K
48838      >50K
48839     <=50K
48840     <=50K
48841      >50K
Name: class, Length: 48842, dtype: object

In [4]:
# Keep only the selected numeric columns as the feature matrix and display it.
features = adult_census.loc[:, feature_col]
features

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,25,7,0,0,40
1,38,9,0,0,50
2,28,12,0,0,40
3,44,10,7688,0,40
4,18,10,0,0,30
...,...,...,...,...,...
48837,27,12,0,0,38
48838,40,9,0,0,40
48839,58,9,0,0,40
48840,22,9,0,0,20


In [5]:
# Report the dataset dimensions: shape is (number of rows, number of columns).
print(
    "The dataset contains {} samples and {} features".format(*features.shape)
)

The dataset contains 48842 samples and 5 features


In [6]:
# Have scikit-learn estimators display as diagrams when shown in the notebook.
from sklearn import set_config

set_config(display="diagram")

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbors classifier with default settings and train it on
# the complete dataset. `fit` returns the estimator itself, so the call can be
# chained onto the constructor.
model = KNeighborsClassifier().fit(features, target)

# Show the fitted estimator as the cell output.
model

In [8]:
# Predict labels for the same rows the model was fitted on, then display them.
target_predicted = model.predict(X=features)
target_predicted

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [9]:
# Element-wise check: does each of the first five predictions match its true label?
target.head(5) == target_predicted[:5]

0     True
1     True
2    False
3     True
4     True
Name: class, dtype: bool

In [10]:
# Fraction of samples whose prediction equals the true label.
target.eq(target_predicted).mean()

0.8541624012120715

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The split is stratified on the
# target, and the fixed random_state makes it identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, stratify=target, random_state=123
)

In [16]:
# Dimensions of the training split as (rows, columns).
X_train.shape

(36631, 5)

In [18]:
# Dimensions of the test split as (rows, columns).
X_test.shape

(12211, 5)

In [19]:
# 1. define the algorithm
model = KNeighborsClassifier()

# 2. fit the model on the training split only
model.fit(X_train, y_train)

# 3. score our model on the held-out test data
accuracy = model.score(X_test, y_test)

# Format with the `%` presentation type (scales by 100, fixed two decimals)
# rather than `round(accuracy, 4) * 100`: multiplying after rounding
# reintroduces binary floating-point error and can print values such as
# 80.71000000000001%.
print(f'The test accuracy using {model.__class__.__name__} is {accuracy:.2%}')

The test accuracy using KNeighborsClassifier is 82.5%


In [20]:
# 1. import the LogisticRegression class
from sklearn.linear_model import LogisticRegression

# 2. define the algorithm
model = LogisticRegression()

# 3. fit the model on the training split only
model.fit(X_train, y_train)

# 4. score our model on the held-out test data
accuracy = model.score(X_test, y_test)

# Format with the `%` presentation type (scales by 100, fixed two decimals)
# rather than `round(accuracy, 4) * 100`: multiplying after rounding
# reintroduces binary floating-point error, which is what produced the
# 80.71000000000001% output for this cell.
print(f'The test accuracy using {model.__class__.__name__} is {accuracy:.2%}')

The test accuracy using LogisticRegression is 80.71000000000001%
