# Income Dataset

## Prep data

In [None]:
import pandas
# This library contains many datasets and lets you import them directly in Python.
# This specific dataset isn't available in CSV format, so this is the best way to get the data.
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import random

In [None]:
# Fetch dataset id 2 through ucimlrepo and take private copies of the feature
# and target frames, so the edits below never touch the fetched object.
fetchedData = fetch_ucirepo(id=2)
X = fetchedData.data.features.copy()
Y = fetchedData.data.targets.copy()

# Some income labels carry a trailing period; fold both spellings of each
# label onto the period-free form.
incomeLabels = {
    '<=50K': '<=50K',
    '<=50K.': '<=50K',
    '>50K': '>50K',
    '>50K.': '>50K',
}
Y['income'] = Y['income'].map(incomeLabels)

# Hyphenated column names cannot be read as attributes, so give the two
# columns used later identifier-friendly names (X.educationNum, X.hoursPerWeek).
columnAliases = {
    'education-num': 'educationNum',
    'hours-per-week': 'hoursPerWeek',
}
X = X.rename(columns=columnAliases)

We'll look at the distribution of income categories before further preparing the data.

In [None]:
# Pie chart of how many rows fall into each income category.
incomeCounts = Y['income'].value_counts()
plot = incomeCounts.plot.pie()

In [None]:
# Encode the income label as a binary target: 0 for '<=50K', 1 for '>50K'.
# Series.map turns every value missing from the dict into NaN, so the already
# encoded values are mapped onto themselves as well; re-running this cell then
# leaves the column unchanged instead of replacing it with NaN.
Y['income'] = Y['income'].map({'<=50K': 0, '>50K': 1, 0: 0, 1: 1})

# Combined features + target frame so both can be accessed at once.
# NOTE: `all` shadows the builtin all(); the name is kept because later cells
# read `all.age` and `all.educationNum`.
all = pandas.concat([X, Y], axis=1)

We're going to look at a couple more visualizations of our most interesting features before we standardize them.

In [None]:
# Histogram of the age column of the combined frame, split into 30 bins;
# rwidth < 1 leaves a small gap between neighbouring bars.
plt.hist(all['age'], bins=30, rwidth=0.95)
plt.ylabel('Quantity')
plt.xlabel('Age')
plt.show()

In [None]:
# Histogram of the numeric education level, split into 16 bins.
# Reference points on the education scale:
#    9: HS grad
#   10: some college
#   13: bachelor's degree
plt.hist(all['educationNum'], bins=16, rwidth=0.95)
plt.ylabel('Quantity')
plt.xlabel('Education Level')
plt.show()

In [None]:
# Standardize the numeric inputs used by the models: subtract the column mean
# and divide by the column standard deviation (z-score), one column at a time.
for column in ('age', 'educationNum', 'hoursPerWeek'):
    values = X[column]
    X[column] = (values - values.mean()) / values.std()

## Logistic Regression

We tried several combinations of the variables we thought would be the most predictive.

In [None]:
# Logistic regression on age + education level.
# The model is scored on the same rows it was fitted on.
Xmodel = X[['age', 'educationNum']]
Ymodel = Y['income']
lm = LogisticRegression().fit(Xmodel, Ymodel)

y_pred = lm.predict(Xmodel)
display(y_pred)
display('predicted range: [{0:.2f},{1:.2f}]'.format(y_pred.min(), y_pred.max()))

# Each metric array holds one entry per label, in the order given by `labels`.
p, r, f, s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0, 1])
for template, scores in (('precision = {}', p), ('recall = {}', r), ('f-score = {}', f)):
    display(template.format(scores))

In [None]:
# Logistic regression on age + hours worked per week.
# The model is scored on the same rows it was fitted on.
Xmodel = X[['age', 'hoursPerWeek']]
Ymodel = Y['income']
lm = LogisticRegression().fit(Xmodel, Ymodel)

y_pred = lm.predict(Xmodel)
display(y_pred)
display('predicted range: [{0:.2f},{1:.2f}]'.format(y_pred.min(), y_pred.max()))

# Each metric array holds one entry per label, in the order given by `labels`.
p, r, f, s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0, 1])
for template, scores in (('precision = {}', p), ('recall = {}', r), ('f-score = {}', f)):
    display(template.format(scores))

In [None]:
# Logistic regression on education level + hours worked per week.
# The model is scored on the same rows it was fitted on.
Xmodel = X[['educationNum', 'hoursPerWeek']]
Ymodel = Y['income']
lm = LogisticRegression().fit(Xmodel, Ymodel)

y_pred = lm.predict(Xmodel)
display(y_pred)
display('predicted range: [{0:.2f},{1:.2f}]'.format(y_pred.min(), y_pred.max()))

# Each metric array holds one entry per label, in the order given by `labels`.
p, r, f, s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0, 1])
for template, scores in (('precision = {}', p), ('recall = {}', r), ('f-score = {}', f)):
    display(template.format(scores))

In [None]:
# Logistic regression on all three standardized inputs: age, education level
# and hours worked per week. The model is scored on the rows it was fitted on.
Xmodel = X[['age', 'educationNum', 'hoursPerWeek']]
Ymodel = Y['income']
lm = LogisticRegression().fit(Xmodel, Ymodel)

y_pred = lm.predict(Xmodel)
display(y_pred)
display('predicted range: [{0:.2f},{1:.2f}]'.format(y_pred.min(), y_pred.max()))

# Each metric array holds one entry per label, in the order given by `labels`.
p, r, f, s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0, 1])
for template, scores in (('precision = {}', p), ('recall = {}', r), ('f-score = {}', f)):
    display(template.format(scores))

In [None]:
# One colour per row of the target: blue where the label is 0, red otherwise.
# The list is reused by the scatter plots in the following cells.
color = ['r' if label != 0 else 'b' for label in Ymodel]

# Hours per week (x) against education level (y), coloured by income label.
plt.scatter(Xmodel['hoursPerWeek'], Xmodel['educationNum'], c=color, s=2)
plt.ylabel('Education')
plt.xlabel('Hours per Week')

In [None]:
# Scatter of hours per week (x) against education level (y), with the fitted
# logistic-regression decision boundary drawn on top.
plt.scatter(Xmodel.hoursPerWeek, Xmodel.educationNum, c=color, s=8)

b0 = lm.intercept_[0]
b = lm.coef_[0]

# coef_ follows the column order the model was fitted on, so positional
# indexing is only right for one particular feature list (with
# ['age', 'educationNum', 'hoursPerWeek'], b[0] is the *age* weight, not the
# hours-per-week weight). Look the weights up by feature name instead.
coef = dict(zip(lm.feature_names_in_, b))

# The boundary is where the linear score is zero:
#     b0 + w_hours * hours + w_edu * education (+ other terms) = 0
# Any feature that is not on one of the two plot axes is held at 0 here, so
# the line is the slice of the boundary through that value.
Xdb = [-3.5, 3.5]
ydb = [-(b0 + coef['hoursPerWeek'] * x) / coef['educationNum'] for x in Xdb]

plt.plot(Xdb, ydb)
plt.xlabel('Hours per Week')
plt.ylabel('Education')

In [None]:
# Age (x) against hours per week (y), reusing the per-row `color` list.
plt.scatter(Xmodel['age'], Xmodel['hoursPerWeek'], c=color, s=2)
plt.xlabel('Age')
plt.ylabel('Hours per Week')

## Support Vector Machines

In [None]:
# Inputs for the SVM cells below: two features (age, education level) and the
# binary income target.
Ymodel = Y['income']
Xmodel = X[['age', 'educationNum']]


### Linear Model

In [None]:
# Linear-kernel SVM; class 1 errors are weighted twice as heavily as class 0.
clf = svm.SVC(kernel='linear', class_weight={0:1,1:2})

# Fit and predict on the same plain array. Fitting on `.values` and then
# predicting on the DataFrame makes scikit-learn warn that X has feature names
# the model was fitted without.
features = Xmodel.values
clf.fit(features, Ymodel)

# Scored on the same rows the model was fitted on.
y_pred = clf.predict(features)

# Each metric array holds one entry per label, in the order given by `labels`.
p,r,f,s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0,1])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))
display('support = {}'.format(s))

In [None]:
# Visualise the classifier's decision regions by labelling 5000 random points.
# x is drawn from [-2, 2] and y from [-2.5, 2.5]; each (x, y) pair is passed
# to the model as one two-feature sample.
# Each tuple draws x before y, so the sequence of random numbers is the same
# as drawing them one point at a time.
points = [(random.uniform(-2, 2), random.uniform(-2.5, 2.5)) for _ in range(5000)]
newx = [point[0] for point in points]
newy = [point[1] for point in points]

# One batched predict call instead of 5000 single-sample calls; the predicted
# labels are the same, and each entry is now a scalar label.
newlabel = clf.predict(points)

# Predicted class 0 is drawn red and everything else blue.
# NOTE(review): this is the reverse of the colours used in the logistic
# regression scatter plots (blue for 0) - confirm that is intended.
color = ['r' if y_ == 0 else 'b' for y_ in newlabel]
plt.scatter(newx, newy, c=color, marker='o', s=7)

### Polynomial Model

In [None]:
# Degree-2 polynomial-kernel SVM; class 1 errors are weighted three times as
# heavily as class 0.
clf = svm.SVC(kernel='poly', class_weight={0:1,1:3}, degree=2)

# Fit and predict on the same plain array. Fitting on `.values` and then
# predicting on the DataFrame makes scikit-learn warn that X has feature names
# the model was fitted without.
features = Xmodel.values
clf.fit(features, Ymodel)

# Scored on the same rows the model was fitted on.
y_pred = clf.predict(features)

# Each metric array holds one entry per label, in the order given by `labels`.
p,r,f,s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0,1])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))
display('support = {}'.format(s))

In [None]:
# Visualise the classifier's decision regions by labelling 5000 random points.
# x is drawn from [-2, 2] and y from [-2.5, 2.5]; each (x, y) pair is passed
# to the model as one two-feature sample.
# Each tuple draws x before y, so the sequence of random numbers is the same
# as drawing them one point at a time.
points = [(random.uniform(-2, 2), random.uniform(-2.5, 2.5)) for _ in range(5000)]
newx = [point[0] for point in points]
newy = [point[1] for point in points]

# One batched predict call instead of 5000 single-sample calls; the predicted
# labels are the same, and each entry is now a scalar label.
newlabel = clf.predict(points)

# Predicted class 0 is drawn red and everything else blue.
# NOTE(review): this is the reverse of the colours used in the logistic
# regression scatter plots (blue for 0) - confirm that is intended.
color = ['r' if y_ == 0 else 'b' for y_ in newlabel]
plt.scatter(newx, newy, c=color, marker='o', s=7)

In [None]:
# Degree-3 polynomial-kernel SVM; class 1 errors are weighted twice as heavily
# as class 0.
clf = svm.SVC(kernel='poly', class_weight={0:1,1:2}, degree=3)

# Fit and predict on the same plain array. Fitting on `.values` and then
# predicting on the DataFrame makes scikit-learn warn that X has feature names
# the model was fitted without.
features = Xmodel.values
clf.fit(features, Ymodel)

# Scored on the same rows the model was fitted on.
y_pred = clf.predict(features)

# Each metric array holds one entry per label, in the order given by `labels`.
p,r,f,s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0,1])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))
display('support = {}'.format(s))

In [None]:
# Visualise the classifier's decision regions by labelling 5000 random points.
# x is drawn from [-2, 2] and y from [-2.5, 2.5]; each (x, y) pair is passed
# to the model as one two-feature sample.
# Each tuple draws x before y, so the sequence of random numbers is the same
# as drawing them one point at a time.
points = [(random.uniform(-2, 2), random.uniform(-2.5, 2.5)) for _ in range(5000)]
newx = [point[0] for point in points]
newy = [point[1] for point in points]

# One batched predict call instead of 5000 single-sample calls; the predicted
# labels are the same, and each entry is now a scalar label.
newlabel = clf.predict(points)

# Predicted class 0 is drawn red and everything else blue.
# NOTE(review): this is the reverse of the colours used in the logistic
# regression scatter plots (blue for 0) - confirm that is intended.
color = ['r' if y_ == 0 else 'b' for y_ in newlabel]
plt.scatter(newx, newy, c=color, marker='o', s=7)

### RBF Model

In [None]:
# RBF-kernel SVM; class 1 errors are weighted twice as heavily as class 0.
clf = svm.SVC(kernel='rbf', class_weight={0:1,1:2})

# Fit and predict on the same plain array. Fitting on `.values` and then
# predicting on the DataFrame makes scikit-learn warn that X has feature names
# the model was fitted without.
features = Xmodel.values
clf.fit(features, Ymodel)

# Scored on the same rows the model was fitted on.
y_pred = clf.predict(features)

# Each metric array holds one entry per label, in the order given by `labels`.
p,r,f,s = precision_recall_fscore_support(Ymodel, y_pred, labels=[0,1])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))
display('support = {}'.format(s))

In [None]:
# Visualise the classifier's decision regions by labelling 5000 random points.
# x is drawn from [-2, 2] and y from [-2.5, 2.5]; each (x, y) pair is passed
# to the model as one two-feature sample.
# Each tuple draws x before y, so the sequence of random numbers is the same
# as drawing them one point at a time.
points = [(random.uniform(-2, 2), random.uniform(-2.5, 2.5)) for _ in range(5000)]
newx = [point[0] for point in points]
newy = [point[1] for point in points]

# One batched predict call instead of 5000 single-sample calls; the predicted
# labels are the same, and each entry is now a scalar label.
newlabel = clf.predict(points)

# Predicted class 0 is drawn red and everything else blue.
# NOTE(review): this is the reverse of the colours used in the logistic
# regression scatter plots (blue for 0) - confirm that is intended.
color = ['r' if y_ == 0 else 'b' for y_ in newlabel]
plt.scatter(newx, newy, c=color, marker='o', s=7)

## Generalizability

In [None]:
# Hold out 20% of the rows (fixed seed so the split is reproducible), refit
# the current `clf` on the remaining 80% and score it on the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    Xmodel,
    Ymodel,
    test_size=0.2,
    random_state=42,
)

y_pred = clf.fit(X_train, y_train).predict(X_test)

# Each metric array holds one entry per label, in the order given by `labels`.
p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])
summary = 'Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p, r, f, s)
display(summary)