In [None]:
import warnings

# Silence every Python warning for the rest of the session to keep cell output uncluttered.
# NOTE(review): a blanket "ignore" also hides deprecation/convergence warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Learning goals
After today's lesson you should be able to:
- Implement classification models

In [None]:
# Data handling
import numpy as np
import pandas as pd
import geopandas as gpd  # NOTE(review): not referenced in the cells visible here

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set_theme(style="whitegrid")



# Predicting Income Categories
We are going to use a dataset from the University of California, Irvine's [Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). 

# 0. Data
This dataset was extracted from the 1994 Census. The full dataset contains 48,842 observations; the `adult.data` file loaded below is its 32,561-row training portion. 

Here is the list of attributes: 

- income: >50K, <=50K.
- age: continuous.
- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- fnlwgt: continuous.
- education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- education-num: continuous.
- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- sex: Female, Male.
- capital-gain: continuous.
- capital-loss: continuous.
- hours-per-week: continuous.
- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.



## 0.1 Import the data

In [None]:
# The file is read with header=None, so the column names are supplied here.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
# Label the columns at read time instead of assigning them afterwards.
data = pd.read_csv(url, header=None, names=columns)

In [None]:
# Preview the first rows to check that the column names line up with the values.
data.head()

## 0.2 Clean the data

In [None]:
## Missing values are coded as ' ?' (note the leading space).
## Turn them into NaN, then drop every row that contains at least one NaN.
data = data.replace(' ?', np.nan).dropna()


# 1. Exploratory data analysis

In [None]:
# Column dtypes and non-null counts of the cleaned data.
data.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
data.describe()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(4, 2))
sns.countplot(data=data, y='income', hue='sex', palette='Set2', edgecolor=".6", ax=ax)
# Give the figure a title so it can be understood on its own; the trailing
# semicolon keeps the returned object's text repr out of the cell output.
ax.set_title("Income bracket counts by sex");

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
sns.countplot(data=data, y='occupation', hue='income', palette='Set2', ax=ax)
# Give the figure a title so it can be understood on its own; the trailing
# semicolon keeps the returned object's text repr out of the cell output.
ax.set_title("Income bracket counts by occupation");

Note that we can also make these vertical bar charts, but I think the horizontal ones look better because the category labels are not squeezed together, as they are along the x-axis of a vertical chart when there are many categories. 

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

# Same counts as the horizontal chart, but with the categories on the x-axis.
sns.countplot(data=data, x='occupation', hue='income', palette='Set2', ax=ax)
# Give the figure a title so it can be understood on its own; the trailing
# semicolon keeps the returned object's text repr out of the cell output.
ax.set_title("Income bracket counts by occupation (vertical bars)");

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
# Categories on the y-axis, which leaves room for long occupation labels.
sns.countplot(data=data, y='occupation', hue='income', palette='Set2', ax=ax)
# Give the figure a title so it can be understood on its own; the trailing
# semicolon keeps the returned object's text repr out of the cell output.
ax.set_title("Income bracket counts by occupation (horizontal bars)");

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
sns.countplot(data=data, y='race', hue='income', palette='Set2', ax=ax)
# Give the figure a title so it can be understood on its own; the trailing
# semicolon keeps the returned object's text repr out of the cell output.
ax.set_title("Income bracket counts by race");

In [None]:
# Histograms of the six continuous variables, one panel per column.
numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# The trailing semicolon keeps the array-of-axes repr out of the cell output.
data[numeric_cols].hist(figsize=(12, 8));

# 2. Prep data for training

## 2.1 Encode categorical variables to numbers

In [None]:
## Pull the numeric (continuous) columns out into their own dataframe
numeric_cols = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]
data_num = data[numeric_cols]

In [None]:
# Each categorical column is expanded into one indicator (dummy) column per
# category so that the models receive numeric inputs.
categorical_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]
# One dummy frame per categorical column, prefixed with the source column name.
dummy_df_list = [pd.get_dummies(data[col], prefix=col) for col in categorical_cols]

# Line the dummy frames up side by side.
data_dummy = pd.concat(dummy_df_list, axis=1)

In [None]:
# Inspect the generated dummy columns.
data_dummy

In [None]:
# Put the numeric columns and the dummy columns side by side in one modelling table.
data_new = pd.concat([data_num, data_dummy], axis=1)

In [None]:
# Preview the combined modelling table.
data_new.head()

In [None]:
## Features: every column except the two income indicator columns
income_cols = ['income_ >50K', 'income_ <=50K']
feature_cols = data_new.columns.difference(income_cols)
X = data_new[feature_cols]
## Target: the indicator column for income above 50K
y = data_new['income_ >50K']

## 2.2 Create a training and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore every score computed from this split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 3. Modeling

## 3.1 Model using Naive Bayes

Model using our training data. 

In [None]:
%%time
## %%time is an IPython cell magic: it has to be the first line of the cell
## and reports how long the whole cell takes to run.

from sklearn.naive_bayes import GaussianNB
## Gaussian Naive Bayes classifier with default settings
gnb = GaussianNB()

## Train the model on the training data
gnb.fit(X_train, y_train)

Get the prediction for the test set. 

In [None]:
# Predicted labels for the held-out test rows.
y_pred = gnb.predict(X_test)

In [None]:
# Look at the raw predictions (one value of the 'income_ >50K' indicator per test row).
y_pred

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Alternatively, we can use the score method to get the accuracy of the model

In [None]:
# For a classifier, .score() predicts on X_test and returns the mean accuracy against y_test.
gnb.score(X_test, y_test)

Let's also take a look at the confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes and columns the predicted classes, so for this
# binary target the layout is [[TN, FP], [FN, TP]].
confusion_matrix(y_test, y_pred)

Divide by the total number of test observations to get proportions (multiply by 100 for percentages). 

In [None]:
# Divide the counts by the number of test rows so each cell is a share of the whole test set.
cm = confusion_matrix(y_test, y_pred)/len(y_test)

In [None]:
# Show the normalised confusion matrix.
cm

In [None]:
# cm holds each confusion-matrix cell as a share of all test rows
# (rows = true class, columns = predicted class): [[TN, FP], [FN, TP]].
# Use `.2f` for fixed-point output: a bare precision such as `.2` means two
# significant digits and would render 75.3 as "7.5e+01".
# The diagonal (upper-left TN and lower-right TP) is what adds up to accuracy.
print(f"We have a {cm[0][0]*100:.2f}% true negative rate and a {cm[1][1]*100:.2f}% true positive rate.\n We have a {cm[1][0]*100:.2f}% false negative rate and a {cm[0][1]*100:.2f}% false positive rate. \n Note that our upper-left and lower-right (TN, TP) sum to the accuracy score. " )

## 3.2 Support Vector Machine

In [None]:
%%time

## SVC() is a support vector classifier
## There is also a support vector regressor SVR()
## NOTE(review): the features are passed in unscaled; SVMs are usually sensitive to
## feature scale, so standardising first (e.g. StandardScaler) is worth trying —
## confirm the effect on accuracy and run time.
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
## y_pred is overwritten here, so cells that read it afterwards score the SVM.
y_pred = svm.predict(X_test)

Notice that this took a lot longer to run than the Naive Bayes model! And the model performed slightly worse.

In [None]:
# Accuracy of the predictions currently in y_pred (the SVM's, when cells are run in order).
print("Accuracy:",accuracy_score(y_test, y_pred))

We also have a higher false negative rate. 

In [None]:
# Recompute the confusion matrix as shares of the test set for the predictions currently in y_pred.
cm = confusion_matrix(y_test, y_pred)/len(y_test)

In [None]:
# cm holds each confusion-matrix cell as a share of all test rows
# (rows = true class, columns = predicted class): [[TN, FP], [FN, TP]].
# Use `.2f` for fixed-point output: a bare precision such as `.2` means two
# significant digits and would render 75.3 as "7.5e+01".
print(f"We have a {cm[0][0]*100:.2f}% true negative rate and a {cm[1][1]*100:.2f}% true positive rate.\n We have a {cm[1][0]*100:.2f}% false negative rate and a {cm[0][1]*100:.2f}% false positive rate. " )

## 3.3 Decision Tree

In [None]:
%%time
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [None]:
# Accuracy of the predictions currently in y_pred (the decision tree's, when cells are run in order).
print("Accuracy:",accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix as shares of the test set: rows = true class, columns = predicted class.
confusion_matrix(y_test, y_pred)/len(y_test)

## 3.4 Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Accuracy of the predictions currently in y_pred (the random forest's, when cells are run in order).
print("Accuracy:",accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix as shares of the test set: rows = true class, columns = predicted class.
confusion_matrix(y_test, y_pred)/len(y_test)

Faster than the support vector machine, with better accuracy and fewer false negatives. 

## 3.5 Gradient Boosting Trees

In [None]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()

gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

In [None]:
# Accuracy of the predictions currently in y_pred (the gradient-boosting model's, when cells are run in order).
print("Accuracy:",accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix as shares of the test set: rows = true class, columns = predicted class.
confusion_matrix(y_test, y_pred)/len(y_test)

Even better. This will not always be the case. However, ensemble learning methods very often outperform other types of machine learning models because they combine multiple models to make more accurate predictions. 

# Q.1
- Using this same dataset, try to predict gender (the `sex` column) based on the other categories. Which model performed the best, and by what metrics (time? accuracy? false predictions?) did you determine this?
- Instead of a 67/33 split, try an 80/20 split on your best performing model. Did this improve your model performance?

