# Classification Warmup

1. Use `pydataset` to load the `voteincome` dataset.

    ```python
    from pydataset import data

    data('voteincome', show_doc=True)

    data('voteincome')
    ```

1. Drop the `state` and `year` columns.

1. Split the data into train and test datasets. We will be predicting whether or
   not someone votes based on the the remaining features.

1. Fit a k-neighbors classifier on the training data. Use 4 for your number of
   neighbors. How accurate is your model? How does it perform on the test data?

1. Try our these values for k: 1, 2, 3, and 4. Which gives the best accuracy?
   Which gives the best accuracy on the test data set?

1. View the classification report for your best model.

    ```python
    from sklearn.metrics import classifciation_report

    print(classification_report(y, predictions))
    ```

1. Within our problem space, what does accuracy mean? Precision? Recall?

In [34]:
import pandas as pd
import numpy as np
from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
data('voteincome', show_doc=True)

voteincome

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Sample Turnout and Demographic Data from the 2000 Current Population Survey

### Description

This data set contains turnout and demographic data from a sample of
respondents to the 2000 Current Population Survey (CPS). The states
represented are South Carolina and Arkansas. The data represent only a sample
and results from this example should not be used in publication.

### Usage

    data(voteincome)

### Format

A data frame containing 7 variables ("state", "year", "vote", "income",
"education", "age", "female") and 1500 observations.

`state`

a factor variable with levels equal to "AR" (Arkansas) and "SC" (South
Carolina)

`year`

an integer vector

`vote`

an integer vector taking on values "1" (Voted) and "0" (Did Not Vote)

`income`

an integer vector ranging from "4" (Less than \$5000) to "17" (Greater than
\$75000) denoting family income. See the CPS codebook for more info

In [3]:
df = data('voteincome')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 1 to 1500
Data columns (total 7 columns):
state        1500 non-null object
year         1500 non-null int64
vote         1500 non-null int64
income       1500 non-null int64
education    1500 non-null int64
age          1500 non-null int64
female       1500 non-null int64
dtypes: int64(6), object(1)
memory usage: 93.8+ KB


In [5]:
df.describe()

Unnamed: 0,year,vote,income,education,age,female
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,2000.0,0.855333,12.464,2.651333,49.261333,0.559333
std,0.0,0.351882,3.915643,1.021009,17.471134,0.496633
min,2000.0,0.0,4.0,1.0,18.0,0.0
25%,2000.0,1.0,9.0,2.0,36.0,0.0
50%,2000.0,1.0,13.0,3.0,49.0,1.0
75%,2000.0,1.0,16.0,4.0,62.0,1.0
max,2000.0,1.0,17.0,4.0,85.0,1.0


In [6]:
df['y'] = df.vote
baseline = df.y.mean()
baseline

0.8553333333333333

In [7]:
df.drop(columns=['vote', 'state','year'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 1 to 1500
Data columns (total 5 columns):
income       1500 non-null int64
education    1500 non-null int64
age          1500 non-null int64
female       1500 non-null int64
y            1500 non-null int64
dtypes: int64(5)
memory usage: 70.3 KB


In [48]:
X = df.drop(columns='y')
y = df.y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12345)
display(X_train.head())
display(y_train.head())
display(X_test.head())
display(y_test.head())

Unnamed: 0,income,education,age,female
1317,12,3,36,1
1185,11,2,54,1
960,15,3,68,0
1096,14,3,68,1
26,15,4,44,0


1317    0
1185    0
960     1
1096    1
26      1
Name: y, dtype: int64

Unnamed: 0,income,education,age,female
1376,17,3,48,1
902,7,4,52,0
747,12,2,41,0
1246,4,1,70,0
18,14,1,45,0


1376    1
902     1
747     1
1246    1
18      1
Name: y, dtype: int64

In [9]:
def scalem(scaler, train, test, **kwargs):
    # transform train
    scaler.fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index([train.index.values])
    # transform test
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index([test.index.values])
    return train_scaled, test_scaled

def min_max_scaler(train, test, copy=True, feature_range=(0,1), **kwargs):
    # create scaler object and fit to train
    scaler = MinMaxScaler(copy=copy, feature_range=feature_range).fit(train)
    # scale'm
    train_scaled, test_scaled = scalem(scaler=scaler, test=test, train=train)
    return scaler, train_scaled, test_scaled

In [10]:
scaler, X_train_scaled, X_test_scaled = min_max_scaler(X_train, X_test)

In [11]:
classifier4 = KNeighborsClassifier(n_neighbors=4)
classifier4.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [12]:
y_pred4 = classifier4.predict(X_test)

In [13]:
print(confusion_matrix(y_test, y_pred4))
print(classification_report(y_test, y_pred4))

[[  9  35]
 [ 38 218]]
              precision    recall  f1-score   support

           0       0.19      0.20      0.20        44
           1       0.86      0.85      0.86       256

    accuracy                           0.76       300
   macro avg       0.53      0.53      0.53       300
weighted avg       0.76      0.76      0.76       300



In [14]:
classifier1 = KNeighborsClassifier(n_neighbors=1)
classifier1.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [15]:
y_pred1 = classifier1.predict(X_test)
print(confusion_matrix(y_test, y_pred1))
print(classification_report(y_test, y_pred1))

[[ 17  27]
 [ 17 239]]
              precision    recall  f1-score   support

           0       0.50      0.39      0.44        44
           1       0.90      0.93      0.92       256

    accuracy                           0.85       300
   macro avg       0.70      0.66      0.68       300
weighted avg       0.84      0.85      0.85       300



In [16]:
classifier2 = KNeighborsClassifier(n_neighbors=2)
classifier2.fit(X_train, y_train)
y_pred2 = classifier2.predict(X_test)
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

[[ 18  26]
 [ 46 210]]
              precision    recall  f1-score   support

           0       0.28      0.41      0.33        44
           1       0.89      0.82      0.85       256

    accuracy                           0.76       300
   macro avg       0.59      0.61      0.59       300
weighted avg       0.80      0.76      0.78       300



In [17]:
classifier3 = KNeighborsClassifier(n_neighbors=3)
classifier3.fit(X_train, y_train)
y_pred3 = classifier3.predict(X_test)
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

[[  4  40]
 [ 24 232]]
              precision    recall  f1-score   support

           0       0.14      0.09      0.11        44
           1       0.85      0.91      0.88       256

    accuracy                           0.79       300
   macro avg       0.50      0.50      0.49       300
weighted avg       0.75      0.79      0.77       300



In [18]:
classifier5 = KNeighborsClassifier(n_neighbors=5)
classifier5.fit(X_train, y_train)
y_pred5 = classifier5.predict(X_test)
print(confusion_matrix(y_test, y_pred5))
print(classification_report(y_test, y_pred5))

[[  6  38]
 [ 14 242]]
              precision    recall  f1-score   support

           0       0.30      0.14      0.19        44
           1       0.86      0.95      0.90       256

    accuracy                           0.83       300
   macro avg       0.58      0.54      0.55       300
weighted avg       0.78      0.83      0.80       300



In [44]:
classifier9 = KNeighborsClassifier(n_neighbors=9)
classifier9.fit(X_train, y_train)
y_pred9 = classifier9.predict(X_test)
print(confusion_matrix(y_test, y_pred9))
print(classification_report(y_test, y_pred9))

[[  2  42]
 [  8 248]]
              precision    recall  f1-score   support

           0       0.20      0.05      0.07        44
           1       0.86      0.97      0.91       256

    accuracy                           0.83       300
   macro avg       0.53      0.51      0.49       300
weighted avg       0.76      0.83      0.79       300



In [43]:
classifier10 = KNeighborsClassifier(n_neighbors=10)
classifier10.fit(X_train, y_train)
y_pred10 = classifier10.predict(X_test)
print(confusion_matrix(y_test, y_pred10))
print(classification_report(y_test, y_pred10))

[[  4  40]
 [ 11 245]]
              precision    recall  f1-score   support

           0       0.27      0.09      0.14        44
           1       0.86      0.96      0.91       256

    accuracy                           0.83       300
   macro avg       0.56      0.52      0.52       300
weighted avg       0.77      0.83      0.79       300



In [19]:
classifier4s = KNeighborsClassifier(n_neighbors=4)
classifier4s.fit(X_train_scaled, y_train)
y_pred4s = classifier4s.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred4s))
print(classification_report(y_test, y_pred4s))

[[  8  36]
 [ 42 214]]
              precision    recall  f1-score   support

           0       0.16      0.18      0.17        44
           1       0.86      0.84      0.85       256

    accuracy                           0.74       300
   macro avg       0.51      0.51      0.51       300
weighted avg       0.75      0.74      0.75       300



In [20]:
classifier3s = KNeighborsClassifier(n_neighbors=3)
classifier3s.fit(X_train_scaled, y_train)
y_pred3s = classifier4s.predict(X_test_scaled)
print(confusion_matrix(y_test, y_pred3s))
print(classification_report(y_test, y_pred3s))

[[  8  36]
 [ 42 214]]
              precision    recall  f1-score   support

           0       0.16      0.18      0.17        44
           1       0.86      0.84      0.85       256

    accuracy                           0.74       300
   macro avg       0.51      0.51      0.51       300
weighted avg       0.75      0.74      0.75       300



In [39]:
from sklearn import __version__
__version__

'0.21.2'

In [28]:
y_outcomes = pd.DataFrame((y_test))

In [29]:
y_outcomes.head()

Unnamed: 0,y
1376,1
902,1
747,1
1246,1
18,1


In [33]:
y_outcomes['y_pred1'] = y_pred1
y_outcomes['y_pred2'] = y_pred2
y_outcomes['y_pred3'] = y_pred3
y_outcomes['y_pred3s'] = y_pred3s
y_outcomes['y_pred4'] = y_pred4
y_outcomes['y_pred4s'] = y_pred4s
y_outcomes['y_pred5'] = y_pred5


y_outcomes.head()

Unnamed: 0,y,y_pred1,y_pred2,y_pred3,y_pred3s,y_pred4,y_pred4s,y_pred5
1376,1,1,1,1,1,1,1,1
902,1,1,1,1,1,0,1,1
747,1,1,1,1,1,1,1,1
1246,1,1,1,1,1,1,1,1
18,1,1,1,1,1,1,1,1


In [59]:
mean_pred = []
for pred in y_test.index:
    score = y_outcomes.loc[pred, 'y_pred1':'y_pred5'].mean()
    mean_pred.append(score)
    
y_outcomes['y_pred_mean'] = mean_pred
y_outcomes.head(15)

Unnamed: 0,y,y_pred1,y_pred2,y_pred3,y_pred3s,y_pred4,y_pred4s,y_pred5,score,y_pred_mean
1376,1,1,1,1,1,1,1,1,1.0,1.0
902,1,1,1,1,1,0,1,1,0.857143,0.857143
747,1,1,1,1,1,1,1,1,1.0,1.0
1246,1,1,1,1,1,1,1,1,1.0,1.0
18,1,1,1,1,1,1,1,1,1.0,1.0
1306,1,1,1,1,1,1,1,1,1.0,1.0
1438,1,1,0,1,1,1,1,1,0.857143,0.857143
964,1,1,1,1,1,1,1,1,1.0,1.0
1105,1,1,1,1,1,1,1,1,1.0,1.0
163,1,1,1,1,1,1,1,1,1.0,1.0


In [69]:
y_outcomes['y_correct1'] = (y_outcomes.y_pred1 == y_outcomes.y).astype('int64')
y_outcomes['y_correct2'] = (y_outcomes.y_pred2 == y_outcomes.y).astype('int64')
y_outcomes['y_correct3'] = (y_outcomes.y_pred3 == y_outcomes.y).astype('int64')
y_outcomes['y_correct3s'] = (y_outcomes.y_pred3s == y_outcomes.y).astype('int64')
y_outcomes['y_correct4'] = (y_outcomes.y_pred4 == y_outcomes.y).astype('int64')
y_outcomes['y_correct4s'] = (y_outcomes.y_pred4s == y_outcomes.y).astype('int64')
y_outcomes['y_correct5'] = (y_outcomes.y_pred5 == y_outcomes.y).astype('int64')


y_outcomes.head()

Unnamed: 0,y,y_pred1,y_pred2,y_pred3,y_pred3s,y_pred4,y_pred4s,y_pred5,score,y_pred_mean,y_correct1,y_correct2,y_correct3,y_correct3s,y_correct4,y_correct4s,y_correct5,y_correct_mean
1376,1,1,1,1,1,1,1,1,1.0,1.0,1,1,1,1,1,1,1,True
902,1,1,1,1,1,0,1,1,0.857143,0.857143,1,1,1,1,0,1,1,True
747,1,1,1,1,1,1,1,1,1.0,1.0,1,1,1,1,1,1,1,True
1246,1,1,1,1,1,1,1,1,1.0,1.0,1,1,1,1,1,1,1,True
18,1,1,1,1,1,1,1,1,1.0,1.0,1,1,1,1,1,1,1,True


In [75]:
mean_correct = []
for pred in y_test.index:
    score = y_outcomes.loc[pred, 'y_correct1':'y_correct5'].mean()
    mean_correct.append(score)
    
y_outcomes['y_correct_mean'] = mean_correct
y_outcomes.head(15)

Unnamed: 0,y,y_pred1,y_pred2,y_pred3,y_pred3s,y_pred4,y_pred4s,y_pred5,y_pred_mean,y_correct1,y_correct2,y_correct3,y_correct3s,y_correct4,y_correct4s,y_correct5,y_correct_mean
1376,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0
902,1,1,1,1,1,0,1,1,0.857143,1,1,1,1,0,1,1,0.857143
747,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0
1246,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0
18,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0
1306,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0
1438,1,1,0,1,1,1,1,1,0.857143,1,0,1,1,1,1,1,0.857143
964,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0
1105,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0
163,1,1,1,1,1,1,1,1,1.0,1,1,1,1,1,1,1,1.0


In [81]:
y_outcomes_check = y_outcomes.describe().T
y_outcomes_check = y_outcomes_check[['count','mean']]
y_outcomes_check

Unnamed: 0,count,mean
y,300.0,0.853333
y_pred1,300.0,0.886667
y_pred2,300.0,0.786667
y_pred3,300.0,0.906667
y_pred3s,300.0,0.833333
y_pred4,300.0,0.843333
y_pred4s,300.0,0.833333
y_pred5,300.0,0.933333
y_pred_mean,300.0,0.860476
y_correct1,300.0,0.853333
