In [0]:
# import drive 
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Chapter 2: Classification

### 2.1.1. What Happens when the Response is a Dummy?

* *Conditional mean* becomes a *conditional probability*

  * we average 1's and 0's instead

    * eg. mean(c(0,1,1,0,0,0,1,0)) = 0.375

    * `> 0.5` can indicate 1, `< 0.5` can indicate 0

* In **multi-class** settings we would usually predict the class with the **highest probability** to be true

 * we could use **m** or **m-1** dummy variables

* *Or* we may just make use of all the predicted class probabilities ...

* the regression function ***r()*** still applies, but we predict probabilities

* we have regression applications and classification applications, but for both the function applies

### 2.1.2. We may not need a formal class prediction

* for example, even if the probability is less than 0.5, it could tell us info about the data

  * eg. it tells use someone has only a certain probaility of cancer

  * or there is a slight chance of a cell phone customer changing providors
    * could be useful to current providor

* if we have **multiple classes** we may keep *all* of our output variables, as well

  * we might even be interested in the relationship between each

## 2.2. Telco Churn Data

* the marketing term ***churn*** refers to customers moving from one company to another

* the **Telco Churn Data** (from [Kaggle](https://www.kaggle.com/blastchar/telco-customer-churn)) can be used to predict customer behaviour in order to improve retention

  * includes demographic info
  * account info
  * other service sign ups

In [0]:
import pandas as pd
### INSERT YOUR PATH to telco data HERE: ###
my_path = '/content/drive/My Drive/ecs171_yancey/Lecture_Notes/Chapter_2/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# load CSV using pandas library
telco = pd.read_csv(my_path)

# look at the first few rows and show features 
telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


* many appear to be *categorical*:
* note, that columns with **mixed types** or factor are stored with the **object** dtype in pd df

In [0]:
print(len(telco))
# many appear to be factors, eg.:
telco.dtypes

7043


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [0]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


* TotalCharges should be float/numeric 
* errors ='coerce' means invalid entries will become NaN

In [0]:
telco['TotalCharges'] = telco['TotalCharges'].apply(lambda x: pd.to_numeric(x, errors ='coerce'))

* lets print out the rows that were non-numeric first:

In [0]:
is_NaN = telco.isnull()
row_has_NaN = is_NaN.any(axis=1)

rows_with_NaN = telco[row_has_NaN]

print(rows_with_NaN)

      customerID  gender  SeniorCitizen  ... MonthlyCharges TotalCharges  Churn
488   4472-LVYGI  Female              0  ...          52.55          NaN     No
753   3115-CZMZD    Male              0  ...          20.25          NaN     No
936   5709-LVOEQ  Female              0  ...          80.85          NaN     No
1082  4367-NUYAO    Male              0  ...          25.75          NaN     No
1340  1371-DWPAZ  Female              0  ...          56.05          NaN     No
3331  7644-OMVMY    Male              0  ...          19.85          NaN     No
3826  3213-VVOLG    Male              0  ...          25.35          NaN     No
4380  2520-SGTTA  Female              0  ...          20.00          NaN     No
5218  2923-ARZLG    Male              0  ...          19.70          NaN     No
6670  4075-WKNIU  Female              0  ...          73.35          NaN     No
6754  2775-SEFEE    Male              0  ...          61.90          NaN     No

[11 rows x 21 columns]


* looks like the 11 the rows with NA (from the textbook) where coming from **TotalCharges**, but we would not have seen this unless we did **.info()** first to check the datatypes! 

  * eg. pandas did not know it was supposed to be numeric

* be sure to check your data

* remove rows with NA
* show the number removed

* we also remove **customerID** because it wont help predict
  * since there is just an ID for each person

  * eg. if we converted this to factors we would have WAY to many predictors since we only have one row for each person

* **its always important to understand your inputs to a function and what the function is doing with them**

In [0]:
telco = telco.drop(['customerID'],axis=1)

* lets convert everything accept tenure and '**MonthlyCharges**', '**TotalCharges**' from factors to dummies
  * **drop_first = True** gives us **k-1** dummies for **k** factor levels

In [0]:
tc = pd.get_dummies(telco, columns=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'], drop_first=True)

print(list(tc.columns))
print(len(tc.columns))

['tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Male', 'SeniorCitizen_1', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', 'Churn_Yes']
31


* now it has 31 columns!

  * eg. because some were converted to dummies

  * lets assume that columns it chose were correct (because that is what the book has as well)

In [0]:
# remove rows with NA
tc1 = tc.dropna()
print(len(tc)-len(tc1),'rows with NA removed')
len(tc1)

11 rows with NA removed


7032

* remember that we only need **m-1** dummy columns for each column in **X**, where **m** is the number of factor levels

* that is why we use `drop_first=True` in the call to `get_dummies()` above



* Eg. for '**Contract**' we only need 2 factors for this one since:

In [0]:
# note that these were the levels in the original df
my_tab = pd.crosstab(index=telco["Contract"],columns="count") 
my_tab

col_0,count
Contract,Unnamed: 1_level_1
Month-to-month,3875
One year,1473
Two year,1695


* we can see how many categories we have, and how many instances of each with `pd.crosstab`

* now lets look at the levels in the dummy df
* the number of 1's is equal to the number of instances of the '**One year**' factor level

  * it would be the same for whichever other factor was converted to a dummy

  * all of the info we need about the level of the third one comes from the level of the other 2 of course

In [0]:
# note this is from the dummy df tc1
my_tab = pd.crosstab(index=tc1["Contract_One year"],columns="count") 
my_tab

col_0,count
Contract_One year,Unnamed: 1_level_1
0,5560
1,1472


In [0]:
# my function to implement kNN (with allK, leave1out options) from Matloff's R Regtools package:
# note, this is updated to be used for ch1 and 2 of the text, any other features included in
# regtools are not necessary here

def kNN(X,Y,newx,k,regress=True,allK=False,leave1out=False,scaleX=True,scaler='standard'):

  import warnings
  warnings.filterwarnings('ignore')

  import numpy as np

  from sklearn.neighbors import KNeighborsClassifier 
  from sklearn.neighbors import KNeighborsRegressor

  from sklearn.preprocessing import StandardScaler
  #from sklearn.preprocessing import MinMaxScaler
  #from sklearn.preprocessing import RobustScaler

  from statistics import mean 
  from statistics import mode
  from collections import Counter

  def kNNtype(neighbs,regress):
    if regress:
      knn = KNeighborsRegressor(n_neighbors=neighbs)
    else:
      knn = KNeighborsClassifier(n_neighbors=neighbs)
    return knn

  if scaler != 'standard':
    scaler = scaler
  else:
    scaler = StandardScaler()


# Update: for row subsets/test sets in hw; fitting the scaling function should be done separate,
# so the same can be applied to train and test data (or X and newx) 
  if scaleX == True:
    # scale should be fit to X/train
    scaler.fit(X)
    newx = pd.DataFrame(scaler.transform(newx))
    X = pd.DataFrame(scaler.transform(X))


  knn_all = pd.DataFrame()
  if allK == True:
    if leave1out == True:
      nn_all = []
      for j in list(newx.index.values.tolist()):
      #for j in list(Y.index.values.tolist()):
        knn_row = []
        knn = kNNtype(k+1,regress)
        knn.fit(X, Y)
        test = pd.DataFrame(newx.loc[j,:])
        nn = knn.kneighbors(test.T)[1][0]
        for i in range(2,k+1):
          nn1 = nn[1:i] # leave one out
          test = list(Y.iloc[nn1])
          if regress:
            test = mean(test)
          else:
            c = Counter(test)
            l = list(c.values())
            ind = l.index(max(c.values()))
            test = list(c.keys())[ind]
            # count number of times the max class occurs and if there is a tie
            # choose the second class with the max if index is even
            if (l.count(max(l))) > 1 and (j % 2 !=0):
              l[ind] = 0
              ind = l.index(max(c.values()))
              test = list(c.keys())[ind]

          knn_row.append(test)
        knn_row = pd.DataFrame(knn_row)
        knn_all = [knn_all, knn_row]
        knn_all = pd.concat(knn_all,axis=1, ignore_index=True)
        nn_all.append(list(nn1))
      nn_all = np.array(nn_all)
    else:
        for i in range(1,k+1):
          knn = kNNtype(i,regress)
          knn.fit(X, Y)
          test = knn.predict(newx)
          knn_row = pd.DataFrame(test).T
          knn_all = [knn_all, knn_row]
          knn_all = pd.concat(knn_all,axis=0, ignore_index=True)
        nn_all = knn.kneighbors(newx)[1]
  else:
    if leave1out == True:
      knn_row = []
      for j in list(newx.index.values.tolist()):
      #for j in list(Y.index.values.tolist()):
        knn = kNNtype(k,regress)
        knn.fit(X, Y)
        test = pd.DataFrame(newx.loc[j,:])
        nn = knn.kneighbors(test.T)[1][0]
        nn1 = nn[1:len(nn)]

        test = list(Y.iloc[nn1])
        if regress:
            test = mean(test)
        else:
          c = Counter(test)
          l = list(c.values())
          ind = l.index(max(c.values()))
          test = list(c.keys())[ind]
          # count number of times the max class occurs and if there is a tie
          # choose the second class with the max if index is even
          if (l.count(max(l))) > 1 and (j % 2 !=0):
            l[ind] = 0
            ind = l.index(max(c.values()))
            test = list(c.keys())[ind]

        knn_row.append(test)
      knn_all = pd.DataFrame(knn_row).T
      nn_all = nn1
    else:
        knn = kNNtype(k,regress)
        knn.fit(X, Y)
        test = knn.predict(newx)
        knn_all = pd.DataFrame(test)
        nn_all = knn.kneighbors(newx)[1]

  return knn_all, nn_all

* rename to 'stay'

### 2.2.2. Fitting the Model

* split data frame into X and Y

In [0]:
# remove column we want to predict:
tcX = tc1.drop(['Churn_Yes'], axis=1)
print(tcX.head())


# extract column we want to predict:
tcY = tc1.loc[:,'Churn_Yes']
print(tcY.head())

   tenure  ...  PaymentMethod_Mailed check
0       1  ...                           0
1      34  ...                           1
2       2  ...                           1
3      45  ...                           0
4       2  ...                           0

[5 rows x 30 columns]
0    0
1    0
2    1
3    0
4    1
Name: Churn_Yes, dtype: uint8


* lets see our predictions with the 75 nearest neighbors for each of the 7032 rows of the data

* we are using **classification** now, so the predicted class is the most common Y of the k nearest neighbors 

* in **binary classification** we can take either the most commonly predicted class or take the mean (and round to nearest integer)

  * *note:* in my knn above I used python's knn from `sklearn` which can predict multiple classes (without converting to dummies first) so I just chose most common class (instead of taking means of output columns and doing the max prob) when leave1out=True and I have to predict without using `knn.predict`

In [0]:
knnout, nn = kNN(tcX,tcY,tcX,75,regress=False,allK=False,leave1out=True)
knnout

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,6992,6993,6994,6995,6996,6997,6998,6999,7000,7001,7002,7003,7004,7005,7006,7007,7008,7009,7010,7011,7012,7013,7014,7015,7016,7017,7018,7019,7020,7021,7022,7023,7024,7025,7026,7027,7028,7029,7030,7031
0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,...,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0


* this is the estimate of Churn_Yes (i.e. of a customer *not* staying loyal/leaving) for each point in the data

* note that we are predicting **1-p(churn_No)**, so it is opposite from textbook  since we are predicting **Churn_Yes**

### 2.2.3. Sanity Check

* lets do a 'sanity check' first...

* since the mean of a conditional mean equals the unconditional mean, we should have the average of our predicted values should approximately equal that of our actual Y values

In [0]:
tcY.mean() 
my_tab = pd.crosstab(index=telco["Churn"],columns="count") 
print(my_tab)
x=my_tab.values.tolist()/sum(my_tab.values)
print('Percentage of Yes and No:')
x.tolist()

col_0  count
Churn       
No      5174
Yes     1869
Percentage of Yes and No:


[[0.7346301292063041], [0.2653698707936959]]

### 2.2.4. Fitting the Model

* remember that this gave use the prediction for each of the 7032 rows of the data (without NA rows)

* so lets check the row 143 data point like in the text

* so it is the same prediction since we are predicting churn is **yes**

In [0]:
knnout[142]

0    0
Name: 142, dtype: int64

* lets check the actual probability of churn to be yes 

In [0]:
from statistics import mean
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors=75)
knn.fit(tcX, tcY)
# check the nearest neighbors of row 142
test = pd.DataFrame(tcX.loc[142,:])
nn = knn.kneighbors(test.T)[1][0]
nn1 = nn[0:len(nn)]
test = list(tcY.iloc[nn1])
test = mean(test)
test

0.13333333333333333

* another way we can get the probability

* shows probability of a 1 and probability of a 0

In [0]:
probs = knn.predict_proba(test.T)
probs

array([[0.86666667, 0.13333333]])

* so this is about the same as regtools (which predicted 0.8 for churn to be **no**)

* lets check the number correctly predicted

In [0]:
x = (knnout.iloc[0].values == tcY.astype(float).values)
len(x[x == True])/len(tcY)#7032

# the % correct prediction using the R Regtools kNN function 
# was about 0.1% lower 

0.7919510807736063

* the % correctly predicted using the R Regtools kNN function 
was about 0.1% lower 

In [0]:
knnout, nn = kNN(tcX,tcY,tcX,75,regress=False,allK=True,leave1out=True)
knnout

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,6992,6993,6994,6995,6996,6997,6998,6999,7000,7001,7002,7003,7004,7005,7006,7007,7008,7009,7010,7011,7012,7013,7014,7015,7016,7017,7018,7019,7020,7021,7022,7023,7024,7025,7026,7027,7028,7029,7030,7031
0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,...,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1
1,1,0,0,0,1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1
2,1,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,...,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1
3,1,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,...,0,1,0,1,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,1,1
4,0,0,0,0,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,...,0,1,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,...,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0
70,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,...,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0
71,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,...,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0
72,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,...,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0


* we can write a function that gives us the average correctly predicted for a row
* of prediction, and then apply this to each row of knnout

* lets write a function to find the average correctly predicted for each k value on our test set

In [0]:
def oneRow(row, knnout, Y):
  x = (knnout.iloc[row,:].values == Y.astype(float).values)
  return len(x[x == True])/len(tcY)

* since each row of knnout contains the predictions for all the data points the average predicted correctly for each k 1 through 75 is output

In [0]:
# try row 25
oneRow(25,knnout,tcY)

0.7866894197952219

* `apply` `oneRow` and take the `max` to all rows to get the **best k**!

* About 27% of people will Churn (disloyal)
* so we could just guess everyone 0 (Churn_Yes = 0) and get an accuracy of:

In [0]:
1 - tcY.mean()

0.7342150170648465

* so we only get 5-6% improvement from guessing the mean, but whether or not this is significant depends on the data and the application

* skip section 2.2.5. since I already talked about that...

## 2.3 Vertebrae Data

* the UCI Vertebral Column Data has multiple classes to predict

  * can be DH, NO, or SP

In [0]:
import pandas as pd
### INSERT YOUR PATH TO vertebre data HERE: ###
my_path = '/content/drive/My Drive/ecs171_yancey/Lecture_Notes/Week_2/column_3C.dat'

# load CSV using pandas library
vert = pd.read_csv(my_path, sep=' ',header=None)

# note, that data is separated into groups of class (eg. all DH come first)
vert.head()

Unnamed: 0,0,1,2,3,4,5,6
0,63.03,22.55,39.61,40.48,98.67,-0.25,DH
1,39.06,10.06,25.02,29.0,114.41,4.56,DH
2,68.83,22.22,50.09,46.61,105.99,-3.53,DH
3,69.3,24.65,44.31,44.64,101.87,11.21,DH
4,49.71,9.65,28.32,40.06,108.17,7.92,DH


* extract the X features

In [0]:
X = vert.iloc[:,0:6] 

* lets look at the unique classes of the Y column

  * DH: disk hernia
  * NO: normal
  * SL: spondilolysthesis

In [0]:
# original Y classes:
vert.iloc[:,6].unique().tolist()

['DH', 'SL', 'NO']

* the sting label classes can be converted to **numeric** with an `sklearn LabelEncoder ` for multiclass prediction with knn

In [0]:
from sklearn.preprocessing import LabelEncoder  
le = LabelEncoder()
vert.iloc[:,6] = le.fit_transform(vert.iloc[:,6])
Y = vert.iloc[:,6]
print(Y.unique().tolist())

[0, 2, 1]


* these correspond to the labels above, in this order

* choosing tuning parameters:
* since there are only `len(vert)` its likely **k** will be smaller

In [0]:
len(vert)

310

### 2.3.2. Choosing Hyperparameters

* no need for for loop in multiclass case when allK=True

In [0]:
knnout, nn = kNN(X,Y,X,20,regress=False,allK=True,leave1out=True)
knnout

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309
0,0,0,0,0,0,1,1,0,0,2,0,1,1,1,1,0,1,1,0,0,1,1,1,0,1,0,0,0,1,2,1,0,1,1,0,0,0,1,0,0,...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,2,1,1,1,0,1,2,1,2,1,1,1,1,1,0,1,1
1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,0,1,2,1,0,0,0,0,1,0,1,...,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,2,1,1,1,0,1,0,1,0,1,0
2,0,0,0,0,0,1,1,0,1,0,1,1,1,0,1,0,1,1,0,0,1,1,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,1,0,0,...,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,2,1,2,0,1,1,0,0,0,1,1
3,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,1,1,1,0,0,1,1,1,1,0,1,0,1,1,0,1,0,1,0,0,0,1,1,0,1,...,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,2,1,2,0,1,1,0,0,1,1,1
4,0,0,0,0,0,1,1,0,0,2,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,...,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,1,1,0,1,2,1,2,0,1,1,0,0,1,1,1
5,0,0,0,0,0,1,1,1,0,2,0,0,1,0,0,1,0,1,0,0,1,1,0,1,0,0,0,1,0,0,1,2,0,0,0,0,1,1,0,1,...,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,2,1,2,1,1,1,0,0,1,1,1
6,0,0,0,0,0,1,1,1,1,2,0,0,1,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,...,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,0,0,1,2,1,2,0,1,1,0,0,1,1,1
7,0,0,0,0,0,1,1,1,1,2,0,0,1,0,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,1,1,2,0,1,0,0,1,1,0,0,...,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,2,1,2,0,1,1,0,1,1,1,1
8,0,0,0,0,0,1,1,1,1,2,0,0,0,0,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,1,1,2,0,1,0,0,1,1,0,0,...,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,0,1,1,1,1,0,0,1,2,1,2,0,1,1,0,0,1,1,1
9,0,0,0,0,0,1,1,1,1,2,0,0,0,0,0,1,1,1,0,0,1,0,0,1,0,0,0,0,0,1,1,2,0,1,0,0,1,1,0,0,...,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,1,1,0,1,2,1,2,0,1,1,0,0,1,1,1


In [0]:
import numpy as np
x = knnout.values == Y.astype(float).values
probs = x.sum(axis=1)/len(Y)

y = max(probs).tolist()
test = probs==y

np.where(test == True)
probs

array([0.80645161, 0.79032258, 0.77096774, 0.7516129 , 0.79032258,
       0.78387097, 0.78709677, 0.78387097, 0.78387097, 0.79677419,
       0.80322581, 0.79677419, 0.78387097, 0.78709677, 0.78064516,
       0.78387097, 0.78064516, 0.79032258, 0.78709677])

* i guess best k really is 2
* values were also pretty similar in R though

* how can we output the probabilities of each class on our test set?

  * `predict_proba`

In [0]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, Y)
test = pd.DataFrame(X.iloc[0:11,:])
knn.predict_proba(test)

array([[0.5, 0.5, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.5, 0.5, 0. ],
       [1. , 0. , 0. ],
       [0.5, 0.5, 0. ],
       [0.5, 0.5, 0. ],
       [1. , 0. , 0. ],
       [0.5, 0.5, 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ]])

## 2.4. Better than Random Chance?

* if we just guess the max occurences of Y
* we would guess 2 (SL)

In [0]:
from collections import Counter
Counter(Y)

Counter({0: 60, 1: 100, 2: 150})

* disregarding feature information on this data, we could just quickly guess the class with the max occurences

* **SL is 2** so we could just guess that everyone is that and get an accuracy of:

In [0]:
150/310

0.4838709677419355

* - and we would have 52% error (1-0.48)
* - this is much worse than 20% achieved w/ kNN because we have multiple classes

## 2.5. Confusion Matrix

* a confusion matrix is a good way to look at the accuracy of predictions in multiclass problems

* vals down the diagonal from top left are correct preds ect.

In [0]:
from sklearn.metrics import confusion_matrix
knnout, nn = kNN(X,Y,X,11,regress=False,leave1out=True)
confusion_matrix(knnout.values[0].tolist(), Y.tolist())

array([[ 38,  13,   4],
       [ 19,  77,  14],
       [  3,  10, 132]])

## 2.6. Unbalanced Data

* Telco Churn is an example of ***unbalanced data*** (because on 27% churned)

* Although many sources recommend *evenly balancing* each class to improve prediction accuracy (eg. "**resampling**"), this actually *weakens* our ML prediction

  * This is just *one* example of many *bad sources* that can be found online

  * eg. in the **Missed Appointments Data**, where we want to predict whether or not a patient will not show up to their doctor’s appt 
  * This would wrongly produce a near even number from each class predicted

  * weakens ability to predict new cases

* If we wanted to be able to predict more of the cases where people have a probability of not showing up we could flag those with greater than certain probability (eg. 20%) of showing up

### 2.6.1. Missed Appts Data

* in this data set we predict whether a patient will show up to a doctor appt


In [0]:
import pandas as pd
my_path = '/content/drive/My Drive/ecs171_yancey/Lecture_Notes/Week_2/KaggleV2-May-2016.csv'

# load CSV using pandas library
ma = pd.read_csv(my_path, sep=',')

# note, that data is separated into groups of class (eg. all DH come first)
ma.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [0]:
my_tab = pd.crosstab(index=ma["No-show"],columns="count") 
my_tab

col_0,count
No-show,Unnamed: 1_level_1
No,51613
Yes,13921


In [0]:
51613/(51613+13921)

0.7875759147923216

* about 21% do not show since Yes means no-show

* so we could still get 79% correct by just guessing everyone does show

* but since we have more information about specific patients,
* lets see if we can predict better

* I will use all columns used in textbook, but I am trying it without **neighborhood**

  * this significantly speeds up runtime without loss in accuracy

* convert to dummies

In [0]:
cols=[2,5]
cols.extend(range(7,14))

ma1 = ma.iloc[:,cols]
ma1.columns
ma2 = pd.get_dummies(ma1, drop_first=True)
ma2.columns

Index(['Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism',
       'Handcap', 'SMS_received', 'Gender_M', 'No-show_Yes'],
      dtype='object')

In [0]:
ma2.head()

Unnamed: 0,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Gender_M,No-show_Yes
0,62,0,1,0,0,0,0,0,0
1,56,0,0,0,0,0,0,1,0
2,62,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,0,0
4,56,0,1,1,0,0,0,0,0


In [0]:
ma2.dtypes

Age             int64
Scholarship     int64
Hipertension    int64
Diabetes        int64
Alcoholism      int64
Handcap         int64
SMS_received    int64
Gender_M        uint8
No-show_Yes     uint8
dtype: object

* notice, that the columns that were already in dummy (0/1) form when we loaded the dataset did not require creating a new column name (such as with the `Gender_M` column) when we called `get_dummies()`

* note that the SheduleDay column may be helpful if many dont show up mondays for example

* lets try the `sklearn` train/test split functions!

In [0]:
# get a test set
from sklearn.model_selection import train_test_split

train, test = train_test_split(ma2, test_size=0.2)

In [0]:
len(test.columns)

9

In [0]:
# we would need 88 columns if we included neighborhood
#X = train.iloc[:,0:88]
#Y = train.iloc[:,87]
#test = test.iloc[:,0:88]
#testY = test.iloc[:,87]

In [0]:
X = train.iloc[:,0:9]
Y = train.iloc[:,8]
test = test.iloc[:,0:9]
testY = test.iloc[:,8]

In [0]:
knnout, nn = kNN(X,Y,test,20,regress=False)

knnout

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
...,...
13102,0
13103,0
13104,0
13105,0


In [0]:
import numpy as np
testY=testY.reset_index(drop=True)
x =abs(knnout.values.T[0] - testY.values.T)
x=np.where(x != 0)
i = x[0]

* only 10 incorrectly predicted out of all 13107 

In [0]:
print(len(i), 'guess incorrectly out of',len(testY),'!')
print('accuracy:{0:.2f}%'.format((1-13/13107)*100))

13 guess incorrectly out of 13107 !
accuracy:99.90%


* lets print the actual vs predicted values and overall means 

In [0]:
# actual values 
print('True Values')
print(testY[i])
print(testY.mean())

# predicted values
print('Predicted Values')
print(knnout.iloc[i])
knnout.mean()

1839     1
4232     1
4256     1
7475     1
7505     1
8346     1
8976     1
9085     1
11294    1
12428    1
Name: No-show_Yes, dtype: uint8
0.21477073319600212
       0
1839   0
4232   0
4256   0
7475   0
7505   0
8346   0
8976   0
9085   0
11294  0
12428  0


0    0.214008
dtype: float64

* *interesting*: only slightly off (in percentage of 1's), but all 0's predicted which were actually 1

* this is much better than the 79% we would have got if we guessed *all no-show = yes*



* if we wanted to flag the values that had greater than eg. 20% probability of no-show we could use `kNN` with `regress = True `(because it takes the *average* of the neighbors) 

* then we could see how many cases there are

In [0]:
knnout, nn = kNN(X,Y,test,20,regress=True,leave1out=True)

knnout