

# KNN 

We are using a different version of the Melbourne housing data set, to predict the housing type as one of three possible categories:

  - 'h' house
  - 'u' duplex
  - 't' townhouse


In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import calendar
import datetime
%matplotlib inline

# Starting off loading a training set
df_melb = pd.read_csv('melb_data_train.csv')

## Fix a column of data to be numeric


In [2]:
df_melb['Date']

0          7/5/16
1      28/08/2016
2          8/7/17
3      24/06/2017
4      19/11/2016
          ...    
495    29/07/2017
496    16/04/2016
497        8/4/17
498        7/5/16
499      10/12/16
Name: Date, Length: 500, dtype: object

In [3]:
# normalize date accepts the date string as shown in the df_melb 'Date' column,
# and returns a data in a standarized format
def standardize_date(d):
    # Your code here
    try: 
        d = datetime.datetime.strptime(d, "%d/%m/%y").strftime("%Y %m %d")
    except:
        pass
    try: 
        d = datetime.datetime.strptime(d, "%d/%m/%Y").strftime("%Y %m %d")
    except: 
       pass
    return d

def get_time(d):
    d = time.strptime(d, "%Y %m %d")
    return calendar.timegm(d)

#standardize_date("21/12/2008")

In [4]:
df_melb['Date'] = df_melb['Date'].apply( lambda x : standardize_date(x)) 
df_melb['unixtime'] = df_melb['Date'].apply( lambda x: get_time(x))
df_melb = df_melb.drop(columns="Date")

print("The min unixtime is {:d} and the max unixtime is {:d}".format(df_melb['unixtime'].min(),df_melb['unixtime'].max()))

The min unixtime is 1454544000 and the max unixtime is 1506124800


## Use Imputation to fill in missing values


In [5]:
target_col = 'Type'

In [6]:
dict_imputation = dict()
for col in df_melb.columns:
    # your code here
    if col != target_col:
        mean = df_melb[col].mean()
        dict_imputation[col] = mean
        df_melb[col].fillna(mean, inplace = True)

In [7]:
dict_imputation

{'Rooms': 2.71,
 'Price': 932558.7,
 'Distance': 10.524599999999985,
 'Postcode': 3113.122,
 'Bathroom': 1.44,
 'Car': 1.503006012024048,
 'Landsize': 638.91,
 'BuildingArea': 121.7832,
 'YearBuilt': 1970.9417475728155,
 'unixtime': 1485178502.4}

In [8]:
df_melb.head()

Unnamed: 0,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,unixtime
0,2,h,399000,8.7,3032,1,1.0,904,53.0,1985.0,1462579200
1,3,h,1241000,13.9,3165,1,1.0,643,121.7832,1970.941748,1472342400
2,2,u,550000,3.0,3067,1,1.0,1521,121.7832,1970.941748,1499472000
3,3,u,691000,8.4,3072,1,1.0,170,121.7832,1970.941748,1498262400
4,2,u,657500,4.6,3122,1,1.0,728,73.0,1965.0,1479513600


## Normalize all the attributes


In [9]:
dict_normalize = dict()
for col in df_melb.columns:
    # your code here
    if col != target_col:
        min = df_melb[col].min()
        max = df_melb[col].max()
        dict_normalize[col] = (min,max)
        df_melb[col] = (df_melb[col] - min)/(max - min)

In [10]:
dict_normalize

{'Rooms': (1, 6),
 'Price': (291000, 5020000),
 'Distance': (0.7, 47.3),
 'Postcode': (3002, 3810),
 'Bathroom': (0, 4),
 'Car': (0.0, 4.0),
 'Landsize': (0, 41400),
 'BuildingArea': (0.0, 475.0),
 'YearBuilt': (1890.0, 2015.0),
 'unixtime': (1454544000, 1506124800)}

In [11]:
df_melb.head()

Unnamed: 0,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,unixtime
0,0.2,h,0.022838,0.171674,0.037129,0.25,0.25,0.021836,0.111579,0.76,0.155779
1,0.4,h,0.200888,0.283262,0.201733,0.25,0.25,0.015531,0.256386,0.647534,0.345059
2,0.2,u,0.054768,0.049356,0.080446,0.25,0.25,0.036739,0.256386,0.647534,0.871022
3,0.4,u,0.084584,0.165236,0.086634,0.25,0.25,0.004106,0.256386,0.647534,0.847571
4,0.2,u,0.077501,0.083691,0.148515,0.25,0.25,0.017585,0.153684,0.6,0.484087


## prep for classification

In [12]:
df_test = pd.read_csv("melb_data_test.csv")

In [13]:
# Your code here to fix date
df_test['Date'] = df_test['Date'].apply( lambda x : standardize_date(x)) 
df_test['unixtime'] = df_test['Date'].apply( lambda x: get_time(x))
df_test = df_test.drop(columns="Date")

print("The min unixtime is {:d} and the max unixtime is {:d}".format(df_test['unixtime'].min(),df_test['unixtime'].max()))

The min unixtime is 1454544000 and the max unixtime is 1506124800


In [14]:
# Your code here for imputation - must use dictionary from above!
for col in df_test.columns:
    if col != target_col:
        df_test[col].fillna(dict_imputation[col], inplace = True)

df_test.head()        

Unnamed: 0,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,unixtime
0,3,h,1116000,17.9,3192,1,2.0,610,121.7832,1970.941748,1498867200
1,3,h,2030000,11.2,3186,2,2.0,366,121.7832,1970.941748,1472342400
2,3,h,1480000,10.7,3187,2,2.0,697,143.0,1925.0,1478476800
3,3,u,1203500,12.3,3166,2,2.0,311,127.0,2000.0,1495843200
4,3,h,540000,14.7,3030,2,2.0,353,135.0,2011.0,1504396800


In [15]:
# Your code here for scaling - must use dictionary from above!
for col in df_test.columns:
    if col != target_col:
        min = dict_normalize[col][0]
        max = dict_normalize[col][1]
        df_test[col] = (df_test[col] - min)/(max - min)
        
df_test.head()

Unnamed: 0,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,unixtime
0,0.4,h,0.174455,0.369099,0.235149,0.25,0.5,0.014734,0.256386,0.647534,0.859296
1,0.4,h,0.367731,0.225322,0.227723,0.5,0.5,0.008841,0.256386,0.647534,0.345059
2,0.4,h,0.251427,0.214592,0.22896,0.5,0.5,0.016836,0.301053,0.28,0.463987
3,0.4,u,0.192958,0.248927,0.20297,0.5,0.5,0.007512,0.267368,0.88,0.80067
4,0.4,h,0.052654,0.300429,0.034653,0.5,0.5,0.008527,0.284211,0.968,0.966499


## Write the kNN classifier function


In [16]:
import operator
def get_neighbors(df_train, df, observation, k, target_col):
    distances = []
    for features in range(len(df)):
        euclidean_distance = np.linalg.norm(observation - df[features])
        distances.append((features, euclidean_distance))
    distances = sorted(distances, key=lambda tup: tup[-1])
    neighbors = []
    for i in range(k):
        neighbors.append(df_train.iloc[distances[i][0],[1]].loc[target_col])
    return neighbors

def knn_class(df_train, k, target_col, observation, use_weighted_vote):
    i = df_train.columns.get_loc(target_col)
    target = df_train[target_col]
    df = df_train.copy()
    df = df_train.drop(columns = target_col).to_numpy()
    observation = observation.drop(target_col).to_numpy()
    
    if use_weighted_vote == True:
        neighbors = get_neighbors(df_train, df, observation, k, target_col)
        count = dict((x, neighbors.count(x)) for x in set(neighbors))
        #print(count)
        #answer = max(count, key=lambda k: count[k])
        m = -1
        ans = ""
        for k in count.keys():
            if count[k] > m:
                m = count[k]
                ans = k
        return ans
        
    else:
        distances = []
        for features in range(len(df)):
                euclidean_distance = np.linalg.norm(observation-df_train[features])
                distances.append([euclidean_distance, features])
        votes = [i[1] for i in sorted(distances)[:k]]
        classification = Counter(votes).most_common(1)[0][0]
        return classification


print(knn_class(df_melb, 3, "Type", df_test.iloc[4], use_weighted_vote=True))
        
    

h


## Compute the accuracy using different k values


In [17]:
poss_k = [1,3,13,25,50,100] # possible k's
acc_k_majority = list(np.zeros(len(poss_k))) # Accuracy for each value of k using majority voting
acc_k_weighted = list(np.zeros(len(poss_k))) # Accuracy for each value of k using weighted voting
for k in range(len(poss_k)):
    correct = 0
    for index, row in df_test.iterrows():
        val = (knn_class(df_melb, poss_k[k], target_col, row, use_weighted_vote=True))
        if val == row[target_col]:
            correct += 1
    acc = correct/len(df_test)
    print(acc)
    acc_k_majority[k] = acc
#     correct = 0
#     for index, row in df_test.iterrows():
#         val = (knn_class(df_melb, poss_k[k], target_col, row, use_weighted_vote=False))
#         if val == row[target_col]:
#             correct += 1
#     acc = correct/len(df_test)
#     print(acc)
#     acc_k_weighted[k] = acc
# Your code here

0.67
0.71
0.71
0.7
0.68
0.69


In [18]:
# plot code here
plt.plot(["1","3","13","25","50","100"], acc_k)
plt.ylabel("Accuracy")
plt.xlabel("# of nearest neighbors (k)")
plt.title("K nearestt neigbors vs Accuracy")
plt.show()

NameError: name 'acc_k' is not defined

➡️ Answer containing your analysis of the I would choose $k = <value> $ and voting scheme because _reasons_ here ⬅️