# KNN (K-Nearest Neighbors) Regression

In [18]:
# inserted cell

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [19]:
# Read the Hitters CSV a single time; both the preview and the cleaned frame
# below are derived from this one read instead of parsing the file twice.
hitters_raw = pd.read_csv('data/Hitters.csv')
print(hitters_raw.head())

# Build the modelling frame: drop every row that has a missing value (the raw
# preview shows NaN in Salary) and drop the 'Player' name column.
data = hitters_raw.dropna().drop('Player', axis = 1)
print(data.head())

# Column dtypes / non-null counts, then the underlying values as the cell output.
data.info()
data.values

              Player  AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  \
0     -Andy Allanson    293    66      1    30   29     14      1     293   
1        -Alan Ashby    315    81      7    24   38     39     14    3449   
2       -Alvin Davis    479   130     18    66   72     76      3    1624   
3      -Andre Dawson    496   141     20    65   78     37     11    5628   
4  -Andres Galarraga    321    87     10    39   42     30      2     396   

   CHits    ...      CRuns  CRBI  CWalks  League Division PutOuts  Assists  \
0     66    ...         30    29      14       A        E     446       33   
1    835    ...        321   414     375       N        W     632       43   
2    457    ...        224   266     263       A        W     880       82   
3   1575    ...        828   838     354       N        E     200       11   
4    101    ...         48    46      33       N        E     805       40   

   Errors  Salary  NewLeague  
0      20     NaN          A  
1     

array([[315, 81, 7, ..., 10, 475.0, 'N'],
       [479, 130, 18, ..., 14, 480.0, 'A'],
       [496, 141, 20, ..., 3, 500.0, 'N'],
       ..., 
       [475, 126, 3, ..., 7, 385.0, 'A'],
       [573, 144, 9, ..., 12, 960.0, 'A'],
       [631, 170, 9, ..., 3, 1000.0, 'A']], dtype=object)

In [20]:
# Start from every column label of the cleaned frame, as a plain Python list
feature_cols = data.columns.tolist()
print(type(feature_cols))

# 'Salary' is what we want to predict, so take it out of the predictor list
feature_cols.remove('Salary')
print(feature_cols)

# Keep only the predictor columns of the original DataFrame
X = data.loc[:, feature_cols]

# Preview the first 5 rows of the feature matrix
X.head()

<class 'list'>
['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division', 'PutOuts', 'Assists', 'Errors', 'NewLeague']


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,A


In [21]:
# One-hot encode the string-valued columns (League, Division, NewLeague);
# the numeric columns are passed through unchanged.
X = pd.get_dummies(X)

# Refresh the feature-name list so it matches the encoded columns.
feature_cols = X.columns.tolist()
print(feature_cols)

['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'PutOuts', 'Assists', 'Errors', 'League_A', 'League_N', 'Division_E', 'Division_W', 'NewLeague_A', 'NewLeague_N']


In [22]:
# Sanity check: report the container type of X, then its (rows, columns) shape
for summary in (type(X), X.shape):
    print(summary)

<class 'pandas.core.frame.DataFrame'>
(263, 22)


In [23]:
# Extract the regression target: bracket indexing returns the 'Salary' column
# as a pandas Series.
y = data['Salary']

# Attribute access returns the same Series; it only works when the column name
# is a valid Python identifier (e.g. contains no spaces). This rebinds y to an
# equivalent object and is shown purely as an alternative syntax.
y = data.Salary

# Show the first 5 target values as the cell output.
y.head()

1    475.0
2    480.0
3    500.0
4     91.5
5    750.0
Name: Salary, dtype: float64

In [24]:
# Sanity check: report the container type of y, then its shape
for summary in (type(y), y.shape):
    print(summary)

<class 'pandas.core.series.Series'>
(263,)


In [25]:
# Split features and target into train/test partitions using train_test_split's
# default split proportions; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# Fit a 5-nearest-neighbours regressor on the raw (unscaled) training features.
# KNeighborsRegressor is imported in the notebook's top import cell.
knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)

### Making predictions

In [26]:
# Predict the target for every row of the held-out test features using the
# currently bound knnreg model; the result is stored in y_pred.
y_pred = knnreg.predict(X_test)

### Computing  $R^2$

In [27]:
# R^2 of the current model on the held-out test split
r2_raw = knnreg.score(X_test, y_test)
print(r2_raw)

0.401786251468


### Computing the RMSE 

In [28]:
# RMSE is the square root of the mean squared error between truth and predictions
mse_raw = mean_squared_error(y_test, y_pred)
print(np.sqrt(mse_raw))

420.224243098


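### Normalizing the data

KNN predicts from distances between rows, so features measured on larger scales dominate the neighbor search; rescaling every feature to a common range puts them on an equal footing.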

In [29]:
# Min-max scale the features (MinMaxScaler's default target range is [0, 1]).
# The scaler is fitted on the training split only and then applied to the test
# split, so no statistics from the test set are used during fitting.
# MinMaxScaler and KNeighborsRegressor are imported in the top import cell.
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit a 5-nearest-neighbours regressor on the scaled training features.
# Note: this rebinds `knnreg`, replacing the model fitted on unscaled data.
knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train_scaled, y_train)


### Making predictions

In [30]:
# Predict the target for the scaled test features with the currently bound
# knnreg model. This rebinds y_pred, overwriting any earlier predictions.
y_pred = knnreg.predict(X_test_scaled)

### Computing  $R^2$

In [31]:
# R^2 of the current model on the scaled held-out test split
r2_scaled = knnreg.score(X_test_scaled, y_test)
print(r2_scaled)

0.350079211453


### Computing the RMSE 

In [32]:
# RMSE is the square root of the mean squared error between truth and predictions
mse_scaled = mean_squared_error(y_test, y_pred)
print(np.sqrt(mse_scaled))

438.009089406
