In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw table and keep it untouched in `hitters`; `dropna` returns a
# new frame, so `df` is the working copy containing only complete rows.
hitters = pd.read_csv("hitters.csv")
df = hitters.dropna()
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [3]:
# One-hot encode the three categorical columns, then drop one indicator per
# variable (League_A, Division_E, NewLeague_A) so that a single 0/1 column
# remains for each; cast to float64 to match the numeric features.
dummies = (
    pd.get_dummies(df[["League", "Division", "NewLeague"]])
    .drop(columns=["League_A", "Division_E", "NewLeague_A"])
    .astype("float64")
)
dummies.head()

Unnamed: 0,League_N,Division_W,NewLeague_N
1,1.0,1.0,1.0
2,0.0,1.0,0.0
3,1.0,0.0,1.0
4,1.0,0.0,1.0
5,0.0,1.0,0.0


In [4]:
# Feature matrix: every column except the target and the raw categoricals,
# cast to float64, joined column-wise with the encoded dummy columns.
x = pd.concat(
    [
        df.drop(columns=["Salary", "League", "Division", "NewLeague"]).astype("float64"),
        dummies,
    ],
    axis=1,
)
x.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1.0,1.0,1.0
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0.0,1.0,0.0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1.0,0.0,1.0
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1.0,0.0,1.0
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0.0,1.0,0.0


In [5]:
# Regression target: the Salary column of the cleaned frame.
y = df.loc[:, "Salary"]

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as a test set; random_state pins the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

KNN Model

In [9]:
from sklearn.neighbors import KNeighborsRegressor

In [10]:
# Baseline: KNN regressor with the library's default hyperparameters,
# fitted on the training split.
knnmodel = KNeighborsRegressor().fit(X=x_train, y=y_train)

In [11]:
# Show the number of neighbours the baseline model is using (no value was
# passed to the constructor, so this is the library default).
knnmodel.n_neighbors

5

Tahmin

In [12]:
# Baseline predictions for the held-out test rows.
y_tahmin = knnmodel.predict(X=x_test)

In [13]:
from sklearn.metrics import mean_squared_error,r2_score

In [14]:
# Test-set RMSE of the baseline model (square root of the mean squared error).
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_tahmin))

426.6570764525201

Model Tuning

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
# GridSearch: in Turkish this can also be called the "ızgara" (grid) method.
# For algorithms with several parameters, it crosses the candidate values given for each parameter with one another and obtains an error value for every combination.
# The same method can also be used with a single parameter; the search is then carried out over that parameter's values.

In [17]:
# Candidate values for the search: n_neighbors = 1, 2, ..., 99.
# Each key in this dict must be spelled exactly as the estimator names that
# parameter (here: n_neighbors).
knn_parametreler = {"n_neighbors": np.arange(1, 100)}

In [18]:
# Fresh, unfitted estimator that the grid search will use as its template.
knn = KNeighborsRegressor()

In [19]:
# Grid search over the candidate n_neighbors values with 10-fold
# cross-validation.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by the RMSE reported elsewhere in
# this notebook -- confirm that this is intended.
knn_cv_model = GridSearchCV(estimator=knn, param_grid=knn_parametreler, cv=10)

In [20]:
# Run the cross-validated search on the training split only; the test split
# stays unseen until the final evaluation.
knn_cv_model.fit(X=x_train, y=y_train)

GridSearchCV(cv=10, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])})

In [21]:
# Number of neighbours selected by the grid search.
knn_cv_model.best_params_["n_neighbors"]

8

In [22]:
# Final model: a new KNN regressor using the n_neighbors value chosen by the
# grid search, fitted on the training split.
knn_tuned = KNeighborsRegressor(
    n_neighbors=knn_cv_model.best_params_["n_neighbors"]
).fit(X=x_train, y=y_train)

In [23]:
# Predictions of the tuned model for the held-out test rows.
y_tahmin_tuned = knn_tuned.predict(X=x_test)

In [24]:
# Test-set RMSE of the tuned model (square root of the mean squared error).
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_tahmin_tuned))

413.7094731463598