In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsRegressor


# Loading Dataset

In [2]:
# Location of the raw camera dataset, read relative to the working directory.
DATASET_PATH = 'camera_dataset.csv'

df = pd.read_csv(DATASET_PATH)

# Print column dtypes and non-null counts to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 13 columns):
Model                      1038 non-null object
Release date               1038 non-null int64
Max resolution             1038 non-null float64
Low resolution             1038 non-null float64
Effective pixels           1038 non-null float64
Zoom wide (W)              1038 non-null float64
Zoom tele (T)              1038 non-null float64
Normal focus range         1038 non-null float64
Macro focus range          1037 non-null float64
Storage included           1036 non-null float64
Weight (inc. batteries)    1036 non-null float64
Dimensions                 1036 non-null float64
Price                      1038 non-null float64
dtypes: float64(11), int64(1), object(1)
memory usage: 105.5+ KB


In [3]:
# Preview the first rows of the raw dataset (rendered as the cell's output).
df.head()

Unnamed: 0,Model,Release date,Max resolution,Low resolution,Effective pixels,Zoom wide (W),Zoom tele (T),Normal focus range,Macro focus range,Storage included,Weight (inc. batteries),Dimensions,Price
0,Agfa ePhoto 1280,1997,1024.0,640.0,0.0,38.0,114.0,70.0,40.0,4.0,420.0,95.0,179.0
1,Agfa ePhoto 1680,1998,1280.0,640.0,1.0,38.0,114.0,50.0,0.0,4.0,420.0,158.0,179.0
2,Agfa ePhoto CL18,2000,640.0,0.0,0.0,45.0,45.0,0.0,0.0,2.0,0.0,0.0,179.0
3,Agfa ePhoto CL30,1999,1152.0,640.0,0.0,35.0,35.0,0.0,0.0,4.0,0.0,0.0,269.0
4,Agfa ePhoto CL30 Clik!,1999,1152.0,640.0,0.0,43.0,43.0,50.0,0.0,40.0,300.0,128.0,1299.0


# Making Features and Label

In [4]:
# Names of the numeric dtypes handed to `select_dtypes(include=...)` further down:
# every int/float kind at 16, 32 and 64 bits, ints first.
numerics = [kind + width for kind in ('int', 'float') for width in ('16', '32', '64')]

In [5]:
# Columns deliberately left out of the predictors.
less_significant_features = ['Release date']

# Target: kept as a one-column DataFrame.
label = df[['Price']]

# Predictors: everything except the less significant columns AND the target.
# Leaving 'Price' inside `features` would leak the label into the model inputs.
features = df.drop(less_significant_features + list(label.columns), axis=1)



# Separating Numeric and Non-Numeric Features

In [6]:
# Keep only the columns whose dtype is listed in `numerics`.
# NOTE: the selection is made on `df`, not on `features`, so 'Release date'
# and the target 'Price' are both part of `numeric_features`.
numeric_features = df.select_dtypes(include=numerics)

# Column names, reused later to label the transformed array.
numeric_feaures_cols = numeric_features.columns.tolist()

# Creating preprocessing pipelines

In [7]:
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# newer releases provide `sklearn.impute.SimpleImputer` instead.
preprocessing_transformations_steps = [
    # Replace NaN entries with the mean of their column.
    ('filling_missing_values', Imputer(missing_values=np.nan, strategy='mean')),
    # Rescale each column using MinMaxScaler's default settings.
    ('normalizing the values', MinMaxScaler()),
]

# Individual (name, transformer) pairs, kept available under their own names.
handle_missing_values, normalizing = preprocessing_transformations_steps

preprocessing_transformations = Pipeline(preprocessing_transformations_steps)

# fit_transform hands back a plain array, so the column names are re-attached
# when wrapping the result in a DataFrame.
scaled_values = preprocessing_transformations.fit_transform(numeric_features)
processed_data = pd.DataFrame(scaled_values, columns=numeric_feaures_cols)



In [8]:
# Check the imputation: every column should now report the full non-null count.
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 12 columns):
Release date               1038 non-null float64
Max resolution             1038 non-null float64
Low resolution             1038 non-null float64
Effective pixels           1038 non-null float64
Zoom wide (W)              1038 non-null float64
Zoom tele (T)              1038 non-null float64
Normal focus range         1038 non-null float64
Macro focus range          1038 non-null float64
Storage included           1038 non-null float64
Weight (inc. batteries)    1038 non-null float64
Dimensions                 1038 non-null float64
Price                      1038 non-null float64
dtypes: float64(12)
memory usage: 97.4 KB


In [9]:
# Preview the first rows of the imputed, min-max scaled values.
processed_data.head()

Unnamed: 0,Release date,Max resolution,Low resolution,Effective pixels,Zoom wide (W),Zoom tele (T),Normal focus range,Macro focus range,Storage included,Weight (inc. batteries),Dimensions,Price
0,0.230769,0.182336,0.128205,0.0,0.730769,0.220077,0.583333,0.470588,0.008889,0.225806,0.395833,0.020664
1,0.307692,0.22792,0.128205,0.047619,0.730769,0.220077,0.416667,0.0,0.008889,0.225806,0.658333,0.020664
2,0.461538,0.11396,0.0,0.0,0.865385,0.086873,0.0,0.0,0.004444,0.0,0.0,0.020664
3,0.384615,0.205128,0.128205,0.0,0.673077,0.067568,0.0,0.0,0.008889,0.0,0.0,0.031935
4,0.384615,0.205128,0.128205,0.0,0.826923,0.083012,0.416667,0.0,0.088889,0.16129,0.533333,0.160927


# Using GridSearch

In [None]:
# Build the model steps as a NEW list: preprocessing steps followed by the regressor.
# Appending in place would also modify the list that `preprocessing_transformations`
# was constructed from, and would add a second 'knn' step every time this cell is re-run.
model_steps = preprocessing_transformations_steps + [('knn', KNeighborsRegressor())]

# GridSearchCV expects an estimator, not a list of (name, step) tuples,
# so the steps are wrapped in a Pipeline first.
model_pipeline = Pipeline(model_steps)

# Candidate neighbour counts 1..9; the 'knn__' prefix routes the parameter
# to the pipeline step named 'knn'.
params = {'knn__n_neighbors': list(range(1, 10))}

grid_search = GridSearchCV(model_pipeline, param_grid=params)

In [None]:
# Fit on numeric predictors only. `numeric_features` is selected from `df` and so
# still contains the target column, which would leak the label into the inputs;
# it also contains the columns listed in `less_significant_features`.
# Start from `features` instead and drop the label columns if they are present.
model_inputs = (
    features
    .select_dtypes(include=numerics)
    .drop(list(label.columns), axis=1, errors='ignore')
)

# Missing values are handled by the imputation step inside the searched pipeline.
grid_search.fit(model_inputs, label)