In [1]:
import pandas as pd
import numpy as np
import seaborn as sb

from sklearn import model_selection

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
# from sklearn import datasets, linear_model, metrics 
# (n_neighbors=10)


In [2]:
df = pd.read_csv('autos.csv', encoding = "ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Offer,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Offer,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Offer,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3Tï¿½RER,privat,Offer,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Offer,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [4]:
carsCopyDf = df.copy()

In [5]:
carsCopyDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   dateCrawled          371528 non-null  object
 1   name                 371528 non-null  object
 2   seller               371528 non-null  object
 3   offerType            371528 non-null  object
 4   price                371528 non-null  int64 
 5   abtest               371528 non-null  object
 6   vehicleType          333659 non-null  object
 7   yearOfRegistration   371528 non-null  int64 
 8   gearbox              351319 non-null  object
 9   powerPS              371528 non-null  int64 
 10  model                351044 non-null  object
 11  kilometer            371528 non-null  int64 
 12  monthOfRegistration  371528 non-null  int64 
 13  fuelType             338142 non-null  object
 14  brand                371528 non-null  object
 15  notRepairedDamage    299468 non-nu

# Data preproccesing and visualisation 

In [6]:
columnsToDrop=['name','dateCrawled','dateCreated','postalCode','lastSeen','nrOfPictures']

In [7]:
carsCopyDf = carsCopyDf.drop(columns= columnsToDrop)

In [8]:
carsCopyDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 14 columns):
seller                 371528 non-null object
offerType              371528 non-null object
price                  371528 non-null int64
abtest                 371528 non-null object
vehicleType            333659 non-null object
yearOfRegistration     371528 non-null int64
gearbox                351319 non-null object
powerPS                371528 non-null int64
model                  351044 non-null object
kilometer              371528 non-null int64
monthOfRegistration    371528 non-null int64
fuelType               338142 non-null object
brand                  371528 non-null object
notRepairedDamage      299468 non-null object
dtypes: int64(5), object(9)
memory usage: 39.7+ MB


In [9]:
sb.set(rc={'figure.figsize':(11.7,8.27)})

In [10]:
carsCopyDf = carsCopyDf.drop_duplicates(keep='first', inplace = False)

In [11]:
carsCopyDf.isnull().sum()


seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType            36439
yearOfRegistration         0
gearbox                18562
powerPS                    0
model                  19708
kilometer                  0
monthOfRegistration        0
fuelType               32222
brand                      0
notRepairedDamage      68964
dtype: int64

In [12]:
Q1 = np.percentile(carsCopyDf[carsCopyDf.yearOfRegistration < 2020].yearOfRegistration, 25, interpolation = 'midpoint')  
Q3 = np.percentile(carsCopyDf[carsCopyDf.yearOfRegistration < 2020].yearOfRegistration, 75, interpolation = 'midpoint')  

IQR = Q3 - Q1  
low_lim = Q1 - 1.5 * IQR 
up_lim = Q3 + 1.5 * IQR
low_lim

1985.5

In [13]:
PQ1 = np.percentile(carsCopyDf.price, 25, interpolation = 'midpoint')  
PQ3 = np.percentile(carsCopyDf.price, 75, interpolation = 'midpoint')  

PIQR = PQ3 - PQ1  
Plow_lim = PQ1 - 1.5 * PIQR 
Pup_lim = PQ3 + 1.5 * PIQR
Plow_lim
Pup_lim

16333.5

In [14]:
carsCopyDf = carsCopyDf[carsCopyDf.price < 17475]

In [15]:
carsCopyDf = carsCopyDf[carsCopyDf.yearOfRegistration > 1985 ] 

In [16]:
carsCopyDf = carsCopyDf[carsCopyDf.monthOfRegistration < 2020 ] 

In [17]:
carsCopyDf.monthOfRegistration = carsCopyDf.monthOfRegistration / 12

In [18]:
age = 2020 - carsCopyDf.yearOfRegistration + carsCopyDf.monthOfRegistration/12

In [19]:
carsCopyDf['age'] = age

In [20]:
carsCopyDf = carsCopyDf.drop(columns=['yearOfRegistration', 'monthOfRegistration'])

In [21]:
carsCopyDf = carsCopyDf[carsCopyDf.powerPS > 50]

In [22]:
carsCopyDf = carsCopyDf[carsCopyDf.powerPS < 1000]

In [23]:
carsCopyDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 278619 entries, 2 to 371526
Data columns (total 13 columns):
seller               278619 non-null object
offerType            278619 non-null object
price                278619 non-null int64
abtest               278619 non-null object
vehicleType          257756 non-null object
gearbox              273021 non-null object
powerPS              278619 non-null int64
model                267105 non-null object
kilometer            278619 non-null int64
fuelType             259402 non-null object
brand                278619 non-null object
notRepairedDamage    234256 non-null object
age                  278619 non-null float64
dtypes: float64(1), int64(3), object(9)
memory usage: 29.8+ MB


In [24]:
carsCopyDf.isnull().sum()


seller                   0
offerType                0
price                    0
abtest                   0
vehicleType          20863
gearbox               5598
powerPS                  0
model                11514
kilometer                0
fuelType             19217
brand                    0
notRepairedDamage    44363
age                      0
dtype: int64

In [25]:
carsCopyDf.price.mean()

4450.271618231348

In [6]:
carsCopyDf = carsCopyDf.dropna()

In [27]:
carsCopyDf = carsCopyDf[carsCopyDf['kilometer'] > 35000]


In [7]:
carsCopyDf = carsCopyDf.apply(LabelEncoder().fit_transform)

# Algorithms

In [29]:
# array = carsCopyDf.values
# X=carsCopyDf.iloc[:,:-1]
# Y=carsCopyDf.iloc[:,-1]

# result = []
# # evaluate each model in turn
# reg = LinearRegression().fit(X, Y)
# result.append(('LinearRegression', reg.score(X, Y)))

# reg = KNeighborsRegressor(n_neighbors=10).fit(X, Y)
# result.append(('KNN-n=10', reg.score(X, Y)))

# reg = KNeighborsRegressor(n_neighbors=100).fit(X, Y)
# result.append(('KNN-n=100', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=10, criterion='mse').fit(X, Y)
# result.append(('RandomRegressor-n=10-mse', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=100, criterion='mse').fit(X, Y)
# result.append(('RandomRegressor-n=100-mse', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=10, criterion='mae').fit(X, Y)
# result.append(('RandomRegressor-n=10-mae', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=100, criterion='mae').fit(X, Y)
# result.append(('RandomRegressor-n=100-mae', reg.score(X, Y)))


# # result
    

In [8]:
array = carsCopyDf.values
X1=carsCopyDf.iloc[:,:-1]
Y=carsCopyDf.iloc[:,-1]

fs = SelectKBest(score_func=f_regression, k=10)
# apply feature selection
X = fs.fit_transform(X1, Y)

# result = []
# # evaluate each model in turn
# reg = LinearRegression().fit(X, Y)
# result.append(('LinearRegression', reg.score(X, Y)))

# reg = KNeighborsRegressor(n_neighbors=10).fit(X, Y)
# result.append(('KNN-n=10', reg.score(X, Y)))

# reg = KNeighborsRegressor(n_neighbors=100).fit(X, Y)
# result.append(('KNN-n=100', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=10, criterion='mse').fit(X, Y)
# result.append(('RandomRegressor-n=10-mse', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=100, criterion='mse').fit(X, Y)
# result.append(('RandomRegressor-n=100-mse', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=10, criterion='mae').fit(X, Y)
# result.append(('RandomRegressor-n=10-mae', reg.score(X, Y)))

# reg = RandomForestRegressor(n_estimators=100, criterion='mae').fit(X, Y)
# result.append(('RandomRegressor-n=100-mae', reg.score(X, Y)))


# for name, score in results:
#     msg = "%s: %f" % (name, score)
#     print(msg)

  corr /= X_norms
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [9]:
result = []
# evaluate each model in turn
reg = LinearRegression().fit(X, Y)
result.append(('LinearRegression', reg.score(X, Y)))
result

[('LinearRegression', 0.3282654374827726)]

In [10]:
reg = KNeighborsRegressor(n_neighbors=10).fit(X, Y)
result.append(('KNN-n=10', reg.score(X, Y)))
result

[('LinearRegression', 0.3282654374827726), ('KNN-n=10', 0.41631869303912183)]

In [11]:
reg = KNeighborsRegressor(n_neighbors=100).fit(X, Y)
result.append(('KNN-n=100', reg.score(X, Y)))
result

[('LinearRegression', 0.3282654374827726),
 ('KNN-n=10', 0.41631869303912183),
 ('KNN-n=100', 0.34996286380417996)]

In [12]:
reg = RandomForestRegressor(n_estimators=10, criterion='mse').fit(X, Y)
result.append(('RandomRegressor-n=10-mse', reg.score(X, Y)))
result

[('LinearRegression', 0.3282654374827726),
 ('KNN-n=10', 0.41631869303912183),
 ('KNN-n=100', 0.34996286380417996),
 ('RandomRegressor-n=10-mse', 0.8809303892166627)]

In [None]:
reg = RandomForestRegressor(n_estimators=100, criterion='mse').fit(X, Y)
result.append(('RandomRegressor-n=100-mse', reg.score(X, Y)))
result

In [None]:
reg = RandomForestRegressor(n_estimators=10, criterion='mae').fit(X, Y)
result.append(('RandomRegressor-n=10-mae', reg.score(X, Y)))
result

In [None]:
reg = RandomForestRegressor(n_estimators=100, criterion='mae').fit(X, Y)
result.append(('RandomRegressor-n=100-mae', reg.score(X, Y)))
result