In [1]:
import pandas as pd

Task 1

In [2]:
#прочитали данные, предобработали
df = pd.read_csv('hyundi.csv')
df = df.drop(['model', 'transmission', 'fuelType'], axis=1)
df

Unnamed: 0,year,price,mileage,tax(£),mpg,engineSize
0,2017,7999,17307,145,58.9,1.2
1,2016,14499,25233,235,43.5,2.0
2,2016,11399,37877,30,61.7,1.7
3,2016,6499,23789,20,60.1,1.0
4,2015,10199,33177,160,51.4,2.0
...,...,...,...,...,...,...
4855,2016,8680,25906,0,78.4,1.6
4856,2015,7830,59508,30,65.7,1.7
4857,2017,6830,13810,20,60.1,1.0
4858,2018,13994,23313,145,44.8,1.6


In [3]:
from sklearn.model_selection import train_test_split
#Разделили на выборки
x_train, x_test, y_train, y_test = train_test_split(df.drop(['price'], axis=1),df["price"],train_size=0.8,random_state=42)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
#Обучили, взяли r2_score метрику
model = DecisionTreeClassifier(max_depth=25)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
r2_score(y_test, y_pred)
#max_depth ставим не сильно большое значение, чтобы не переобучиться
#самые важные признаки - year, mileage, mpg, engineSize. Проверил выкидыванием остальных столбцов и сравнением метрики

0.7185537541943192

Task 2

In [5]:
#Объединим в один датафрейм
df = pd.read_csv('hyundi.csv')
df = df.rename(columns={'tax(£)' : 'tax'})
autos = ['bmw.csv', 'cclass.csv', 'focus.csv', 'ford.csv', 'merc.csv', 'skoda.csv', 'toyota.csv', 'vauxhall.csv', 'vw.csv']
for auto in autos:
  df_dop = pd.read_csv(auto)
  df= pd.concat([df, df_dop])
df.fillna(0, inplace=True)
#Ограничим датасет
df = df.sample(n=12345)
df


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
7691,5 Series,2015,12990,Automatic,86000,Diesel,160.0,51.4,3.0
3589,I30,2018,11679,Manual,11513,Diesel,145.0,74.3,1.6
8832,3 Series,2018,22800,Automatic,18627,Petrol,145.0,47.9,2.0
7898,Polo,2015,8495,Manual,40453,Petrol,20.0,60.1,1.2
4223,Astra,2019,12392,Manual,13202,Diesel,145.0,72.4,1.6
...,...,...,...,...,...,...,...,...,...
7111,A Class,2019,25000,Semi-Auto,7800,Petrol,145.0,53.3,1.3
8301,4 Series,2017,22995,Automatic,29989,Diesel,150.0,49.6,3.0
4681,Golf,2016,11000,Manual,33000,Diesel,0.0,74.3,1.6
11557,Tiguan,2016,14990,Manual,31202,Diesel,125.0,58.9,2.0


In [6]:
#Преобразуем категориальные признаки с помощью get_dummies
categories = [
    ('model', set(df['model'])),
    ('transmission', set(df['transmission'])),
    ('fuelType', set(df['fuelType'])),
]

for category, _ in categories:
    df = pd.concat([df.drop(columns=category), pd.get_dummies(df[category], prefix=category)], axis=1)
df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_ 1 Series,model_ 2 Series,model_ 3 Series,model_ 4 Series,...,model_ i8,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
7691,2015,12990,86000,160.0,51.4,3.0,False,False,False,False,...,False,True,False,False,False,True,False,False,False,False
3589,2018,11679,11513,145.0,74.3,1.6,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False
8832,2018,22800,18627,145.0,47.9,2.0,False,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7898,2015,8495,40453,20.0,60.1,1.2,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
4223,2019,12392,13202,145.0,72.4,1.6,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7111,2019,25000,7800,145.0,53.3,1.3,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
8301,2017,22995,29989,150.0,49.6,3.0,False,False,False,True,...,False,True,False,False,False,True,False,False,False,False
4681,2016,11000,33000,0.0,74.3,1.6,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False
11557,2016,14990,31202,125.0,58.9,2.0,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
params = {
    "criterion": ('squared_error', 'friedman_mse', 'absolute_error', 'poisson'),
    "splitter": ('best', 'random'),
    "max_depth": list(range(5, 30)),
}
#Другие параметры брать не очень интересно
#Подбор гиперпараметров осуществляем с помощью GridSearchCV
DTR = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    params,
    n_jobs=-1
)

DTR.fit(x_train, y_train)
best_params = DTR.best_params_
best_params

{'criterion': 'absolute_error', 'max_depth': 11, 'splitter': 'best'}