In [1]:
import pandas as pd

Task 1

In [2]:
#прочитали данные, предобработали
df = pd.read_csv('hyundi.csv')
df = df.drop(['model', 'transmission', 'fuelType'], axis=1)
df

Unnamed: 0,year,price,mileage,tax(£),mpg,engineSize
0,2017,7999,17307,145,58.9,1.2
1,2016,14499,25233,235,43.5,2.0
2,2016,11399,37877,30,61.7,1.7
3,2016,6499,23789,20,60.1,1.0
4,2015,10199,33177,160,51.4,2.0
...,...,...,...,...,...,...
4855,2016,8680,25906,0,78.4,1.6
4856,2015,7830,59508,30,65.7,1.7
4857,2017,6830,13810,20,60.1,1.0
4858,2018,13994,23313,145,44.8,1.6


In [3]:
from sklearn.model_selection import train_test_split
#Разделили на выборки
x_train, x_test, y_train, y_test = train_test_split(df.drop(['price'], axis=1),df["price"],train_size=0.8,random_state=42)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
#Обучили, взяли r2_score метрику
model = DecisionTreeClassifier(max_depth=25)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(r2_score(y_test, y_pred))
#max_depth ставим не сильно большое значение, чтобы не переобучиться
importances = model.feature_importances_
df.drop(['price'], axis=1).columns[importances.argmax()]
#самое важное - пробег

0.7197448920001279


'mileage'

Task 2

In [5]:
#Объединим в один датафрейм
df = pd.read_csv('hyundi.csv')
df = df.rename(columns={'tax(£)' : 'tax'})
autos = ['bmw.csv', 'cclass.csv', 'focus.csv', 'ford.csv', 'merc.csv', 'skoda.csv', 'toyota.csv', 'vauxhall.csv', 'vw.csv']
for auto in autos:
  df_dop = pd.read_csv(auto)
  df= pd.concat([df, df_dop])
df.fillna(0, inplace=True)
#Ограничим датасет
df = df.sample(n=12345)
df


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
1831,GLA Class,2014,15790,Semi-Auto,60101,Diesel,145.0,55.4,2.1
7765,C Class,2019,34579,Semi-Auto,1000,Petrol,145.0,44.1,1.5
6087,EcoSport,2016,10299,Manual,22399,Petrol,125.0,52.3,1.0
1947,Focus,2019,17500,Manual,13402,Petrol,0.0,0.0,1.0
1278,Octavia,2017,10490,Manual,33300,Diesel,150.0,68.9,1.6
...,...,...,...,...,...,...,...,...,...
6653,Kuga,2018,16900,Manual,3736,Diesel,145.0,64.2,1.5
721,RAV4,2016,19975,Automatic,56772,Hybrid,20.0,55.4,2.5
5687,C-HR,2019,23498,Automatic,2920,Hybrid,140.0,57.7,1.8
7611,Mondeo,2015,7561,Manual,98438,Diesel,30.0,64.2,2.0


In [6]:
#Преобразуем категориальные признаки с помощью get_dummies
categories = [
    ('model', set(df['model'])),
    ('transmission', set(df['transmission'])),
    ('fuelType', set(df['fuelType'])),
]

for category, _ in categories:
    df = pd.concat([df.drop(columns=category), pd.get_dummies(df[category], prefix=category)], axis=1)
df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_ 1 Series,model_ 2 Series,model_ 3 Series,model_ 4 Series,...,model_ i3,model_ i8,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
1831,2014,15790,60101,145.0,55.4,2.1,False,False,False,False,...,False,False,False,False,True,True,False,False,False,False
7765,2019,34579,1000,145.0,44.1,1.5,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
6087,2016,10299,22399,125.0,52.3,1.0,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1947,2019,17500,13402,0.0,0.0,1.0,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1278,2017,10490,33300,150.0,68.9,1.6,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6653,2018,16900,3736,145.0,64.2,1.5,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
721,2016,19975,56772,20.0,55.4,2.5,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
5687,2019,23498,2920,140.0,57.7,1.8,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
7611,2015,7561,98438,30.0,64.2,2.0,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
params = {
    "criterion": ('squared_error', 'friedman_mse', 'absolute_error', 'poisson'),
    "splitter": ('best', 'random'),
    "max_depth": list(range(5, 30)),
}
#Другие параметры брать не очень интересно
#Подбор гиперпараметров осуществляем с помощью GridSearchCV
DTR = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    params,
    n_jobs=-1
)

DTR.fit(x_train, y_train)
best_params = DTR.best_params_

print(best_params)
y_pred = DTR.predict(x_test)
r2_score(y_test, y_pred)


{'criterion': 'absolute_error', 'max_depth': 11, 'splitter': 'best'}


0.7500171462325913