### Импорты

In [1]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

### Загрузка данных

In [2]:
path = "../data/task_4/"
data = {}
for file_name in os.listdir(path):
    if "unclean" not in file_name:
        data[file_name[:-4]] = pd.read_csv(path+file_name)
brands = list(data.keys())

## Task 1 (easy)

### Обработка данных

In [3]:
brand = brands[0]
print(f"Our brand: {brand}")

Our brand: vauxhall


In [4]:
df = data[brand]
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Corsa,2018,7885,Manual,9876,Petrol,145,55.4,1.4
1,Corsa,2019,11995,Manual,2500,Petrol,145,54.3,1.4
2,Corsa,2017,9777,Automatic,9625,Petrol,145,47.9,1.4
3,Corsa,2016,8500,Manual,25796,Petrol,30,55.4,1.4
4,Corsa,2019,10000,Manual,3887,Petrol,145,43.5,1.4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13632 entries, 0 to 13631
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         13632 non-null  object 
 1   year          13632 non-null  int64  
 2   price         13632 non-null  int64  
 3   transmission  13632 non-null  object 
 4   mileage       13632 non-null  int64  
 5   fuelType      13632 non-null  object 
 6   tax           13632 non-null  int64  
 7   mpg           13632 non-null  float64
 8   engineSize    13632 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 958.6+ KB


In [6]:
X = df.drop(["price", "model", "transmission", "fuelType"], axis=1)
y = df.price

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Обучение модели

In [8]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
None

In [9]:
train_prediction = model.predict(X_train)
test_prediction = model.predict(X_test)

train_accuracy = r2_score(train_prediction, y_train)
test_accuracy = r2_score(test_prediction, y_test)

print(f"r2_score train: {train_accuracy}")
print(f"r2_score test: {test_accuracy}")

r2_score train: 0.996396480894464
r2_score test: 0.7560371478350549


Модель переобучилась... На тесте скор не очень хороший, а на трейне почти 1

In [10]:
pd.Series(model.feature_importances_, index=list(X_train))

year          0.572959
mileage       0.110797
tax           0.016923
mpg           0.185665
engineSize    0.113656
dtype: float64

Самый важный критерий для этой модели - год

## Task 2 (medium)

### Обработка данных

In [11]:
for name in brands:
    print(name, "\t" if len(name) > 6 else "\t\t", list(data[name]))

vauxhall 	 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
bmw 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
vw 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
hyundi 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax(£)', 'mpg', 'engineSize']
audi 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
toyota 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
ford 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
focus 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'engineSize']
skoda 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']
cclass 		 ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'engineSize

In [12]:
data["hyundi"].rename(columns={"tax(£)": "tax"}, inplace=True)

#### Пропуски заменим на среднее

In [13]:
average_mpg = 0
average_tax = 0
n = 0
for name in brands:
    if name not in ["focus", "cclass"]:
        n += 1
        average_mpg += data[name].mpg.mean()
        average_tax += data[name].tax.mean()
average_mpg /= n
average_tax /= n
data["focus"]["mpg"] = average_mpg
data["cclass"]["mpg"] = average_mpg
data["focus"]["tax"] = average_tax
data["cclass"]["tax"] = average_tax

In [14]:
df = pd.concat(data.values())
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Corsa,2018,7885,Manual,9876,Petrol,145.0,55.4,1.4
1,Corsa,2019,11995,Manual,2500,Petrol,145.0,54.3,1.4
2,Corsa,2017,9777,Automatic,9625,Petrol,145.0,47.9,1.4
3,Corsa,2016,8500,Manual,25796,Petrol,30.0,55.4,1.4
4,Corsa,2019,10000,Manual,3887,Petrol,145.0,43.5,1.4


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108540 entries, 0 to 13118
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   model         108540 non-null  object 
 1   year          108540 non-null  int64  
 2   price         108540 non-null  int64  
 3   transmission  108540 non-null  object 
 4   mileage       108540 non-null  int64  
 5   fuelType      108540 non-null  object 
 6   tax           108540 non-null  float64
 7   mpg           108540 non-null  float64
 8   engineSize    108540 non-null  float64
dtypes: float64(3), int64(3), object(3)
memory usage: 8.3+ MB


In [16]:
prepared = pd.get_dummies(df)
X = prepared.drop(["price"], axis=1)
y = prepared.price

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Обучение модели с поиском лучших параметров по сетке

In [18]:
grid = {
    "criterion": ["squared_error", "friedman_mse"],
    'splitter': ["best", "random"],
    'max_depth': [10, 50, 100, 200],
    'max_features': ["sqrt", "log2", None],
}

In [19]:
model = GridSearchCV(DecisionTreeRegressor(), grid)
model.fit(X_train, y_train)
model.best_params_

{'criterion': 'squared_error',
 'max_depth': 200,
 'max_features': None,
 'splitter': 'random'}

In [20]:
train_prediction = model.predict(X_train)
test_prediction = model.predict(X_test)

train_accuracy = r2_score(train_prediction, y_train)
test_accuracy = r2_score(test_prediction, y_test)

print(f"r2_score train: {train_accuracy}")
print(f"r2_score test: {test_accuracy}")

r2_score train: 0.9994892187145668
r2_score test: 0.9337951738790087


У модели с данными параметрами очень неплохой скор на тесте

In [21]:
pd.DataFrame(model.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False).head(10)

Unnamed: 0,params,mean_test_score
23,"{'criterion': 'squared_error', 'max_depth': 20...",0.935142
40,"{'criterion': 'friedman_mse', 'max_depth': 100...",0.934224
16,"{'criterion': 'squared_error', 'max_depth': 10...",0.933464
34,"{'criterion': 'friedman_mse', 'max_depth': 50,...",0.933199
10,"{'criterion': 'squared_error', 'max_depth': 50...",0.932997
41,"{'criterion': 'friedman_mse', 'max_depth': 100...",0.932626
47,"{'criterion': 'friedman_mse', 'max_depth': 200...",0.932614
22,"{'criterion': 'squared_error', 'max_depth': 20...",0.932563
46,"{'criterion': 'friedman_mse', 'max_depth': 200...",0.931633
11,"{'criterion': 'squared_error', 'max_depth': 50...",0.931329


Можно сказать, что friedman_mse показывает себя лучше, чем squared_error
