In [1]:
import pandas as pd
import numpy as np
# import ast

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv('car_price_08_11_2020_final.csv',sep=',')

In [3]:
del_column=['model_info', 'equipment_dict', 'Владельцы','Таможня', 'Состояние', 'priceCurrency', 'name', 'url_saleid', 'image', 'bodyType', 'description', 'parsing_unixtime', 'sell_id','super_gen','vehicleConfiguration']
df.drop(del_column, axis=1,inplace=True)
df.head(2)

Unnamed: 0,brand,engineDisplacement,enginePower,fuelType,mileage,modelDate,model_name,numberOfDoors,productionDate,vehicleTransmission,vendor,ПТС,Привод,Руль,price,auto_class,price_segment,seller_type,section,color
0,MERCEDES,1332,150,бензин,10939,2018,A_KLASSE,5,2018,роботизированная,EUROPEAN,Оригинал,передний,левый,2097000,C,PREMIUM,COMMERCIAL,,белый
1,MERCEDES,1595,122,бензин,56671,2012,A_KLASSE,5,2015,роботизированная,EUROPEAN,Оригинал,передний,левый,1298000,C,PREMIUM,COMMERCIAL,,белый


In [4]:
df.isnull().sum()

brand                     0
engineDisplacement        0
enginePower               0
fuelType                  0
mileage                   0
modelDate                 0
model_name                0
numberOfDoors             0
productionDate            0
vehicleTransmission       0
vendor                    0
ПТС                     333
Привод                    0
Руль                      0
price                     0
auto_class              312
price_segment             0
seller_type               0
section                3669
color                     0
dtype: int64

#### Удаляем строки с пустыми значениями ПТС и auto_class

In [5]:
# удаляем пустые значения
df.dropna(subset=['ПТС'],axis=0, inplace=True)
df.dropna(subset=['auto_class'],axis=0, inplace=True)

df.reset_index(drop=True, inplace=True)

### Обработка признаков

##### price

In [6]:
# стоимость авто переведем в доллары по курсу ноября 2020
usd_to_rub = 76
df.price = df.price/usd_to_rub

##### numberOfDoors

In [7]:
df['numberOfDoors'].value_counts()

5    34145
4    19764
3     1644
2     1510
Name: numberOfDoors, dtype: int64

In [8]:
# удалим авто с нулевым количеством дверей
df = df.drop(df[df.numberOfDoors==0].index)

##### vendor

In [9]:
df['vendor'].value_counts()

EUROPEAN    39082
JAPANESE    17980
RUSSIAN         1
Name: vendor, dtype: int64

In [10]:
# замена значения 
df.loc[df['vendor']=='RUSSIAN', 'vendor'] = 'JAPANESE'

##### section

In [11]:
df['section'].value_counts() 

used    41007
new     12403
Name: section, dtype: int64

In [12]:
df.isnull().sum()

brand                     0
engineDisplacement        0
enginePower               0
fuelType                  0
mileage                   0
modelDate                 0
model_name                0
numberOfDoors             0
productionDate            0
vehicleTransmission       0
vendor                    0
ПТС                       0
Привод                    0
Руль                      0
price                     0
auto_class                0
price_segment             0
seller_type               0
section                3653
color                     0
dtype: int64

In [13]:
df.loc[df['section'].isnull() & (df['mileage']<2000), 'mileage'].index

Int64Index([  91,  168,  339,  462,  468,  518,  661,  716,  717, 1334, 1522,
            1539, 1592, 1597, 1615, 1906, 2054, 2368, 2816, 3262, 3386, 3420,
            3440, 3487, 3505],
           dtype='int64')

In [14]:
# Заменяю пустые поля, значением new, где пробег менее 2000
df.iloc[df.loc[df['section'].isnull() & (df['mileage']<2000), 'mileage'].index, df.columns.get_loc('section')] = 'new'

# Заменяем оставшиеся пустые поля, значением used
df.iloc[df.loc[df['section'].isnull(), ['mileage']].index, df.columns.get_loc('section')] = 'used'

##### engineDisplacement==0

In [15]:
df[['brand', 'model_name', 'engineDisplacement']].loc[df['engineDisplacement']==0] 

Unnamed: 0,brand,model_name,engineDisplacement
3467,NISSAN,LEAF,0
3468,NISSAN,LEAF,0
4481,MERCEDES,B_KLASSE,0
11976,MERCEDES,EQC,0
11977,MERCEDES,EQC,0
...,...,...,...
44137,AUDI,E_TRON,0
44138,AUDI,E_TRON,0
44191,AUDI,E_TRON,0
49500,VOLKSWAGEN,GOLF,0


In [16]:
# заполнение объема двигателя авто, где значения равны 0, медианными значениями объема среди таких же моделей авто
map_engine = df.groupby(['brand','model_name'])['engineDisplacement'].median()
map_index_engine = df.loc[df['engineDisplacement'] == 0].index
for i in map_index_engine:
    df.iloc[i,  df.columns.get_loc('engineDisplacement')] = map_engine[df.iloc[i, df.columns.get_loc('brand')], df.iloc[i, df.columns.get_loc('model_name')]]

In [17]:
df.head(2)

Unnamed: 0,brand,engineDisplacement,enginePower,fuelType,mileage,modelDate,model_name,numberOfDoors,productionDate,vehicleTransmission,vendor,ПТС,Привод,Руль,price,auto_class,price_segment,seller_type,section,color
0,MERCEDES,1332.0,150,бензин,10939,2018,A_KLASSE,5,2018,роботизированная,EUROPEAN,Оригинал,передний,левый,27592.105263,C,PREMIUM,COMMERCIAL,used,белый
1,MERCEDES,1595.0,122,бензин,56671,2012,A_KLASSE,5,2015,роботизированная,EUROPEAN,Оригинал,передний,левый,17078.947368,C,PREMIUM,COMMERCIAL,used,белый


### Обучение модели

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_test, X_train, y_test, y_train = train_test_split(df.drop('price',axis=1), df['price'], train_size=0.27, random_state=42)
X_train.shape

(41656, 19)

In [20]:
X_test.to_csv('X_test.csv')
y_test.to_csv('y_test.csv')

### Создание pipeline

In [21]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

In [22]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    '''
    Трансформирует категориальный признак c помощью get_dummies(drop_first=True)
    '''
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key,  drop_first=True).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key,  drop_first=True)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]
    
class CountEncoder(BaseEstimator, TransformerMixin):
    '''
    Трансформирует категориальный признак. Заменяет каждый признак 
    количеством соответствующих признаков в датасете
    '''
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        self.mapping = X.groupby([self.key]).size()
        return self

    def transform(self, X):
        X.loc[:,self.key] = X.loc[:,self.key].map(self.mapping)
        return X[[self.key]].copy()

class FixEncoder(BaseEstimator, TransformerMixin):
    '''
    Трансформирует категориальный признак ['Привод', 'auto_class', 'price_segment']. 
    Заменяет признаки на константы.
    '''
    def __init__(self):
        self.map_privod = {'задний':0, 'передний':1, 'полный':2}
        self.map_auto_class = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'J':6, 'M':7, 'S':8}
        self.map_price_segment = {'ECONOMY':0, 'MEDIUM':1, 'PREMIUM':2}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X.replace({'Привод': self.map_privod, 'auto_class': self.map_auto_class, \
                  'price_segment': self.map_price_segment}, inplace=True)
#         X.loc[:,'Привод'] = X.loc[:,'Привод'].map(self.map_privod)
#         X.loc[:,'auto_class'] = X.loc[:,'auto_class'].map(self.map_auto_class)
#         X.loc[:,'price_segment'] = X.loc[:,'price_segment'].map(self.map_price_segment)
        return X.copy()

In [23]:
fix_features= ['Привод', 'auto_class', 'price_segment']
dummies_features=['Руль', 'vendor', 'ПТС', 'seller_type', 'section']
countencoder_features = ['fuelType', 'numberOfDoors', 'vehicleTransmission', 'color', 'brand', 'model_name']

remainig_features = list(X_train.columns)
for _ in fix_features:
    remainig_features.remove(_)

for _ in dummies_features:
    remainig_features.remove(_)

for _ in countencoder_features:
    remainig_features.remove(_)


In [33]:
remainig_features

['engineDisplacement', 'enginePower', 'mileage', 'modelDate', 'productionDate']

In [24]:
final_transformers = list()

# dummies_feutures - преобразуем с помощью OHEEncoder
transformer = Pipeline([
                ('selector', ColumnSelector(key=dummies_features)),
                ('scaler_dummies', OHEEncoder(key=dummies_features))
        ])
final_transformers.append(('dummies_feutures', transformer))

# fix_features - преобразуем с помощью FixEncoder
transformer = Pipeline([
                ('selector', ColumnSelector(key=fix_features)),
                ('scale_fix', FixEncoder())
        ])
final_transformers.append(('fix_features', transformer))



# countencoder_features - преобразуем с помощью CountEncoder
for col in countencoder_features:
    transformer = Pipeline([
                    ('selector', NumberSelector(key=col)),
                    ('scaler_count', CountEncoder(key=col))
            ])
    final_transformers.append((col, transformer))

    
# remainig_feutures - оставляем как в исходнике
transformer = Pipeline([
                    ('selector', ColumnSelector(key=remainig_features))
        ])
final_transformers.append(('remainig_feutures',transformer))

In [25]:
feats = FeatureUnion(final_transformers)

In [26]:
feats.fit(df.drop('price',axis=1), df['price'])

FeatureUnion(n_jobs=None,
             transformer_list=[('dummies_feutures',
                                Pipeline(memory=None,
                                         steps=[('selector',
                                                 ColumnSelector(key=['Руль',
                                                                     'vendor',
                                                                     'ПТС',
                                                                     'seller_type',
                                                                     'section'])),
                                                ('scaler_dummies',
                                                 OHEEncoder(key=['Руль',
                                                                 'vendor',
                                                                 'ПТС',
                                                                 'seller_type',
                                         

In [27]:
regressor = Pipeline([
    ('features', feats),
    ('regressor', RandomForestRegressor())
])

In [28]:
# regressor.fit(X_train, y_train)
regressor.fit(df.drop('price',axis=1), df['price'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regex=regex,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('dummies_feutures',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  ColumnSelector(key=['Руль',
                                                                                      'vendor',
                                                                                      'ПТС',
                                                                                      'seller_type',
                                                                                      'section'])),
                                                                 ('scaler_dummies',
                                                                  OHEEncoder(key=['Руль',
                              

In [29]:
y_pred = regressor.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regex=regex,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [30]:
mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'mae: {mae}\nr2: {r2}')

mae: 1185.0381993802848
r2: 0.9902917314211762


In [32]:
# !pip freeze > requirements.txt
# !pip install pipreqs
# !pipreqs --force --use-local
# !conda list -e > requirements.txt

In [None]:
# asdasdf/0
import dill

with open("regressor_autoru_pipeline.dill", "wb") as f:
    dill.dump(regressor, f)