In [101]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder

%matplotlib inline

In [90]:
# Location of the tab-separated input file, relative to the notebook.
DATA_PATH = 'ceny_aut_w_polsce.tsv'

# Explicit dtype per column so parsing does not depend on type inference:
# numeric columns are read as 32-bit floats, categorical ones as strings.
COLUMN_DTYPES = {
    'price': np.float32,
    'mileage': np.float32,
    'year': np.float32,
    'brand': str,
    'engingeType': str,
    'engineCapacity': np.float32,
}

df = pd.read_csv(DATA_PATH, sep='\t', dtype=COLUMN_DTYPES)

In [91]:
# Model inputs. NOTE(review): 'engingeType' is selected here, but no branch
# of the pipeline visible in this notebook reads it.
FEATURE_COLUMNS = ['mileage', 'year', 'brand', 'engingeType', 'engineCapacity']
# Regression target.
TARGET_COLUMN = 'price'

X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

In [92]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split,
# and every metric computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

In [93]:
class PandasSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of columns out of a frame and return them as an array.

    Parameters
    ----------
    selected_columns : list
        Column labels used to index the input in `transform`.
    """

    def __init__(self, selected_columns):
        self.selected_columns = selected_columns

    def transform(self, df):
        """Return `df[selected_columns]` converted with `np.array`."""
        subset = df[self.selected_columns]
        return np.array(subset)

    def fit(self, df, *args):
        """Learn nothing from the data; only flag the instance as fitted.

        Extra positional arguments (such as a target) are accepted and ignored.
        """
        self.fitted_ = True
        return self

In [97]:
class MileagePerYear(BaseEstimator, TransformerMixin):
    """Build a single feature: log(1 + mileage / age), with age in years.

    Parameters
    ----------
    reference_year : number, default=2018
        Year the age is measured against (age = reference_year - year).
    min_age : number, default=1.0
        Lower bound applied to the age before dividing. Without it, a row
        whose 'year' equals `reference_year` divides by zero and produces
        inf/NaN; rows newer than `reference_year` would give a negative rate.
        Rows with age >= `min_age` are unaffected.
    """

    def __init__(self, reference_year=2018, min_age=1.0):
        self.reference_year = reference_year
        self.min_age = min_age

    def fit(self, df, *args):
        """Stateless transformer: nothing is learned. Returns self."""
        return self

    def transform(self, df):
        """Return an (n_rows, 1) array computed from df['mileage'] and df['year']."""
        # Clamp the age so the division below can never hit zero or go negative.
        # NaN years stay NaN (np.maximum propagates them).
        age = np.maximum(self.reference_year - df['year'], self.min_age)
        mpy = np.array(df['mileage'] / age)
        log_mpy = np.log(1 + mpy)
        # Reshape to a column so the result can be stacked by a feature union.
        return np.reshape(log_mpy, (-1, 1))


class StringConverter(BaseEstimator, TransformerMixin):
    """Replace each cell of a 2-D array by an integer code learned per column.

    Codes start at 1 and follow first-appearance order within a column.
    Code 0 is what `transform` emits for values not seen during `fit`.
    """

    def __init__(self):
        # Layout: {column index: {value: integer code}}
        self.map = {}

    def fit(self, X, *args):
        """Learn the value -> code table of every column of `X`. Returns self."""
        n_rows, n_cols = X.shape[0], X.shape[1]
        for col in range(n_cols):
            # Start this column from an empty table on every fit.
            codes = self.map[col] = {}
            for row in range(n_rows):
                # setdefault keeps the code of an already-seen value; a new
                # value receives the next free code (table size + 1).
                codes.setdefault(X[row, col], len(codes) + 1)
        return self

    def transform(self, X):
        """Return a float array shaped like `X` holding the learned codes."""
        encoded = np.zeros(shape=X.shape)
        n_rows, n_cols = X.shape[0], X.shape[1]
        for col in range(n_cols):
            for row in range(n_rows):
                # Unknown values fall back to 0.
                encoded[row, col] = self.map[col].get(X[row, col], 0)
        return encoded

In [111]:
# Year against which a car's age is measured. Keep in sync with the year
# used by MileagePerYear.
REFERENCE_YEAR = 2018


def log_car_age(year):
    """Return log(age), where age = REFERENCE_YEAR - year, element-wise.

    Ages below one year are clamped to 1, so cars built in or after the
    reference year give 0 instead of -inf/NaN (non-finite values are rejected
    by scikit-learn estimators). Ages >= 1 are unchanged.

    A named function is used instead of a lambda so the fitted pipeline can
    be pickled.
    """
    return np.log(np.maximum(REFERENCE_YEAR - year, 1))


pipeline = make_pipeline(
    make_union(
        # how old is it?
        make_pipeline(
            PandasSelector(['year']),
            FunctionTransformer(log_car_age, validate=False),
        ),
        # float feats
        make_pipeline(
            PandasSelector(['mileage', 'engineCapacity']),
            StandardScaler(),
        ),
        # was it used often?
        make_pipeline(
            MileagePerYear(),
            StandardScaler(),
        ),
        # brand
        make_pipeline(
            PandasSelector(['brand']),
            StringConverter(),
            # StringConverter emits 0 for brands unseen during fit;
            # handle_unknown='ignore' encodes such rows as all zeros instead
            # of raising at predict time.
            # NOTE: `sparse` was renamed `sparse_output` in scikit-learn 1.2;
            # switch the keyword when upgrading.
            OneHotEncoder(sparse=False, handle_unknown='ignore'),
            # Seeded because PCA's default solver selection may pick a
            # randomized solver.
            PCA(20, random_state=0),
        ),
    ),
    LinearRegression(),
)

model = pipeline.fit(X_train, y_train)

In [112]:
# Mean absolute error on the held-out split, in the units of the target.
# `mean_absolute_error` is imported in the notebook's top import cell.
mean_absolute_error(y_test, model.predict(X_test))

14756.51365243448