In [2]:
from xgboost import XGBClassifier
from matplotlib import pyplot
import pandas as pd
import numpy as np

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [19]:
df = pd.read_csv('/home/hawliet/Documents/Data-science-project/data/cars/cars_preprocessed_undrop.csv', sep='\t', encoding='utf-8')
df.head(3)

Unnamed: 0,url,name,model,brand,price,eLabel,bodyType,length,height,width,...,fuelCapacity,fuelConsumption,speed,payload,trailerWeight,vEengineType,vEfuelType,vEengineDisplacement,vEenginePower,torque
0,http://www.cars-data.com//en/audi-s3-quattro-s...,Audi S3 quattro,Audi S3,Audi,42.247,G,hatchback,4159.0,1415.0,1763.0,...,55.0,9.1,238.0,560.0,1600.0,dohc,petrol,1781.0,154.0,270.0
1,http://www.cars-data.com//en/audi-a6-avant-40-...,Audi A6 Avant 40 TDI,Audi A6 Avant,Audi,64.45,C,stationwagon,4939.0,1494.0,1886.0,...,73.0,4.5,241.0,645.0,2000.0,dohc,diesel,1968.0,0.0,400.0
2,http://www.cars-data.com//en/audi-a6-avant-45-...,Audi A6 Avant 45 TDI quattro,Audi A6 Avant,Audi,77.13,E,stationwagon,4939.0,1494.0,1886.0,...,73.0,5.7,250.0,695.0,2000.0,dohc,diesel,2967.0,0.0,500.0


In [20]:
df.drop(columns=['url', 'name', 'model', 'fuelType', 'vehicleTransmission'], inplace=True)

In [21]:
num_cols = ['length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2', 'numberOfAxles',
            'numberOfDoors', 'numberOfForwardGears', 'seatingCapacity', 'cargoVolume', 'roofLoad', 
            'accelerationTime', 'fuelCapacity', 'fuelConsumption', 'speed', 'payload', 'trailerWeight', 
            'vEengineDisplacement', 'vEenginePower', 'torque']
cat_cols = []
for col in df.columns:
    if not (col in num_cols):
        cat_cols.append(col)

In [11]:
df[cat_cols][:5]

Unnamed: 0,brand,price,eLabel,bodyType,modelDate,driveWheelConfiguration,vEengineType,vEfuelType
0,Audi,42.247,G,hatchback,1999.0,front+rear,dohc,petrol
1,Audi,64.45,C,stationwagon,2018.0,front,dohc,diesel
2,Audi,77.13,E,stationwagon,2018.0,front+rear,dohc,diesel
3,Audi,83.0,E,stationwagon,2018.0,front+rear,dohc,diesel
4,Audi,67.96,C,stationwagon,2019.0,front,dohc,petrol


In [32]:
tmp = df.driveWheelConfiguration.str.replace('+/', ' ')
tmp

error: nothing to repeat at position 0

In [10]:
class MultilabelEcoding(BaseEstimator, TransformerMixin):
    def __init__(self, num_top_titles=1):
        self.num_top_titles = num_top_titles
    def fit(self, X_df, y=None):
        title_col = X_df.Name.str.extract(r'([a-zA-z]+)\.', expand=False)
        self.title_counts_ = title_col.value_counts()
        titles = list(self.title_counts_.index)
        self.top_titles_ = titles[:max(1, min(self.num_top_titles, len(titles)))]
        return self
    def transform(self, X_df, y=None):
        transformed_df = X_df.copy()
        title_col = transformed_df.Name.str.extract(r'([a-zA-z]+)\.', expand=False)
        transformed_df.insert(2, 'Title', title_col, True)
        transformed_df['Title'] = transformed_df['Title'].apply(lambda x: x if any(title == x for title in self.top_titles_) else 'Others')
        transformed_df = transformed_df.drop(['Cabin', 'Ticket', 'Name'], axis=1)
        return transformed_df

array(['Audi', 'Honda', 'Renault', 'Buick', 'Opel', 'BMW', 'Peugeot',
       'Citroen', 'Toyota', 'Porsche', 'Austin', 'Maybach',
       'Aston Martin', 'Alfa Romeo', 'Mazda', 'Fiat', 'Mini', 'Ford',
       'Mercedes', 'Kia', 'Ferrari', 'Volkswagen', 'Jaguar', 'Nissan',
       'Bentley', 'Suzuki', 'Talbot', 'Volvo', 'Daihatsu', 'Chevrolet',
       'Lexus', 'Cadillac', 'Lotus', 'Josse', 'Hyundai', 'Skoda',
       'McLaren', 'Saab', 'Mitsubishi', 'Seat', 'Jeep', 'Lada', 'Lancia',
       'Dodge', 'Datsun', 'Lamborghini', 'Infiniti', 'Daimler', 'DS',
       'SsangYong', 'Rover', 'MG', 'TVR', 'Maserati', 'Tesla', 'Morgan',
       'Pontiac', 'Subaru', 'Land Rover', 'Daewoo', 'Galloper', 'FSO',
       'Triumph', 'Carver', 'Dacia', 'Noble', 'Chrysler', 'Smart',
       'Donkervoort', 'Landwind', 'Innocenti', 'Hummer', 'Lincoln',
       'Mercury', 'Rolls-Royce', 'Yugo', 'Abarth', 'Marcos', 'Fisker',
       'Autobianchi', 'PGO', 'Think', 'Asia Motors', 'Corvette', 'Mega',
       'Princess', 'Spec

In [16]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

column_trans = ColumnTransformer([('cats', categorical_transformer, cat_cols)])
# ('nums', SimpleImputer(missing_values=np.nan, strategy='mean'), num_cols)

preprocess_pipeline = Pipeline(steps=[
    ('colsTransformer', column_trans)])
# ('stdScaler', StandardScaler())

X = preprocess_pipeline.fit_transform(df[cat_cols])

In [17]:
si = SimpleImputer(strategy='mean')
y = si.fit_transform(df.price.values.reshape(-1, 1))

In [18]:
X.shape, y.shape

((84174, 27322), (84174, 1))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9)

In [20]:
X_train.shape

(8417, 27322)

In [None]:
model = XGBClassifier()
model.fit(X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
