In [231]:
import os
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Setup

In [232]:
class Config:
    seed = 1707
    test_size = 0.2

In [233]:
def set_all_seeds(seed):
    # python's seeds
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

set_all_seeds(seed=Config.seed)

# Load data

In [234]:
train = pd.read_csv('../data/car_train.csv')
test = pd.read_csv('../data/car_test.csv')

In [235]:
train.sample(3)

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class
435,g-1503076Q,Smart ForFour,economy,petrol,6.08,2017,134494,2017,48.26,engine_check
2285,I69261244y,Nissan Qashqai,standart,petrol,4.18,2015,87802,2021,98.3,another_bug
1245,O-2688441F,Tesla Model 3,premium,electro,4.82,2015,80038,2021,56.28,engine_fuel


In [236]:
test.sample(3)

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work
607,I-1503695V,Nissan Qashqai,standart,petrol,4.56,2013,39508,2021
1859,T-2294411h,Skoda Rapid,economy,petrol,5.28,2014,60231,2016
1446,E-5337479S,Skoda Rapid,economy,petrol,4.54,2013,42677,2017


# Feature engineering

In [237]:
rides = pd.read_csv('../data/rides_info.csv')
rides.sample(3)

Unnamed: 0,user_id,car_id,ride_id,ride_date,rating,ride_duration,ride_cost,speed_avg,speed_max,stop_times,distance,refueling,user_ride_quality,deviation_normal
478669,P16103023s,h20080273C,Y1g,2020-03-30,6.48,13,177,33,42.160949,0,393.856126,0,3.189878,1.339
256062,g50133062h,R61678217L,F1m,2020-02-27,5.34,36,461,49,73.0,1,745.277404,0,1.72349,-1.567
451753,C34625892I,f61508634I,s1N,2020-01-27,6.81,56,889,46,67.0,1,2089.670705,0,-10.100461,0.184


In [238]:
rides_df_gr = rides.groupby('car_id', as_index=False).agg(
    mean_rating=('rating', 'mean'),
    distance_sum=('distance', 'sum'),
    rating_min=('rating', 'min'),
    speed_max=('speed_max', 'max'),
    user_ride_quality_median=('user_ride_quality', 'median'),
    deviation_normal_count=('deviation_normal', 'count'),
    user_uniq=('user_id', lambda x: x.nunique())
)
#
rides_df_gr.head(3)

Unnamed: 0,car_id,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,A-1049127W,4.255172,11257530.0,0.1,179.729652,-0.288229,174,172
1,A-1079539w,4.088046,19127650.0,0.1,184.505566,2.508746,174,173
2,A-1162143G,4.662299,2995194.0,0.1,180.0,0.639653,174,172


In [239]:
def add_feature(df):

    if 'mean_rating' not in df.columns:
        return df.merge(rides_df_gr, on='car_id', how='left')
    return df

train = add_feature(train)
test = add_feature(test)

## Encode categorical features as a one-hot

In [240]:
features_cat = ['car_type', 'fuel_type', 'model']
train = pd.get_dummies(train, columns=features_cat)
test = pd.get_dummies(test, columns=features_cat)

train.sample(3)

Unnamed: 0,car_id,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,...,model_Renault Sandero,model_Skoda Rapid,model_Smart Coupe,model_Smart ForFour,model_Smart ForTwo,model_Tesla Model 3,model_VW Polo,model_VW Polo VI,model_VW Tiguan,model_Volkswagen ID.4
1799,F21467141f,3.96,2015,86262,2016,29.02,electro_bug,4.283908,20347800.0,0.0,...,False,False,False,False,False,False,False,False,False,False
260,G13398649k,4.2,2014,65872,2017,54.52,engine_ignition,4.133391,17227250.0,0.1,...,False,False,False,False,False,False,True,False,False,False
1112,v22019991n,3.24,2015,78360,2021,34.65,electro_bug,4.155057,13031440.0,0.0,...,False,False,False,False,False,False,False,False,False,False


## Classifying the features

In [241]:
features_drop = ['car_id', 'target_reg']
features_target = ['target_class', 'target_reg']
features_cat = ['car_type', 'fuel_type', 'model']

In [242]:
features_filtered = [i for i in train.columns if (i not in features_target and i not in features_drop)]
features_num = [i for i in features_filtered if i not in features_cat]

print(
    f'features_cat: {len(features_cat)}, {features_cat}\n'
    f'features_num: {len(features_num)}, {features_num}\n'
    f'features_target: {len(features_target)}, {features_target}\n'
)

features_cat: 3, ['car_type', 'fuel_type', 'model']
features_num: 43, ['car_rating', 'year_to_start', 'riders', 'year_to_work', 'mean_rating', 'distance_sum', 'rating_min', 'speed_max', 'user_ride_quality_median', 'deviation_normal_count', 'user_uniq', 'car_type_business', 'car_type_economy', 'car_type_premium', 'car_type_standart', 'fuel_type_electro', 'fuel_type_petrol', 'model_Audi A3', 'model_Audi A4', 'model_Audi Q3', 'model_BMW 320i', 'model_Fiat 500', 'model_Hyundai Solaris', 'model_Kia Rio', 'model_Kia Rio X', 'model_Kia Rio X-line', 'model_Kia Sportage', 'model_MINI CooperSE', 'model_Mercedes-Benz E200', 'model_Mercedes-Benz GLC', 'model_Mini Cooper', 'model_Nissan Qashqai', 'model_Renault Kaptur', 'model_Renault Sandero', 'model_Skoda Rapid', 'model_Smart Coupe', 'model_Smart ForFour', 'model_Smart ForTwo', 'model_Tesla Model 3', 'model_VW Polo', 'model_VW Polo VI', 'model_VW Tiguan', 'model_Volkswagen ID.4 ']
features_target: 2, ['target_class', 'target_reg']



# Random Forest with K-fold validation

In [244]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold

In [246]:
X = train[features_filtered].drop(features_target, axis=1, errors='ignore')
y = train[['target_class']]

n_split = 5
clfs = []
scores = []

kf = KFold(n_splits=n_split, shuffle=True, random_state=Config.seed)
for num, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf = RandomForestClassifier(
        n_estimators=2_000,
        min_samples_leaf=4,
        n_jobs=1,
        max_features=0.60,
        # class_weight='balanced',
        random_state=Config.seed,
        max_depth=6
    )

    clfs.append(clf) # Save model

    clf.fit(X_train, y_train["target_class"])

    y_pred = clf.predict(X_test)
    score = np.mean(np.array(y_pred == y_test['target_class']))
    scores.append(score)
    print(f'fold: {num} acc: {score}')

assert len(clfs) == n_split

# Count avg and dispersion for all folds
print(f'mean accuracy score: {np.mean(scores, dtype="float16")} std: {np.std(scores)}')

fold: 0 acc: 0.7756410256410257
fold: 1 acc: 0.8055555555555556
fold: 2 acc: 0.8008565310492506
fold: 3 acc: 0.8094218415417559
fold: 4 acc: 0.8265524625267666
mean accuracy score: 0.8037109375 std: 0.0164545695250253
