In [1]:
# modules needed for inline matplotlib and seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()  # set plotting style to seaborn

In [2]:
# modules
import pandas as pd

# sklearn modules
from sklearn.model_selection import StratifiedShuffleSplit

## Load Data File

In [3]:
# Read the munged bike listings from disk and preview the first five rows
munged_df = pd.read_csv(filepath_or_buffer='data/munged/bikes_2019-03-24.csv')
munged_df.head(n=5)

Unnamed: 0,bike_type,brand,price,frame_material,model_year,brake_type,fork_material,handlebar_material,fd_groupset,rd_groupset,cassette_groupset,crankset_material,crankset_groupset,seatpost_material,chain_groupset,shifter_groupset
0,road,Pinarello,12000.0,carbon,2018.0,rim,carbon,carbon,sram red etap,sram red etap,sram red,,sram red,,sram red,sram red etap
1,road,Pinarello,10000.0,carbon,,hydraulic,carbon,carbon,shimano dura-ace,shimano dura-ace,shimano dura-ace,,shimano dura-ace,,shimano dura-ace,shimano dura-ace
2,road,Factor,9999.0,carbon,2018.0,hydraulic,carbon,,sram red etap,sram red etap,sram force,,sram red,,sram red,sram red etap
3,road,Factor,8750.0,carbon,,rim,carbon,,sram red etap,sram red etap,sram red,,sram red,,sram red,sram red etap
4,road,Factor,8999.0,carbon,2018.0,rim,carbon,carbon,sram red etap,sram red etap,sram red,,sram red,,sram red,sram red etap


### Quick Glance
Let's take a quick glance at the data to get a better sense of what we're working with.

#### Info
* There are 2194 samples
* 16 columns of which **price** and **model_year** are numeric data, everything else is categorical.
* **bike_type**, **brand**, **price**, and **brake_type** are the only fields without missing data.
* Many of the fields have missing values; need to determine how to best handle this.

In [4]:
# Print column dtypes and non-null counts to see which fields have missing values
munged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2194 entries, 0 to 2193
Data columns (total 16 columns):
bike_type             2194 non-null object
brand                 2194 non-null object
price                 2194 non-null float64
frame_material        1936 non-null object
model_year            1582 non-null float64
brake_type            2194 non-null object
fork_material         1076 non-null object
handlebar_material    1150 non-null object
fd_groupset           642 non-null object
rd_groupset           1360 non-null object
cassette_groupset     1503 non-null object
crankset_material     639 non-null object
crankset_groupset     994 non-null object
seatpost_material     1024 non-null object
chain_groupset        902 non-null object
shifter_groupset      1113 non-null object
dtypes: float64(2), object(14)
memory usage: 274.3+ KB


#### Numerical
##### Price
* we have values less than 0, we should look into these and possibly drop before test split
* there seem to be a few very expensive bike samples in the dataset; definitely want to ensure they're distributed in both train and test sets

##### Model Year
* range from 2013 to 2020
* the most common model year (the mode) is 2019, which makes sense since data was collected in the 2018-2019 season

In [5]:
# Most frequent model year across all samples
munged_df['model_year'].mode()

0    2019.0
dtype: float64

In [6]:
# Summary statistics for the numeric columns
munged_df.describe()

Unnamed: 0,price,model_year
count,2194.0,1582.0
mean,2250.573801,2018.257901
std,2151.609557,0.847206
min,-1.0,2013.0
25%,535.9925,2018.0
50%,1499.99,2018.0
75%,3399.98,2019.0
max,12519.99,2020.0


### Data Cleaning
From quick glance of the dataset, it looks like it needs a little more data cleaning before we can proceed with test set hold out. Tasks:
1. We have prices less than zero, we should drop samples with bike prices less than a certain threshold
2. Remove samples that are missing values for all fields except "bike_type", "brand", "price", and "brake_type", which have no missing values.
3. Look at **bike_type** field and determine if we need to drop any specific values. We care primarily about adult bikes.

In [7]:
# Select samples in which every spec column (everything other than
# bike_type, brand, price and brake_type) is missing
empty_rows_df = munged_df[
    munged_df[[
        'frame_material', 'model_year', 'fork_material', 'handlebar_material',
        'fd_groupset', 'rd_groupset', 'cassette_groupset', 'crankset_material',
        'crankset_groupset', 'seatpost_material', 'chain_groupset', 'shifter_groupset',
    ]].isnull().all(axis=1)
]
empty_rows_df.head()

Unnamed: 0,bike_type,brand,price,frame_material,model_year,brake_type,fork_material,handlebar_material,fd_groupset,rd_groupset,cassette_groupset,crankset_material,crankset_groupset,seatpost_material,chain_groupset,shifter_groupset
215,childrens,SE,169.0,,,other,,,,,,,,,,
222,childrens,SE,179.0,,,other,,,,,,,,,,
256,childrens,SE,159.0,,,other,,,,,,,,,,
433,childrens,Weehoo,449.0,,,other,,,,,,,,,,
434,childrens,Weehoo,199.0,,,other,,,,,,,,,,


In [8]:
# Select bikes priced below 100 (this includes the negative placeholder prices)
low_price_df = munged_df.loc[munged_df['price'].lt(100)]
low_price_df.head()

Unnamed: 0,bike_type,brand,price,frame_material,model_year,brake_type,fork_material,handlebar_material,fd_groupset,rd_groupset,cassette_groupset,crankset_material,crankset_groupset,seatpost_material,chain_groupset,shifter_groupset
95,road,Bianchi,-1.0,carbon,2018.0,rim,carbon,,shimano ultegra di2,shimano ultegra di2,shimano ultegra,,shimano ultegra,,,shimano ultegra di2
666,mountain,Niner,-1.0,carbon,2017.0,hydraulic,,,,shimano deore xt,shimano deore xt,,shimano deore xt,,shimano deore xt,shimano deore xt
667,mountain,Pivot,-1.0,carbon,2018.0,hydraulic,,carbon,,shimano xtr,shimano deore xt,,race face,,,shimano deore xt
668,mountain,Pivot,-1.0,carbon,2017.0,hydraulic,,carbon,,sram xO1 eagle,sram xO1 eagle,,sram xO1 eagle,carbon,,sram xO1 eagle
715,mountain,Borealis,-1.0,carbon,2018.0,hydraulic,,alloy,,sram xO1 eagle,sram gx eagle,,sram gx,alloy,sram x1,sram gx


In [9]:
# Count samples per bike_type to decide which categories should be dropped
munged_df['bike_type'].value_counts()

mountain       781
road           441
childrens      242
urban          236
bmx            132
ebike          129
cyclocross      71
comfort         46
hybrid          40
touring         30
commuter        23
folding         12
triathlon        4
track            3
cargo            3
singlespeed      1
Name: bike_type, dtype: int64

In [10]:
# Select the children's bikes so they can be excluded; the focus is adult bikes only
children_df = munged_df.loc[munged_df['bike_type'].eq('childrens')]
children_df.head()

Unnamed: 0,bike_type,brand,price,frame_material,model_year,brake_type,fork_material,handlebar_material,fd_groupset,rd_groupset,cassette_groupset,crankset_material,crankset_groupset,seatpost_material,chain_groupset,shifter_groupset
145,childrens,Fuji,159.99,alloy,2019.0,coaster,steel,steel,,,,steel,,alloy,,
198,childrens,Fuji,344.99,alloy,2019.0,linear_pull,,steel,shimano tourney,shimano acera,shimano altus,,,steel,,shimano tourney
207,childrens,Fuji,424.99,alloy,2019.0,mechanical,,alloy,,shimano acera,shimano tourney,,,alloy,,shimano altus
213,childrens,Fuji,239.99,alloy,,linear_pull,,steel,,shimano tourney,single speed,,,steel,,shimano tourney
215,childrens,SE,169.0,,,other,,,,,,,,,,


In [11]:
# Build the cleaned dataframe by removing every row flagged by the three filters above.
# A set is used because a single row can be flagged by more than one filter and
# must only be counted (and dropped) once.
drop_list = (
    set(empty_rows_df.index.tolist())
    | set(low_price_df.index.tolist())
    | set(children_df.index.tolist())
)
drop_count = len(drop_list)
print(f'There are {drop_count} samples to be dropped.')
print(f'There should be {len(munged_df) - drop_count} samples left afterwards.')
# `drop` documents labels as a single label or list-like, so hand it a sorted list
data_df = munged_df.drop(index=sorted(drop_list))
print(f'There are now {len(data_df)} samples remaining.')

There are 274 samples to be dropped.
There should be 1920 samles left afterwards.
There are now 1920 samples remaining.


In [12]:
# Bike type distribution after cleaning
data_df['bike_type'].value_counts()

mountain       769
road           438
urban          234
ebike          128
bmx            120
cyclocross      71
comfort         46
hybrid          39
touring         30
commuter        23
folding         12
triathlon        4
track            3
cargo            2
singlespeed      1
Name: bike_type, dtype: int64

In [13]:
# Column dtypes and non-null counts of the cleaned dataframe
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1920 entries, 0 to 2193
Data columns (total 16 columns):
bike_type             1920 non-null object
brand                 1920 non-null object
price                 1920 non-null float64
frame_material        1765 non-null object
model_year            1441 non-null float64
brake_type            1920 non-null object
fork_material         966 non-null object
handlebar_material    1032 non-null object
fd_groupset           621 non-null object
rd_groupset           1288 non-null object
cassette_groupset     1430 non-null object
crankset_material     563 non-null object
crankset_groupset     980 non-null object
seatpost_material     928 non-null object
chain_groupset        886 non-null object
shifter_groupset      1062 non-null object
dtypes: float64(2), object(14)
memory usage: 255.0+ KB


In [14]:
# Summary statistics for the numeric columns of the cleaned dataframe
data_df.describe()

Unnamed: 0,price,model_year
count,1920.0,1441.0
mean,2520.56776,2018.252602
std,2163.364683,0.845763
min,159.99,2013.0
25%,724.99,2018.0
50%,1894.5,2018.0
75%,3599.99,2019.0
max,12519.99,2020.0


## Train Test Split
Implement "stratified" split for this regression data. By doing so, the hope is to get a more reliable (lower bias and variance) estimate of model performance. Using regular random split on the dataset is likely to naturally introduce bias into the data or unintentionally exclude data points, and thus affect model performance. Using stratification during model selection produces better results because the validation set(s) more accurately represent the task we need to solve. If one class is hard to predict, we won’t have more of them (artificially hindering the performance metric) or less (assisting performance) if we use stratification.

#### Sorted Stratification
Since this is a regression task and not classification, we need a method to define the "classification" sets of the data so we can ensure similar distribution of the target variable in both the train and test sets. The following approach will be used to achieve this:
1. Sort the samples on the target variable
2. Categorize samples into *k* partitions (or "classes") depending on test size needed
3. Utilize sklearn's "StratifiedShuffleSplit" to generate the train vs. test split

An alternative approach is to categorize samples by **bike_type** to ensure similar distribution of bike types between train and test sets. Categorizing by target variable is most likely the best since that is what we're predicting. The latter might introduce other biases that we're unaware of.

In [40]:
# Fixed seed so the shuffled train/test split below is reproducible across runs
RAND_STATE = 42

In [41]:
# pandas.qcut covers steps 1 and 2 at once: it bins the prices into ten
# quantile-based buckets labelled 0 (cheapest) through 9 (most expensive)
price_cats = pd.qcut(data_df['price'], 10, labels=range(10))
price_cats.head()

0    9
1    9
2    9
3    9
4    9
Name: price, dtype: category
Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]

In [42]:
# Draw one stratified 80/20 shuffle split, using the price buckets as strata
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=RAND_STATE)
train_idx, test_idx = next(splitter.split(X=price_cats, y=price_cats))
print(f'Num train samples: {len(train_idx)}\nNum test samples: {len(test_idx)}')

Num train samples: 1536
Num test samples: 384


In [51]:
# Materialize the train and hold-out (test) frames. The split yields positional
# indices, so rows are picked with iloc; copies keep later edits off data_df.
train_df, test_df = (data_df.iloc[idx].copy() for idx in (train_idx, test_idx))

# Sanity-check the resulting dimensions
print(f'train shape: {train_df.shape}\ntest shape: {test_df.shape}')

train shape: (1536, 16)
test shape: (384, 16)


## Train Data Exploration and Feature Engineering
The data has many categorical fields. The first step is to see how we can transform them into useful numerical fields. This will involve some one-hot encoding and identifying rankable values.

**price** and **model_year** are already numerical. Price is the target variable, so we won't directly touch this since we don't want to include it as a feature else we will be indirectly including what we want to predict as feature.

Let's look at each of the remaining categorical fields to see how best to utilize them or whether to just drop them. Model year has some missing values, so we will need to decide how we want to handle those.

In [48]:
# Column dtypes and non-null counts of the training set
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1536 entries, 1687 to 1902
Data columns (total 16 columns):
bike_type             1536 non-null object
brand                 1536 non-null object
price                 1536 non-null float64
frame_material        1418 non-null object
model_year            1146 non-null float64
brake_type            1536 non-null object
fork_material         756 non-null object
handlebar_material    836 non-null object
fd_groupset           477 non-null object
rd_groupset           1027 non-null object
cassette_groupset     1149 non-null object
crankset_material     460 non-null object
crankset_groupset     795 non-null object
seatpost_material     741 non-null object
chain_groupset        717 non-null object
shifter_groupset      857 non-null object
dtypes: float64(2), object(14)
memory usage: 204.0+ KB


#### Frame Material
There are 1418 non-null samples. Looking at value counts, we see the majority of frames are either carbon or aluminium/alloy. This is what you would expect: the majority of bikes these days are primarily aluminium or some alloy, and at the top end there's a high demand for carbon due to its lightness and stiffness. The average price per frame material confirms this. Carbon frame bikes are considerably more expensive than aluminium or alloy frame bikes. Titanium also seems to garner a higher price, but we only have two samples represented in the train dataset, so we should consider trying to get more data for such bikes.

Since carbon and titanium seem to have a strong relationship to price, the best approach for this categorical data is to implement one-hot encoding for carbon frame material. The same will be done for titanium in preparation for when more data is collected. After doing some further research, titanium seems to be an upcoming frame material for similar reasons as carbon but less fragile.

In [52]:
# Sample count and mean price for each frame material
grouped = train_df.groupby(by='frame_material')
print(train_df['frame_material'].value_counts())
grouped.price.mean()

carbon      590
aluminum    454
alloy       222
steel        88
chromoly     62
titanium      2
Name: frame_material, dtype: int64


frame_material
alloy       1350.845631
aluminum    1639.982753
carbon      4240.846712
chromoly    1002.245161
steel        777.092727
titanium    3499.000000
Name: price, dtype: float64

In [54]:
# Indicator feature: 1 when the frame is carbon, 0 otherwise (missing values -> 0).
# A vectorized comparison replaces the per-row lambda; NaN never equals 'carbon'.
train_df['carbon_frame'] = train_df['frame_material'].eq('carbon').astype('int64')
train_df.carbon_frame.value_counts()

0    946
1    590
Name: carbon_frame, dtype: int64

In [53]:
# Indicator feature: 1 when the frame is titanium, 0 otherwise (missing values -> 0).
# A vectorized comparison replaces the per-row lambda; NaN never equals 'titanium'.
train_df['titanium_frame'] = train_df['frame_material'].eq('titanium').astype('int64')
train_df.titanium_frame.value_counts()

0    1534
1       2
Name: titanium_frame, dtype: int64

#### Other Material Categorical Fields
Except for fork material, we see that carbon material continues to have a strong relationship to price for other material based fields. So we will implement similar one-hot encoding to extract numerical features for these fields.

In [55]:
# Sample count and mean price for each handlebar material
grouped = train_df.groupby(by='handlebar_material')
print(train_df['handlebar_material'].value_counts())
grouped.price.mean()

alloy       565
carbon      139
aluminum     73
steel        41
chromoly     18
Name: handlebar_material, dtype: int64


handlebar_material
alloy       1951.561398
aluminum    1734.927260
carbon      6430.066835
chromoly     658.993333
steel        418.381463
Name: price, dtype: float64

In [58]:
# Indicator feature: 1 when the handlebar is carbon, 0 otherwise (missing values -> 0).
# A vectorized comparison replaces the per-row lambda; NaN never equals 'carbon'.
train_df['carbon_handlebar'] = train_df['handlebar_material'].eq('carbon').astype('int64')
train_df.carbon_handlebar.value_counts()

0    1397
1     139
Name: carbon_handlebar, dtype: int64

In [56]:
# Sample count and mean price for each crankset material
grouped = train_df.groupby(by='crankset_material')
print(train_df['crankset_material'].value_counts())
grouped.price.mean()

alloy       245
carbon       79
steel        55
aluminum     48
chromoly     33
Name: crankset_material, dtype: int64


crankset_material
alloy       1603.336286
aluminum    2626.796667
carbon      6350.193418
chromoly     461.870000
steel       1159.007273
Name: price, dtype: float64

In [59]:
# Indicator feature: 1 when the crankset is carbon, 0 otherwise (missing values -> 0).
# A vectorized comparison replaces the per-row lambda; NaN never equals 'carbon'.
train_df['carbon_crankset'] = train_df['crankset_material'].eq('carbon').astype('int64')
train_df.carbon_crankset.value_counts()

0    1457
1      79
Name: carbon_crankset, dtype: int64

In [57]:
# Sample count and mean price for each seatpost material
grouped = train_df.groupby(by='seatpost_material')
print(train_df['seatpost_material'].value_counts())
grouped.price.mean()

alloy       517
carbon      156
aluminum     48
steel        20
Name: seatpost_material, dtype: int64


seatpost_material
alloy       1299.749729
aluminum    1622.093125
carbon      4037.964103
steel        476.789500
Name: price, dtype: float64

In [60]:
# Indicator feature: 1 when the seatpost is carbon, 0 otherwise (missing values -> 0).
# A vectorized comparison replaces the per-row lambda; NaN never equals 'carbon'.
train_df['carbon_seatpost'] = train_df['seatpost_material'].eq('carbon').astype('int64')
train_df.carbon_seatpost.value_counts()

0    1380
1     156
Name: carbon_seatpost, dtype: int64

Alloy, aluminium, and carbon forks have roughly the same average price. Surprisingly, having a steel or chromoly fork has a strong negative relationship to price. For this, we will implement one-hot encoding for steel and chromoly fork material; or rather, one-hot encoding for non-carbon, non-alloy, non-aluminium fork material.

In [61]:
# Sample count and mean price for each fork material
grouped = train_df.groupby(by='fork_material')
print(train_df['fork_material'].value_counts())
grouped.price.mean()

carbon      396
steel       164
alloy        73
chromoly     67
aluminum     56
Name: fork_material, dtype: int64


fork_material
alloy       2542.837123
aluminum    2259.267143
carbon      2783.785833
chromoly    1011.839104
steel        847.309939
Name: price, dtype: float64

In [65]:
# Indicator feature: 1 when the fork is chromoly or steel, 0 otherwise
# (missing values -> 0). `isin` replaces the per-row lambda with a vectorized test.
train_df['nonstandard_fork'] = train_df['fork_material'].isin(['chromoly', 'steel']).astype('int64')
train_df.nonstandard_fork.value_counts()

0    1305
1     231
Name: nonstandard_fork, dtype: int64

#### Brake Type

In [66]:
# Sample count and mean price for each brake type
grouped = train_df.groupby(by='brake_type')
print(train_df['brake_type'].value_counts())
grouped.price.mean()

hydraulic      773
other          298
mechanical     125
disc           114
vbrake          59
rim             53
caliper         48
linear_pull     28
ubrake          20
coaster         18
Name: brake_type, dtype: int64


brake_type
caliper        1373.881458
coaster         384.374444
disc           2420.468070
hydraulic      3310.022885
linear_pull     996.412143
mechanical     1089.069840
other          2097.056946
rim            2780.842264
ubrake          310.039500
vbrake          499.992203
Name: price, dtype: float64

In [68]:
# Column means for samples whose brake_type is 'disc', 'hydraulic' or 'mechanical'
# (presumably all disc-brake variants - confirm against the data pipeline).
# numeric_only=True is needed on pandas >= 2.0, where reductions no longer silently
# skip the object (string) columns and raise a TypeError instead.
train_df[train_df.brake_type.isin(['disc', 'hydraulic', 'mechanical'])].mean(numeric_only=True)

price               2935.488913
model_year          2018.330544
titanium_frame         0.000000
carbon_frame           0.443676
carbon_handlebar       0.113636
carbon_crankset        0.070158
carbon_seatpost        0.103755
nonstandard_fork       0.072134
dtype: float64

In [69]:
# DATA INTEGRITY ISSUES - NEED TO CLEAN UP DATA PIPELINE
# The 'other' brake type seems to include framesets, as well as unnecessarily
# incomplete samples whose actual specs are available on the website.
train_df.loc[train_df['brake_type'] == 'other']

Unnamed: 0,bike_type,brand,price,frame_material,model_year,brake_type,fork_material,handlebar_material,fd_groupset,rd_groupset,...,crankset_groupset,seatpost_material,chain_groupset,shifter_groupset,titanium_frame,carbon_frame,carbon_handlebar,carbon_crankset,carbon_seatpost,nonstandard_fork
2046,ebike,Specialized,3449.99,,2019.0,other,,,,,...,,,,,0,0,0,0,0,0
1699,mountain,Santa Cruz,2799.99,carbon,2019.0,other,,,,sram nx,...,sram nx,,sram nx,sram nx,0,1,0,0,0,0
1825,mountain,Haro,329.99,,2017.0,other,,,,,...,,,,,0,0,0,0,0,0
910,track,Fuji,336.99,chromoly,2018.0,other,chromoly,alloy,,,...,,alloy,,,0,0,0,0,0,1
1535,road,Specialized,4124.99,carbon,2017.0,other,carbon,alloy,sram red etap,sram red etap,...,,carbon,,sram red etap,0,1,0,0,1,0
2061,bmx,Haro,1369.99,chromoly,2018.0,other,chromoly,chromoly,,,...,,,,,0,0,0,0,0,1
1616,road,Specialized,679.99,aluminum,2016.0,other,carbon,alloy,shimano sora,shimano sora,...,shimano sora,alloy,sram x9,shimano sora,0,0,0,0,0,0
676,cyclocross,Pivot,4499.00,carbon,,other,carbon,,shimano ultegra,shimano ultegra,...,praxis,,,shimano ultegra,0,1,0,0,0,0
1890,mountain,Specialized,2499.99,,2019.0,other,,,,,...,,,,,0,0,0,0,0,0
2193,bmx,Kink,699.99,chromoly,2019.0,other,chromoly,,,,...,,,,,0,0,0,0,0,1


#### Model Year Field

In [None]:
# Descriptive statistics of model_year in the training set
train_df['model_year'].describe()

In [None]:
# Most frequent model_year in the training set
train_df['model_year'].mode()

In [None]:
# Number of training samples per model_year
train_df['model_year'].value_counts()

The mean is about 2018, the median is 2018, and the mode is 2019. Let's set missing values to 2018.

In [None]:
# Training samples that have no model_year recorded
train_df.loc[train_df['model_year'].isnull()]