# Categorical

## Setup

In [1]:
import pandas as pd
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import warnings

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)
# Turn off Altair's maximum-rows check on the data passed to charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Data

### Import data

In [2]:
# Mario Kart dataset, read straight from the kirenz/datasets GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/kirenz/datasets/master/mariokart.csv'
df = pd.read_csv(DATA_URL)

### Data structure

In [3]:
# Display the freshly loaded DataFrame (pandas shows a truncated preview).
df

Unnamed: 0,id,duration,n_bids,cond,start_pr,ship_pr,total_pr,ship_sp,seller_rate,stock_photo,wheels,title
0,150377422259,3,20,new,0.99,4.00,51.55,standard,1580,yes,1,~~ Wii MARIO KART &amp; WHEEL ~ NINTENDO Wii ~...
1,260483376854,7,13,used,0.99,3.99,37.04,firstClass,365,yes,1,Mariokart Wii Nintendo with wheel - Mario Kart...
2,320432342985,3,16,new,0.99,3.50,45.50,firstClass,998,no,1,Mario Kart Wii (Wii)
3,280405224677,3,18,new,0.99,0.00,44.00,standard,7,yes,1,Brand New Mario Kart Wii Comes with Wheel. Fre...
4,170392227765,1,20,new,0.01,0.00,71.00,media,820,yes,2,BRAND NEW NINTENDO 1 WII MARIO KART WITH 2 WHE...
...,...,...,...,...,...,...,...,...,...,...,...,...
138,110441497272,1,20,used,0.01,0.00,39.51,standard,7284,yes,0,Mario Kart Wii (Wii) Nintendo Wii game *--WOW ...
139,150376936435,7,9,used,17.99,0.00,52.00,parcel,121,no,2,Mario Kart Wii (Wii)
140,140349730405,3,14,new,0.99,8.70,47.70,priority,251,yes,1,"Wii Mario Kart game + wheel: NIB, factory sealed"
141,300352306018,7,13,used,1.00,4.90,38.76,parcel,41,no,0,Mario Kart Wii


In [4]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           143 non-null    int64  
 1   duration     143 non-null    int64  
 2   n_bids       143 non-null    int64  
 3   cond         143 non-null    object 
 4   start_pr     143 non-null    float64
 5   ship_pr      143 non-null    float64
 6   total_pr     143 non-null    float64
 7   ship_sp      143 non-null    object 
 8   seller_rate  143 non-null    int64  
 9   stock_photo  143 non-null    object 
 10  wheels       143 non-null    int64  
 11  title        142 non-null    object 
dtypes: float64(3), int64(5), object(4)
memory usage: 13.5+ KB


### Data corrections

In [5]:
# Keep just the predictor (cond) and the response (total_pr).
keep_cols = ['cond', 'total_pr']
df = df.loc[:, keep_cols]

In [6]:
# Display the reduced DataFrame to confirm only cond and total_pr remain.
df

Unnamed: 0,cond,total_pr
0,new,51.55
1,used,37.04
2,new,45.50
3,new,44.00
4,new,71.00
...,...,...
138,used,39.51
139,used,52.00
140,new,47.70
141,used,38.76


Note that the original dataset contains some Mario Kart games that sold at prices above $\$100$; for this analysis, we limit our focus to games that sold for less than $\$100$.

In [7]:
# Drop listings whose total price is 100 or more.
below_limit = df['total_pr'].lt(100)
df = df.loc[below_limit]

In [8]:
# Re-check row count and dtypes after the price filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 141 entries, 0 to 142
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   cond      141 non-null    object 
 1   total_pr  141 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.3+ KB


In [9]:
# Snapshot the filtered frame while `cond` is still a string column;
# the boxplot further down plots this copy rather than the encoded frame.
df_orig = df.copy(deep=True)

In [10]:
# Convert categorical into binary numeric format.
# dtype=int keeps the indicator columns as 0/1 integers; pandas >= 2.0
# otherwise defaults to boolean (True/False) dummy columns.
df = pd.get_dummies(df, dtype=int)

In [11]:
# Preview the first rows to verify the new cond_new / cond_used indicator columns.
df.head()

Unnamed: 0,total_pr,cond_new,cond_used
0,51.55,1,0
1,37.04,0,1
2,45.5,1,0
3,44.0,1,0
4,71.0,1,0


In [12]:
# Keep a single indicator for condition: cond_new (1 = new, 0 = used).
# cond_used carries the same information, so it is removed.
df = df.drop('cond_used', axis=1)

In [13]:
# Preview the modelling frame: total_pr plus the single cond_new indicator.
df.head()

Unnamed: 0,total_pr,cond_new
0,51.55,1
1,37.04,0
2,45.5,1
3,44.0,1
4,71.0,1


### Variable lists

In [14]:
# Split the frame into predictor and response.
# Both are selected with a column list, so each stays a one-column DataFrame.
feature_cols = ["cond_new"]
target_cols = ["total_pr"]
X = df.loc[:, feature_cols]
y = df.loc[:, target_cols]

## Analysis

In [15]:
# Boxplots of total price (x) for each condition category (y).
# Plots the pre-encoding copy, which still has the string-valued `cond` column.
price_axis = alt.X('total_pr')
condition_axis = alt.Y('cond')
alt.Chart(df_orig).mark_boxplot().encode(x=price_axis, y=condition_axis)

## Model

### Select model

In [16]:
# Ordinary least squares linear regression; the intercept is estimated
# (fit_intercept=True is scikit-learn's default, spelled out here).
reg = LinearRegression(fit_intercept=True)

### Fit the model

In [None]:
# Estimate intercept and slope of total_pr on cond_new.
reg.fit(X=X, y=y)

### Coefficients

In [17]:
# Fitted intercept: the predicted total_pr when cond_new = 0 (used games).
reg.intercept_

array([42.87109756])

In [18]:
# Fitted slope for cond_new: the predicted difference in total_pr
# between new (cond_new = 1) and used (cond_new = 0) games.
reg.coef_

array([[10.89958041]])

### Make predictions

In [None]:
# In-sample predictions: score the same observations the model was fitted on.
y_pred = reg.predict(X=X)

### Evaluation

#### R-squared

In [19]:
# Coefficient of determination of the in-sample predictions.
r2_score(y_true=y, y_pred=y_pred)

0.3505528034705867

#### RMSE

In [20]:
# RMSE = square root of the mean squared error.
# The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the root is taken explicitly; this runs on every version.
mean_squared_error(y, y_pred) ** 0.5

7.318444770500862