# Categorical

## Setup

In [3]:
import pandas as pd
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## Data

In [4]:
# Location of the Mario Kart CSV file
url = 'https://raw.githubusercontent.com/kirenz/datasets/master/mariokart.csv'

# Read the file and keep just the two columns used below: cond and total_pr
df = pd.read_csv(url).loc[:, ['cond', 'total_pr']]

df

Unnamed: 0,cond,total_pr
0,new,51.55
1,used,37.04
2,new,45.50
3,new,44.00
4,new,71.00
...,...,...
138,used,39.51
139,used,52.00
140,new,47.70
141,used,38.76


Note that the original dataset contains some Mario Kart games that were sold at prices above \$100. For this analysis, we limit our focus to games that were sold for less than \$100.

In [5]:
# Keep only the games whose total price is below 100
is_below_100 = df['total_pr'] < 100
df = df.loc[is_below_100]

In [6]:
# Summarise the filtered frame: number of entries, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 141 entries, 0 to 142
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   cond      141 non-null    object 
 1   total_pr  141 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.3+ KB


In [7]:
# Box plot of the total price for each condition.
# The encoding types are stated explicitly (Q = quantitative, N = nominal) and
# the axes and chart get readable titles so the figure can be understood on
# its own when the notebook is skimmed.
alt.Chart(df).mark_boxplot().encode(
    x=alt.X('total_pr:Q', title='Total price ($)'),
    y=alt.Y('cond:N', title='Condition')
).properties(
    title='Total price by condition'
)

In [8]:
# Convert categorical into binary numeric format.
# dtype=int keeps the indicator columns as 0/1 integers; without it, pandas 2.0
# and later return True/False booleans instead of numbers.
df = pd.get_dummies(df, dtype=int)
df

Unnamed: 0,total_pr,cond_new,cond_used
0,51.55,1,0
1,37.04,0,1
2,45.50,1,0
3,44.00,1,0
4,71.00,1,0
...,...,...,...
138,39.51,0,1
139,52.00,0,1
140,47.70,1,0
141,38.76,0,1


In [9]:
# we only keep condition new (1=new and 0=used)
# Selecting the wanted columns by name (instead of dropping cond_used) yields
# the same frame, and the cell can be re-run without raising a KeyError for a
# column that has already been removed.
df = df[['total_pr', 'cond_new']]
df

Unnamed: 0,total_pr,cond_new
0,51.55,1
1,37.04,0
2,45.50,1
3,44.00,1
4,71.00,1
...,...,...
138,39.51,0
139,52.00,0
140,47.70,1
141,38.76,0


In [10]:
# Split the frame into predictor and response, each kept as a one-column DataFrame
predictor_cols = ["cond_new"]
response_cols = ["total_pr"]
X, y = df[predictor_cols], df[response_cols]

## Model

### Fit the model

In [11]:
# Create the linear regression estimator and fit it in one step
# (fit returns the fitted estimator itself)
reg = LinearRegression().fit(X, y)

# Predictions for the same observations the model was fitted on
y_pred = reg.predict(X)

### Coefficients

In [12]:
# Intercept of the fitted line: the predicted total_pr when cond_new = 0 (used)
reg.intercept_

array([42.87109756])

In [13]:
# Slope for cond_new: the change in predicted total_pr when going from used (0) to new (1)
reg.coef_

array([[10.89958041]])

## Evaluation

### R-squared

In [14]:
# R-squared of the predictions, computed on the same data the model was fitted on
r2_score(y, y_pred)

0.3505528034705867

### RMSE

In [15]:
# RMSE is the square root of the mean squared error.
# The root is taken explicitly because the `squared` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# With the single target column used here the result is identical.
mean_squared_error(y, y_pred) ** 0.5

7.318444770500862