# Categorical

## Setup

In [None]:
import pandas as pd
import altair as alt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import warnings

# Hide FutureWarning messages so they do not clutter the notebook output
warnings.simplefilter('ignore', category=FutureWarning)
# Turn off Altair's maximum-rows check for chart data
alt.data_transformers.disable_max_rows()

## Data

### Import data

In [None]:
# Load the Mario Kart dataset directly from the GitHub-hosted CSV file
df = pd.read_csv(
    'https://raw.githubusercontent.com/kirenz/datasets/master/mariokart.csv'
)

### Data structure

In [None]:
# Display the dataframe for a first look at the imported data
df

In [None]:
# Print a summary of the columns, their dtypes and non-null counts
df.info()

### Data corrections

In [None]:
# Restrict the dataframe to the two columns used in this analysis:
# the condition (cond) and the total price (total_pr)
df = df.loc[:, ['cond', 'total_pr']]

In [None]:
# Display the dataframe again; only cond and total_pr should remain
df

Note that the original dataset contains some Mario Kart games that were sold at prices above $\$100$, but for this analysis we will limit our focus to Mario Kart games that were sold below $\$100$.

In [None]:
# Keep only the rows whose total price is strictly less than 100
df = df.loc[df['total_pr'] < 100]

In [None]:
# Re-check the row count and dtypes after the price filter
df.info()

In [None]:
# Keep an independent snapshot of the filtered data, still holding the
# original cond column, before df is dummy-encoded; the boxplot in the
# Analysis section is drawn from this copy
df_orig = df.copy(deep=True)

In [None]:
# Convert categorical into binary numeric format.
# dtype=int forces 0/1 indicator columns: recent pandas releases (2.0 and
# later) otherwise create True/False columns, which would not match the
# "1=new and 0=used" coding this notebook describes.
df = pd.get_dummies(df, dtype=int)

In [None]:
# Inspect the first rows to see the indicator columns created from cond
df.head()

In [None]:
# Keep a single indicator column, cond_new (1=new and 0=used);
# cond_used is redundant next to it, so remove that column
df = df.drop(columns='cond_used')

In [None]:
# Confirm that cond_used is gone and only total_pr and cond_new remain
df.head()

### Variable lists

In [None]:
# Feature matrix (the cond_new indicator) and response (total price),
# both kept as single-column DataFrames
X = df.loc[:, ["cond_new"]]
y = df.loc[:, ["total_pr"]]

## Analysis

In [None]:
# Boxplot of total price with one box per condition level, drawn from the
# copy that still contains the original cond column
alt.Chart(df_orig).mark_boxplot().encode(
    x=alt.X('total_pr'),
    y=alt.Y('cond'),
)

## Model

### Select model

In [None]:
# linear regression model (ordinary least squares with default settings)
reg = LinearRegression()

### Fit the model

In [None]:
# Estimate the intercept and slope from the cond_new indicator (X)
# and the total price (y)
reg.fit(X=X, y=y)

### Coefficients

In [None]:
# Intercept: the model's predicted total price when cond_new is 0 (used)
reg.intercept_

In [None]:
# Slope for cond_new: the difference in predicted total price between
# new (1) and used (0) games
reg.coef_

### Make predictions

In [None]:
# Make predictions
# (in-sample: these are fitted values for the same X the model was fit on)
y_pred = reg.predict(X)

### Evaluation

#### R-squared

In [None]:
# Coefficient of determination, computed on the same data used for fitting
r2_score(y_true=y, y_pred=y_pred)

#### RMSE

In [None]:
# RMSE: take the square root of the mean squared error explicitly.
# The former `squared=False` shortcut was deprecated in scikit-learn 1.4 and
# removed in 1.6, so this form works on both older and current releases.
mean_squared_error(y, y_pred) ** 0.5