# Classification vs Regression ML Models

## Data loading

In [6]:
import pandas as pd
import seaborn as sns

# Load seaborn's `mpg` sample dataset with the car `name` column as the index,
# then discard every row that contains a missing value.
df_mpg = (
    sns.load_dataset('mpg', index_col='name')
    .dropna()
)
df_mpg

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
...,...,...,...,...,...,...,...,...
ford ranger,28.0,4,120.0,79.0,2625,18.6,82,usa
chevy s-10,31.0,4,119.0,82.0,2720,19.4,82,usa


## Feature selection

Create two target variables:

1. `target_categorical`
2. `target_numerical`

In [7]:
# Categorical target for the classification models: the `origin` column.
target_categorical = df_mpg['origin']
# Numerical target for the regression models: the `mpg` column.
target_numerical = df_mpg['mpg']

And select the explanatory variables:

In [8]:
# Every column except the two targets is used as a feature.
explanatory = df_mpg.drop(['mpg', 'origin'], axis='columns')

## Machine Learning System

K Nearest Neighbors

### ML classification model

#### Fit the mathematical equation

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier with default settings and train it
# to predict `origin` from the explanatory columns. `fit` returns the
# estimator itself, so the fitted model is both stored and displayed.
# NOTE(review): the features are used unscaled — confirm this is intended,
# since distance-based models are sensitive to column magnitudes.
model_kn_c = KNeighborsClassifier().fit(explanatory, target_categorical)
model_kn_c

#### Calculate predictions

In [10]:
# Predicted `origin` label for each row of the data the classifier was fitted on.
model_kn_c.predict(explanatory)

array(['usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa',
       'usa', 'usa', 'usa', 'usa', 'usa', 'japan', 'usa', 'usa', 'usa',
       'japan', 'europe', 'europe', 'europe', 'japan', 'europe', 'usa',
       'usa', 'usa', 'usa', 'usa', 'japan', 'japan', 'europe', 'usa',
       'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa',
       'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'japan', 'japan',
       'europe', 'europe', 'japan', 'japan', 'europe', 'japan', 'japan',
       'usa', 'japan', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa',
       'usa', 'usa', 'usa', 'usa', 'japan', 'usa', 'usa', 'usa', 'usa',
       'europe', 'europe', 'europe', 'europe', 'usa', 'japan', 'japan',
       'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa',
       'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa', 'usa',
       'usa', 'japan', 'usa', 'usa', 'usa', 'usa', 'usa', 'japan', 'usa',
       'japan', 'japan', 'usa', 'usa', 'japan', 'usa', 'usa', 'europe',
    

#### Compare predictions to reality

In [11]:
# Mean accuracy of the classifier, measured on the same rows it was trained on
# (so this is an in-sample figure, not an estimate of out-of-sample accuracy).
model_kn_c.score(explanatory, target_categorical)

0.8214285714285714

#### Compare predictions to reality in a `DataFrame`

In [12]:
# Put the true `origin` next to the KNN classifier's prediction, row by row.
df_pred_classification = target_categorical.to_frame().assign(
    prediction_classification=model_kn_c.predict(explanatory),
)
df_pred_classification

Unnamed: 0_level_0,origin,prediction_classification
name,Unnamed: 1_level_1,Unnamed: 2_level_1
chevrolet chevelle malibu,usa,usa
buick skylark 320,usa,usa
...,...,...
ford ranger,usa,usa
chevy s-10,usa,japan


### ML regression model

#### Fit the mathematical equation

In [13]:
from sklearn.neighbors import KNeighborsRegressor

# Build a k-nearest-neighbours regressor with default settings and train it
# to predict `mpg` from the explanatory columns. `fit` returns the estimator
# itself, so the fitted model is both stored and displayed.
model_kn_r = KNeighborsRegressor().fit(explanatory, target_numerical)
model_kn_r

#### Calculate predictions

In [14]:
# Predicted `mpg` value for each row of the data the regressor was fitted on.
model_kn_r.predict(explanatory)

array([18.24, 15.28, 17.04, 17.04, 17.04, 14.5 , 14.5 , 14.9 , 13.8 ,
       16.84, 15.64, 15.44, 16.52, 20.22, 23.8 , 23.54, 22.12, 24.12,
       27.  , 29.66, 24.88, 27.78, 26.2 , 24.  , 24.16, 12.8 , 13.48,
       13.48, 12.4 , 27.  , 28.44, 24.  , 23.12, 19.48, 18.58, 17.8 ,
       18.2 , 14.8 , 12.8 , 14.8 , 14.7 , 11.8 , 12.8 , 12.  , 22.4 ,
       23.  , 18.44, 17.8 , 25.54, 27.82, 33.04, 33.04, 34.24, 34.24,
       29.66, 29.7 , 25.04, 28.6 , 28.24, 21.78, 24.98, 13.7 , 14.3 ,
       14.9 , 14.3 , 14.92, 12.8 , 13.4 , 13.2 , 13.6 , 29.76, 16.3 ,
       14.7 , 13.8 , 14.7 , 21.24, 26.02, 25.34, 31.6 , 27.18, 24.8 ,
       25.02, 29.  , 28.32, 14.  , 14.92, 15.3 , 14.8 , 16.64, 11.8 ,
       13.4 , 15.28, 14.8 , 13.2 , 11.8 , 15.64, 19.8 , 18.44, 19.04,
       24.16, 22.48, 30.92, 11.8 , 11.8 , 13.2 , 13.4 , 21.58, 24.8 ,
       23.  , 26.2 , 25.  , 25.94, 26.64, 26.04, 15.  , 15.3 , 34.88,
       30.5 , 25.1 , 22.4 , 17.04, 26.68, 22.36, 14.2 , 20.3 , 21.08,
       18.58, 29.1 ,

#### Compare predictions to reality

In [15]:
# Coefficient of determination (R²) of the regressor, measured on the same
# rows it was trained on (an in-sample figure).
model_kn_r.score(explanatory, target_numerical)

0.8171378313866275

#### Compare predictions to reality in a `DataFrame`

In [16]:
# Put the true `mpg` next to the KNN regressor's prediction, row by row.
df_pred_regression = target_numerical.to_frame().assign(
    prediction_regression=model_kn_r.predict(explanatory),
)
df_pred_regression

Unnamed: 0_level_0,mpg,prediction_regression
name,Unnamed: 1_level_1,Unnamed: 2_level_1
chevrolet chevelle malibu,18.0,18.24
buick skylark 320,15.0,15.28
...,...,...
ford ranger,28.0,29.50
chevy s-10,31.0,26.58


#### Join regression and classification `DataFrame`

In [17]:
# Place the classification and regression comparison tables side by side.
pd.concat(
    [df_pred_classification, df_pred_regression],
    axis='columns',
)

Unnamed: 0_level_0,origin,prediction_classification,mpg,prediction_regression
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chevrolet chevelle malibu,usa,usa,18.0,18.24
buick skylark 320,usa,usa,15.0,15.28
...,...,...,...,...
ford ranger,usa,usa,28.0,29.50
chevy s-10,usa,japan,31.0,26.58


## Other algorithms

### Decision Tree

#### Regression 

In [18]:
from sklearn.tree import DecisionTreeRegressor
# Seed the estimator: scikit-learn permutes the candidate features at every
# split, so without a fixed `random_state` tie-breaking between equally good
# splits — and hence the fitted tree — can differ between runs.
model_dt_r = DecisionTreeRegressor(random_state=42)

# Train on the explanatory columns to predict `mpg`.
model_dt_r.fit(X=explanatory, y=target_numerical)
# Kept to mirror the fit -> predict -> score workflow; only the last
# expression of a cell is displayed, so this result is not shown.
model_dt_r.predict(X=explanatory)
# R² on the training rows themselves (in-sample).
model_dt_r.score(X=explanatory, y=target_numerical)

1.0

#### Classification

In [19]:
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator: scikit-learn permutes the candidate features at every
# split, so without a fixed `random_state` tie-breaking between equally good
# splits — and hence the fitted tree — can differ between runs.
model_dt_c = DecisionTreeClassifier(random_state=42)

# Train on the explanatory columns to predict `origin`.
model_dt_c.fit(X=explanatory, y=target_categorical)
# Kept to mirror the fit -> predict -> score workflow; only the last
# expression of a cell is displayed, so this result is not shown.
model_dt_c.predict(X=explanatory)
# Mean accuracy on the training rows themselves (in-sample).
model_dt_c.score(X=explanatory, y=target_categorical)

1.0

### Random Forest

#### Regression 

In [20]:
from sklearn.ensemble import RandomForestRegressor
# A random forest is stochastic, so fix `random_state` to make the fitted
# forest and the score below identical on every Restart & Run All.
model_rf_r = RandomForestRegressor(random_state=42)

# Train on the explanatory columns to predict `mpg`.
model_rf_r.fit(X=explanatory, y=target_numerical)
# Kept to mirror the fit -> predict -> score workflow; only the last
# expression of a cell is displayed, so this result is not shown.
model_rf_r.predict(X=explanatory)
# R² on the training rows themselves (in-sample).
model_rf_r.score(X=explanatory, y=target_numerical)

0.9833286252204423

#### Classification

In [21]:
from sklearn.ensemble import RandomForestClassifier
# A random forest is stochastic, so fix `random_state` to make the fitted
# forest and the score below identical on every Restart & Run All.
model_rf_c = RandomForestClassifier(random_state=42)

# Train on the explanatory columns to predict `origin`.
model_rf_c.fit(X=explanatory, y=target_categorical)
# Kept to mirror the fit -> predict -> score workflow; only the last
# expression of a cell is displayed, so this result is not shown.
model_rf_c.predict(X=explanatory)
# Mean accuracy on the training rows themselves (in-sample).
model_rf_c.score(X=explanatory, y=target_categorical)

1.0

## Compare all models in a `DataFrame`

### Regression models

In [22]:
# Collect every fitted regressor, its class name, and its predictions on the
# explanatory data. Lists (not one-shot `map` iterators) are used so the names
# can be inspected or reused after the DataFrame is built.
list_model_regression = [model_kn_r, model_dt_r, model_rf_r]
list_model_regression_names = [type(model).__name__ for model in list_model_regression]
list_model_regression_predictions = [model.predict(X=explanatory) for model in list_model_regression]

# One column per model, indexed like the rows the predictions were computed
# from. The previous version used `df.index`, but `df` is never defined in
# this notebook, so the cell failed with NameError on a fresh kernel.
df_pred_regression = pd.DataFrame(
    dict(zip(list_model_regression_names, list_model_regression_predictions)),
    index=explanatory.index)

# Put the true values first so each prediction can be compared against them.
df_pred_regression.insert(0, 'target_numerical', target_numerical)
df_pred_regression

Unnamed: 0_level_0,target_numerical,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chevrolet chevelle malibu,18.0,18.24,18.0,17.115
buick skylark 320,15.0,15.28,15.0,14.790
...,...,...,...,...
ford ranger,28.0,29.50,28.0,28.973
chevy s-10,31.0,26.58,31.0,30.213


### Classification models

In [23]:
# Collect every fitted classifier, its class name, and its predictions on the
# explanatory data. Lists (not one-shot `map` iterators) are used so the names
# can be inspected or reused after the DataFrame is built.
list_model_classification = [model_kn_c, model_dt_c, model_rf_c]
list_model_classification_names = [type(model).__name__ for model in list_model_classification]
list_model_classification_predictions = [model.predict(X=explanatory) for model in list_model_classification]

# One column per model, indexed like the rows the predictions were computed
# from. The previous version used `df.index`, but `df` is never defined in
# this notebook, so the cell failed with NameError on a fresh kernel.
df_pred_classification = pd.DataFrame(
    dict(zip(list_model_classification_names, list_model_classification_predictions)),
    index=explanatory.index)

# Put the true labels first so each prediction can be compared against them.
df_pred_classification.insert(0, 'target_categorical', target_categorical)
df_pred_classification

Unnamed: 0_level_0,target_categorical,KNeighborsClassifier,DecisionTreeClassifier,RandomForestClassifier
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chevrolet chevelle malibu,usa,usa,usa,usa
buick skylark 320,usa,usa,usa,usa
...,...,...,...,...
ford ranger,usa,usa,usa,usa
chevy s-10,usa,japan,usa,usa
