In [1]:
# Setup matplotlib to plot inline (within the notebook)
%matplotlib inline

# Import the pyplot module of Matplotlib as plt
import matplotlib.pyplot as plt

# Import pandas under the abbreviation 'pd'
import pandas as pd

# Import NumPy under the abbreviation 'np'
import numpy as np

In [4]:
animalspeed_df = pd.read_csv('animal_speeds.csv', delimiter=';')
animalspeed_df.head()

Unnamed: 0,animal,weight,movement_type,highspeed
0,House sparrow,0.04,flying,46.0
1,Pelican,7.8,flying,65.0
2,Vulture,11.0,flying,88.0
3,Pigeon,0.35,flying,80.0
4,Puffin,0.58,flying,88.0


In [5]:
animalspeed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   animal         159 non-null    object 
 1   weight         159 non-null    float64
 2   movement_type  159 non-null    object 
 3   highspeed      159 non-null    float64
dtypes: float64(2), object(2)
memory usage: 5.1+ KB


In [6]:
animalspeed_df.dtypes

animal            object
weight           float64
movement_type     object
highspeed        float64
dtype: object

In [7]:
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

In [8]:
categorical_features = ["animal", "movement_type"]

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

In [11]:

preprocessor = ColumnTransformer(
    transformers=[
        
        ("cat", categorical_transformer, categorical_features)])

In [12]:
from sklearn.linear_model import Ridge

from sklearn.svm import SVR

from sklearn.ensemble import RandomForestRegressor

In [13]:
regression_models = {"Ridge": Ridge(),
                     "SVR_linear": SVR(kernel="linear"),
                     "SVR_rbf": SVR(kernel="rbf"),
                     "RandomForestRegressor": RandomForestRegressor()}

regression_results = {}

In [15]:
animalspeed_df_X = animalspeed_df.drop("highspeed", axis=1)

animalspeed_df_y = animalspeed_df["highspeed"]

In [16]:
animalspeed_df_X_train, animalspeed_df_X_test, animalspeed_df_y_train, animalspeed_df_y_test = train_test_split(animalspeed_df_X,
                                                                    animalspeed_df_y,
                                                                    test_size=0.2,
                                                                    random_state=0)

animalspeed_df_X_train.shape, animalspeed_df_X_test.shape, animalspeed_df_y_train.shape, animalspeed_df_y_test.shape

((127, 3), (32, 3), (127,), (32,))

In [17]:
for model_name, model in regression_models.items():
    
    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                      ("model", model)])
    
    print(f"Fitting {model_name}...")
    model_pipeline.fit(animalspeed_df_X_train, animalspeed_df_y_train)

    print(f"Scoring {model_name}...")
    regression_results[model_name] = model_pipeline.score(animalspeed_df_X_test, 
                                                          animalspeed_df_y_test)

Fitting Ridge...
Scoring Ridge...
Fitting SVR_linear...
Scoring SVR_linear...
Fitting SVR_rbf...
Scoring SVR_rbf...
Fitting RandomForestRegressor...
Scoring RandomForestRegressor...


In [18]:
regression_results


{'Ridge': 0.28593024105551335,
 'SVR_linear': 0.22550740880110842,
 'SVR_rbf': 0.12725559553935728,
 'RandomForestRegressor': 0.3738118100248793}

In [19]:
from sklearn.metrics import mean_absolute_error

from sklearn.metrics import mean_squared_error


from sklearn.metrics import r2_score

In [20]:
ridge_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                 ("animal", Ridge())])

ridge_pipeline.fit(animalspeed_df_X_train, animalspeed_df_y_train)

animalspeed_df_y_preds = ridge_pipeline.predict(animalspeed_df_X_test)

# View the first 50 predictions
animalspeed_df_y_preds[:50]

array([87.73987098, 50.86963072, 50.86963072, 50.86963072, 50.86963072,
       38.43589201, 50.86963072, 38.43589201, 41.3213842 , 50.86963072,
       38.43589201, 38.43589201, 50.86963072, 37.00276622, 50.86963072,
       41.3213842 , 50.86963072, 38.43589201, 50.86963072, 87.73987098,
       50.86963072, 50.86963072, 50.86963072, 38.43589201, 50.86963072,
       87.73987098, 87.73987098, 50.86963072, 38.43589201, 38.43589201,
       87.73987098, 50.86963072])

In [21]:
mse = mean_squared_error(animalspeed_df_y_test, animalspeed_df_y_preds)
# Return the MSE
mse

849.20121271434

In [22]:
mae =  mean_absolute_error(animalspeed_df_y_test, animalspeed_df_y_preds)
# Return the MAE
mae

22.51028042665635

In [23]:
r2 = r2_score(animalspeed_df_y_test, animalspeed_df_y_preds)
# Return the R^2 score
r2

0.28593024105551335