# Entry 25 notebook - Baseline Models

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

## Regression

In [2]:
# Labels for the eight whitespace-delimited fields held in column 0 of the file.
cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_yr', 'origin']

# The file is read tab-separated with no header row; column 0 is then split on
# whitespace so that each field lands in its own column.
df_raw = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
    sep='\t',
    header=None,
)
df_raw = df_raw[0].str.split(expand=True)
df_raw.columns = cols

# Discard rows whose horsepower is the '?' placeholder, along with any row that
# has a missing field, then convert the remaining string values to floats.
df_raw = df_raw.loc[df_raw['horsepower'] != '?'].dropna()
df = df_raw.astype(float)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_yr,origin
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


In [3]:
# Separate the mpg target from the remaining feature columns, then create a
# seeded train/test split so the partition is repeatable.
X = df.drop(columns='mpg')
y = df['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_yr,origin
244,4.0,90.0,48.0,1985.0,21.5,78.0,2.0
333,6.0,168.0,132.0,2910.0,11.4,80.0,3.0
305,4.0,151.0,90.0,2670.0,16.0,79.0,1.0
208,8.0,318.0,150.0,3940.0,13.2,76.0,1.0
371,4.0,135.0,84.0,2525.0,16.0,82.0,1.0


In [4]:
def make_reg_dummy(X_train, y_train, X_test, y_test, strategy):
    """Fit a dummy-regressor baseline and print its score on the test split.

    The baseline is a pipeline of median imputation, standard scaling and a
    ``DummyRegressor`` configured with ``strategy``.

    Parameters
    ----------
    X_train, y_train : training features and target used to fit the pipeline.
    X_test, y_test : held-out features and target used for scoring.
    strategy : value forwarded to ``DummyRegressor(strategy=...)``.

    Returns
    -------
    None. The result of the pipeline's ``score`` call is printed (R^2 under
    scikit-learn's regressor convention).
    """
    baseline = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        DummyRegressor(strategy=strategy),
    )
    # Pipeline.fit returns the fitted pipeline, so scoring can be chained.
    test_score = baseline.fit(X_train, y_train).score(X_test, y_test)
    print(strategy, 'scores:', test_score, '\n')

In [5]:
# Report the test-split score of each constant-prediction baseline.
for strategy in ('mean', 'median'):
    make_reg_dummy(X_train, y_train, X_test, y_test, strategy=strategy)

mean scores: -0.03701267059018276 

median scores: -0.09726879754329487 



## Classification

In [6]:
# Load the breast-cancer dataset bundle, keeping the label array as `target`
# and wrapping the feature matrix in a DataFrame labelled with feature names.
cancer = load_breast_cancer()
target = cancer['target']
df = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names'])

In [7]:
def make_class_dummy(X_train, y_train, X_test, y_test, strategy):
    """Fit a dummy-classifier baseline and print its score on the test split.

    The baseline is a pipeline of median imputation, standard scaling and a
    ``DummyClassifier`` configured with ``strategy`` and a fixed
    ``random_state=12`` so repeated runs are seeded identically.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the pipeline.
    X_test, y_test : held-out features and labels used for scoring.
    strategy : value forwarded to ``DummyClassifier(strategy=...)``.

    Returns
    -------
    None. The result of the pipeline's ``score`` call is printed (mean
    accuracy under scikit-learn's classifier convention).
    """
    baseline = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        DummyClassifier(strategy=strategy, random_state=12),
    )
    # Pipeline.fit returns the fitted pipeline, so scoring can be chained.
    test_score = baseline.fit(X_train, y_train).score(X_test, y_test)
    print(strategy, 'scores:', test_score, '\n')

In [8]:
# Seeded split that assigns 80% of the rows to the training set.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    train_size=0.8,
    random_state=12,
)

# Report the test-split score of each DummyClassifier strategy.
for strategy in ('stratified', 'most_frequent', 'prior', 'uniform'):
    make_class_dummy(X_train, y_train, X_test, y_test, strategy=strategy)

stratified scores: 0.5350877192982456 

most_frequent scores: 0.5789473684210527 

prior scores: 0.5789473684210527 

uniform scores: 0.45614035087719296 

