In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDRegressor

In this exercise I'm exploring 2 types of linear models, one regression, one classification.

While regression is what you typically think of for linear models, they can also be used
effectively in classification problems.

Here's the to-do list of this notebook:

1. Load the wine dataset from sklearn.
2. For the wine dataset, create a train and test split, 80% train / 20% test.
3. Create a LogisticRegression model with these hyperparameters: random_state=0, max_iter=10000.
4. Evaluate the model with the test dataset.
5. Load the diabetes dataset from sklearn.
6. For the diabetes dataset, create a train and test split, 80% train / 20% test.
7. Create an SGDRegressor model with these hyperparameters: random_state=0, max_iter=10000.
8. Evaluate the model with the test dataset.

Linear Classifier

In [2]:
# Fetch the wine dataset that ships with sklearn.
wine = datasets.load_wine()

# Build the frame in one expression: one column per entry of `feature_names`,
# with the label appended as a trailing `target` column.
df_wine = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

In [3]:
# Display the assembled wine frame to check the feature columns and the appended `target`.
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [9]:
# Hold out 20% of the wine rows for testing and keep the remaining 80% for training.
# The fixed `random_state` makes the shuffle repeatable between runs.
df_wine_train, df_wine_test = train_test_split(
    df_wine,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Separate the predictors from the label once per split so the fit and score
# calls below stay short.
wine_X_train = df_wine_train.drop(columns="target")
wine_y_train = df_wine_train["target"]
wine_X_test = df_wine_test.drop(columns="target")
wine_y_test = df_wine_test["target"]

# Hyperparameters follow the notebook requirements: fixed seed, max_iter=10000.
clf = LogisticRegression(random_state=0, max_iter=10000)
clf.fit(wine_X_train, wine_y_train)

# Score on the held-out test split, not on the rows used for fitting.
clf.score(wine_X_test, wine_y_test)

0.9722222222222222

Linear Regression

In [6]:
# Load the diabetes dataset from sklearn; its `data`, `feature_names` and
# `target` entries are read when the dataframe is built further down.

diabetes = datasets.load_diabetes()

In [7]:
# Build the diabetes frame in one expression: one column per entry of
# `feature_names`, with the regression label appended as a trailing `target` column.
df_diabetes = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

In [8]:
# Display the assembled diabetes frame to check the feature columns and the appended `target`.
df_diabetes

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [11]:
# Hold out 20% of the diabetes rows for testing and keep the remaining 80% for training.
# The fixed `random_state` makes the shuffle repeatable between runs.
df_diabetes_train, df_diabetes_test = train_test_split(
    df_diabetes,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Separate the predictors from the label once per split so the fit and score
# calls below stay short.
diabetes_X_train = df_diabetes_train.drop(columns="target")
diabetes_y_train = df_diabetes_train["target"]
diabetes_X_test = df_diabetes_test.drop(columns="target")
diabetes_y_test = df_diabetes_test["target"]

# Hyperparameters follow the notebook requirements: fixed seed, max_iter=10000.
reg = SGDRegressor(random_state=0, max_iter=10000)
reg.fit(diabetes_X_train, diabetes_y_train)

# Score on the held-out test split, not on the rows used for fitting.
reg.score(diabetes_X_test, diabetes_y_test)

0.3484895912801911

In [1]:
# Standalone scratch expression: pairs each integer from 1 to 4 with its successor.
# It uses nothing defined earlier in the notebook.
[(start, start + 1) for start in range(1, 5)]

[(1, 2), (2, 3), (3, 4), (4, 5)]