# Supervised Learning Exercises

In [2]:
# Import the necessary Libraries
# Pandas, Numpy
# SKLearn
# - StandardScaler
# - train_test_split, GridSearchCV
# - confusion_matrix, classification_report, mean_squared_error
# - For the Following import both Regression and Classification
# - Linear
# - K Neighbors
# - Decision Tree
# - Random Forest
# - Support Vector
# - Datasets: fetch_california_housing, load_iris

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC

from sklearn.datasets import fetch_california_housing, load_iris







# Type your code above this line

### Regression

In [3]:
# Load the fetch_california_housing data into a DataFrame and add the Target to the the DataFrame


data = fetch_california_housing()

df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()
# data
df[data.target_names[0]] = data.target

df.info()


df[df.isna().any(axis=1)]
# Type your code above this line

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal


In [4]:
# Train Test and Split the data with Standard Scaler
X = df[df.columns[:-1]]
y = df['MedHouseVal']

# Standard scaler

scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)
X = pd.DataFrame(scaled_X, columns=X.columns)

X.head()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


# Type your code above this line

In [7]:
# Create a dictionary with all the algorithms and the necessary tuning parameters called regression_dictionary
# Linear - None
# Neighbors - values 1 to 20
# Decision Tree and Random Forest - Max Depth or 4 and 5
# Support Vector - None

regression_dictionary = {
    
    'linear': {'algorithm': LinearRegression, 'params':{} },
    'neighbor':{'algorithm': KNeighborsRegressor, 'params':{'n_neighbors':range(1,20)} },
    'tree':{'algorithm': DecisionTreeRegressor, 'params':{'max_depth':[4,5]} },
    'random_forest':{'algorithm': RandomForestRegressor, 'params':{} },
    'support_vector':{'algorithm': SVR, 'params':{}},

}






# Type your code above this line

In [6]:
# Iterate through the regression_dictionary using GridSearchCV
# Fit, predict and evaluate each model


for name, algorithm in regression_dictionary.items():
    print('\n',name.title())
    grid = GridSearchCV(algorithm['algorithm'](), param_grid=algorithm['params'], verbose=3)
    grid.fit(X_train, y_train)
    pred_reg = grid.predict(X_test)
    
    print('Evaluation')
    print('Score', grid.score(X_test, y_test))
    print('RMSE', np.sqrt(mean_squared_error(y_test, pred_reg)))







# Type your code above this line


 Linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..................................., score=0.605 total time=   0.0s
[CV 2/5] END ..................................., score=0.603 total time=   0.0s
[CV 3/5] END ..................................., score=0.617 total time=   0.0s
[CV 4/5] END ..................................., score=0.611 total time=   0.0s
[CV 5/5] END ..................................., score=0.597 total time=   0.0s
Evaluation
Score 0.5957702326061664
RMSE 0.7284008391515452

 Neighbor
Fitting 5 folds for each of 19 candidates, totalling 95 fits
[CV 1/5] END .....................n_neighbors=1;, score=0.530 total time=   0.1s
[CV 2/5] END .....................n_neighbors=1;, score=0.506 total time=   0.1s
[CV 3/5] END .....................n_neighbors=1;, score=0.491 total time=   0.1s
[CV 4/5] END .....................n_neighbors=1;, score=0.512 total time=   0.1s
[CV 5/5] END .....................n_neighbors=1;, score=0.513 total tim

[CV 5/5] END ....................n_neighbors=19;, score=0.686 total time=   0.3s
Evaluation
Score 0.6894584983687466
RMSE 0.6384341595119768

 Tree
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END .......................max_depth=4;, score=0.572 total time=   0.0s
[CV 2/5] END .......................max_depth=4;, score=0.571 total time=   0.0s
[CV 3/5] END .......................max_depth=4;, score=0.585 total time=   0.0s
[CV 4/5] END .......................max_depth=4;, score=0.579 total time=   0.0s
[CV 5/5] END .......................max_depth=4;, score=0.570 total time=   0.0s
[CV 1/5] END .......................max_depth=5;, score=0.618 total time=   0.0s
[CV 2/5] END .......................max_depth=5;, score=0.618 total time=   0.1s
[CV 3/5] END .......................max_depth=5;, score=0.622 total time=   0.1s
[CV 4/5] END .......................max_depth=5;, score=0.611 total time=   0.1s
[CV 5/5] END .......................max_depth=5;, score=0.607 t

### Classification

In [9]:
# Load the load_iris data into a DataFrame and add the Target to the the DataFrame

data = load_iris()

df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()
# data
df['Target'] = data.target

df.info()


df[df.isna().any(axis=1)]



# Type your code above this line

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Target


In [10]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
df['Target Names'] = df['Target'].map({
    0: data.target_names[0] ,
    1: data.target_names[1],
    2: data.target_names[2]
    
})

In [12]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Target,Target Names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [13]:
df['Target Names'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: Target Names, dtype: int64

In [14]:
# Train Test and Split the data with Standard Scaler

X = df[df.columns[:-2]]
y = df['Target']

# Standard scaler

scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)
X = pd.DataFrame(scaled_X, columns=X.columns)

X.head()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)





# Type your code above this line

In [15]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [17]:
# Create a dictionary with all the algorithms and the necessary tuning parameters called classification_dictionary
# Linear - None
# Neighbors - values 1 to 20
# Decision Tree and Random Forest - Max Depth or 4 and 5
# Support Vector - C values of 1000, 10000000, gamma values of auto and scale

classification_dictionary = {
    
    'linear': {'algorithm': LogisticRegression, 'params':{} },
    'neighbor':{'algorithm': KNeighborsClassifier, 'params':{'n_neighbors':range(1,20)} },
    'tree':{'algorithm': DecisionTreeClassifier, 'params':{'max_depth':[4,5]} },
    'random_forest':{'algorithm': RandomForestClassifier, 'params':{'max_depth':[4,5]} },
    'support_vector':{'algorithm': SVC, 'params':{'C':[1000,10000000], 'gamma':['auto','scale']}},

}






# Type your code above this line

In [19]:
# Interate through the regression_dictionary using GridSearchCV
# Fit, predict and evaluate each model

for name, algorithm in classification_dictionary.items():
    print('\n', name.title())
    
    grid = GridSearchCV(algorithm['algorithm'](), param_grid=algorithm['params'], verbose=3)
    
    grid.fit(X_train, y_train)
    
    pred_class = grid.predict(X_test)
    
    
    
    print(confusion_matrix(y_test, pred_class))
    print(classification_report(y_test, pred_class))




# Type your code above this line


 Linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..................................., score=0.952 total time=   0.0s
[CV 2/5] END ..................................., score=0.905 total time=   0.0s
[CV 3/5] END ..................................., score=0.905 total time=   0.0s
[CV 4/5] END ..................................., score=1.000 total time=   0.0s
[CV 5/5] END ..................................., score=0.952 total time=   0.0s
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45


 Neighbor
Fitting 5 folds for each of 19 candidates, totalling 95 fits
[CV 1/5] END .....................n_n

[CV 1/5] END .......................max_depth=4;, score=0.952 total time=   0.1s
[CV 2/5] END .......................max_depth=4;, score=0.905 total time=   0.1s
[CV 3/5] END .......................max_depth=4;, score=0.905 total time=   0.1s
[CV 4/5] END .......................max_depth=4;, score=1.000 total time=   0.2s
[CV 5/5] END .......................max_depth=4;, score=0.952 total time=   0.2s
[CV 1/5] END .......................max_depth=5;, score=0.952 total time=   0.1s
[CV 2/5] END .......................max_depth=5;, score=0.905 total time=   0.1s
[CV 3/5] END .......................max_depth=5;, score=0.905 total time=   0.1s
[CV 4/5] END .......................max_depth=5;, score=1.000 total time=   0.1s
[CV 5/5] END .......................max_depth=5;, score=0.952 total time=   0.1s
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00 