In [1]:
import numpy as np
import pandas as pd
import sklearn

print(f'NumPy version: {np.__version__}')
print(f'Pandas version: {pd.__version__}')
print(f'Sklearn version: {sklearn.__version__}')

NumPy version: 1.19.2
Pandas version: 1.1.3
Sklearn version: 0.23.2


In [2]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LogisticRegression, ARDRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import warnings
warnings.filterwarnings('ignore')  
from sklearn import preprocessing

from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV


In [3]:
def make_df(df:pd.DataFrame, scores:dict, model_name:str, search_strategy:str, index:int):
    """
    This function prints into DataFrame the scores values, search_strategy and regression model for a certain data set.
    :param df:in this DataFrame we eill insert the values
    :param scores:the calculated scores
    :param model_name:the name of the regression model
    :param search_strategy: the search strategy used
    :index: line number
    """
    
    df.iloc[index, 0] = model_name
    df.iloc[index, 1] = search_strategy
    df.iloc[index, 2] = np.mean(scores['test_neg_mean_absolute_error'])
    df.iloc[index, 3] = np.mean(scores['test_neg_mean_squared_error'])
    df.iloc[index, 4] = np.mean(scores['test_neg_median_absolute_error'])
    df.iloc[index, 5] = np.mean(scores['train_neg_mean_absolute_error'])
    df.iloc[index, 6] = np.mean(scores['train_neg_mean_squared_error'])
    df.iloc[index, 7] = np.mean(scores['train_neg_median_absolute_error'])
    df.iloc[index, 8] = np.mean(scores['fit_time'])
    df.iloc[index, 9] = np.mean(scores['score_time'])


## CPU Computer Hardware

1. [CPU Computer Hardware](https://archive.ics.uci.edu/ml/datasets/Computer+Hardware); excludeti din dataset coloanele: vendor name, model name, estimated relative performance; se va estima coloana "published relative performance".

In [4]:
CPU_data: pd.DataFrame = pd.read_csv("data/machine.data", sep=",", header=None)
CPU_data = CPU_data.drop([0, 1, 9], axis = 1)

empty:np.ndarray = np.zeros(10)
CPU_df = pd.DataFrame({
    'Model':list(empty), 
    'Search_strategy': list(empty), 
    'test_neg_mean_absolute_error': list(empty), 
    'test_neg_mean_squared_error': list(empty), 
    'test_neg_median_absolute_error': list(empty),
    'train_neg_mean_absolute_error': list(empty),
    'train_neg_mean_squared_error': list(empty),
    'train_neg_median_absolute_error': list(empty),
    'fit_time': list(empty),
    'score_time': list(empty)
})

CPU_x: np.ndarray = CPU_data.iloc[:, :-1].values
CPU_y: np.ndarray = CPU_data.iloc[:, -1].values

CPU_data.head()

Unnamed: 0,2,3,4,5,6,7,8
0,125,256,6000,256,16,128,198
1,29,8000,32000,32,8,32,269
2,29,8000,32000,32,8,32,220
3,29,8000,32000,32,8,32,172
4,29,8000,16000,32,8,16,132


- #### LiniarRegression

In [5]:
model = LinearRegression()
param_dict:dict = {"fit_intercept":[True, False], "normalize":[True, False], "n_jobs":[-1, None, 3, 5]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(CPU_x, CPU_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression = LinearRegression(**best_param)
scores1:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores1, 'LinearRegression', 'GridSearchCV', 0)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(CPU_x, CPU_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression= LinearRegression(**best_param)
scores2:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores2, 'LinearRegression', 'RandomizedSearchCV', 1)

print("The intermediate data frame looks like this:")
CPU_df.head(n=10)

Best parameters using GridSearchCV:  {'fit_intercept': True, 'n_jobs': -1, 'normalize': True}
Best parameters using RansomizedSearchCV:  {'normalize': True, 'n_jobs': 5, 'fit_intercept': True}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.001643,0.0
1,LinearRegression,RandomizedSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.0016,0.0
2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### DecisionTreeRegressor

In [6]:
model = DecisionTreeRegressor()
param_dict:dict = {'criterion':['mse', 'friedman_mse', 'mae'],'splitter':['best', 'random'], 'max_depth': [1, 2, 3, 4, 5],
              'min_samples_leaf':[1, 2, 3, 4, 5], 'max_features':[None, 'auto', 'log2', 'sqrt'],
                 'max_leaf_nodes':[None, 10, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(CPU_x, CPU_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor = DecisionTreeRegressor(**best_param)
scores1:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores1, 'DecisionTreeRegressor', 'GridSearchCV', 2)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(CPU_x, CPU_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor= DecisionTreeRegressor(**best_param)
scores2:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores2, 'DecisionTreeRegressor', 'RandomizedSearchCV', 3)

print("The intermediate data frame looks like this:")
CPU_df.head(n=10)

Best parameters using GridSearchCV:  {'criterion': 'mse', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'splitter': 'random'}
Best parameters using RansomizedSearchCV:  {'splitter': 'best', 'min_samples_leaf': 2, 'max_leaf_nodes': 20, 'max_features': 'sqrt', 'max_depth': 5, 'criterion': 'mae'}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.001643,0.0
1,LinearRegression,RandomizedSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-36.153043,-4715.49686,-17.191567,-18.744878,-729.438723,-13.290054,0.0,0.001703
3,DecisionTreeRegressor,RandomizedSearchCV,-39.546574,-10164.441928,-17.95,-19.314293,-1273.884085,-8.05,0.001605,0.001604
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### LogisticRegression

In [7]:
model = LogisticRegression()
param_dict:dict = {'penalty': ['l1', 'l2', 'elasticnet'], 'solver':['liblinear', 'saga'], 'max_iter':[20]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(CPU_x, CPU_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression = LogisticRegression(**best_param)

scores1:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores1, 'LogisticRegression', 'GridSearchCV', 4)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(CPU_x, CPU_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression= LogisticRegression(**best_param)
scores2:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores2, 'LogisticRegression', 'RandomizedSearchCV', 5)

print("The intermediate data frame looks like this:")
CPU_df.head(n=10)

Best parameters using GridSearchCV:  {'max_iter': 20, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters using RansomizedSearchCV:  {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 20}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.001643,0.0
1,LinearRegression,RandomizedSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-36.153043,-4715.49686,-17.191567,-18.744878,-729.438723,-13.290054,0.0,0.001703
3,DecisionTreeRegressor,RandomizedSearchCV,-39.546574,-10164.441928,-17.95,-19.314293,-1273.884085,-8.05,0.001605,0.001604
4,LogisticRegression,GridSearchCV,-49.538211,-9244.966318,-18.8,-15.677203,-1145.408483,-4.4,0.086624,0.00148
5,LogisticRegression,RandomizedSearchCV,-51.528571,-9937.661208,-19.3,-15.599458,-1136.875749,-4.4,0.087251,0.000433
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### ARDRegression

In [8]:
model = ARDRegression()
param_dict:dict = {'tol':[1e-3, 1e-4, 1e-5], 'alpha_1':[1e-6, 1e-8, 1e-9], "fit_intercept":[True, False], "normalize":[True, False]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(CPU_x, CPU_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression = ARDRegression(**best_param)
scores1:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores1, 'ARDRegression', 'GridSearchCV', 6)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(CPU_x, CPU_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression= ARDRegression(**best_param)
scores2:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores2, 'ARDRegression', 'RandomizedSearchCV', 7)

print("The intermediate data frame looks like this:")
CPU_df.head(n=10)

Best parameters using GridSearchCV:  {'alpha_1': 1e-06, 'fit_intercept': True, 'normalize': True, 'tol': 1e-05}
Best parameters using RansomizedSearchCV:  {'tol': 1e-05, 'normalize': True, 'fit_intercept': True, 'alpha_1': 1e-09}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.001643,0.0
1,LinearRegression,RandomizedSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-36.153043,-4715.49686,-17.191567,-18.744878,-729.438723,-13.290054,0.0,0.001703
3,DecisionTreeRegressor,RandomizedSearchCV,-39.546574,-10164.441928,-17.95,-19.314293,-1273.884085,-8.05,0.001605,0.001604
4,LogisticRegression,GridSearchCV,-49.538211,-9244.966318,-18.8,-15.677203,-1145.408483,-4.4,0.086624,0.00148
5,LogisticRegression,RandomizedSearchCV,-51.528571,-9937.661208,-19.3,-15.599458,-1136.875749,-4.4,0.087251,0.000433
6,ARDRegression,GridSearchCV,-42.119425,-6283.550052,-26.405635,-36.38983,-3263.051322,-24.503549,0.007997,0.0
7,ARDRegression,RandomizedSearchCV,-42.119425,-6283.550052,-26.405635,-36.38983,-3263.051323,-24.503549,0.003788,0.00034
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### Epsilon-SupportVectorRegression

In [9]:
model = SVR()
param_dict:dict = {'kernel': ['rbf', 'liniar'], 'gamma': ['scale', 'auto'], 'C': [1, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(CPU_x, CPU_y)
best_param :dict= grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR = SVR(**best_param)
scores1:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores1, 'SVR', 'GridSearchCV', 8)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(CPU_x, CPU_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR= SVR(**best_param)
scores2:dict = cross_validate(model_best, CPU_x, CPU_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(CPU_df, scores2, 'SVR', 'RandomizedSearchCV', 9)

print("The intermediate data frame looks like this:")
CPU_df.head(n=10)

Best parameters using GridSearchCV:  {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
Best parameters using RansomizedSearchCV:  {'kernel': 'rbf', 'gamma': 'scale', 'C': 50}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.001643,0.0
1,LinearRegression,RandomizedSearchCV,-43.378262,-6383.656697,-27.053792,-36.695674,-3243.698611,-25.581576,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-36.153043,-4715.49686,-17.191567,-18.744878,-729.438723,-13.290054,0.0,0.001703
3,DecisionTreeRegressor,RandomizedSearchCV,-39.546574,-10164.441928,-17.95,-19.314293,-1273.884085,-8.05,0.001605,0.001604
4,LogisticRegression,GridSearchCV,-49.538211,-9244.966318,-18.8,-15.677203,-1145.408483,-4.4,0.086624,0.00148
5,LogisticRegression,RandomizedSearchCV,-51.528571,-9937.661208,-19.3,-15.599458,-1136.875749,-4.4,0.087251,0.000433
6,ARDRegression,GridSearchCV,-42.119425,-6283.550052,-26.405635,-36.38983,-3263.051322,-24.503549,0.007997,0.0
7,ARDRegression,RandomizedSearchCV,-42.119425,-6283.550052,-26.405635,-36.38983,-3263.051323,-24.503549,0.003788,0.00034
8,SVR,GridSearchCV,-43.996462,-13885.807271,-18.189899,-39.184372,-10498.105828,-13.240566,0.001676,0.0
9,SVR,RandomizedSearchCV,-43.996462,-13885.807271,-18.189899,-39.184372,-10498.105828,-13.240566,0.000313,0.001671


In [10]:
CPU_df.iloc[:, 2:] = np.abs(CPU_df.iloc[:, 2:])
CPU_df = CPU_df.rename(columns={'test_neg_mean_absolute_error': 'test_mean_absolute_error', 'test_neg_mean_squared_error': 'test_mean_squared_error', 
                                            'test_neg_median_absolute_error': 'test_median_absolute_error', 'train_neg_mean_absolute_error': 'train_mean_absolute_error', 
                                            'train_neg_mean_squared_error': 'train_mean_squared_error', 'train_neg_median_absolute_error': 'train_median_absolute_error'})
CPU_df = CPU_df.head(10).style.highlight_max(color = 'pink').highlight_min(color = 'lightblue')
CPU_df

Unnamed: 0,Model,Search_strategy,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,43.378262,6383.656697,27.053792,36.695674,3243.698611,25.581576,0.001643,0.0
1,LinearRegression,RandomizedSearchCV,43.378262,6383.656697,27.053792,36.695674,3243.698611,25.581576,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,36.153043,4715.49686,17.191567,18.744878,729.438723,13.290054,0.0,0.001703
3,DecisionTreeRegressor,RandomizedSearchCV,39.546574,10164.441928,17.95,19.314293,1273.884085,8.05,0.001605,0.001604
4,LogisticRegression,GridSearchCV,49.538211,9244.966318,18.8,15.677203,1145.408483,4.4,0.086624,0.00148
5,LogisticRegression,RandomizedSearchCV,51.528571,9937.661208,19.3,15.599458,1136.875749,4.4,0.087251,0.000433
6,ARDRegression,GridSearchCV,42.119425,6283.550052,26.405635,36.38983,3263.051322,24.503549,0.007997,0.0
7,ARDRegression,RandomizedSearchCV,42.119425,6283.550052,26.405635,36.38983,3263.051323,24.503549,0.003788,0.00034
8,SVR,GridSearchCV,43.996462,13885.807271,18.189899,39.184372,10498.105828,13.240566,0.001676,0.0
9,SVR,RandomizedSearchCV,43.996462,13885.807271,18.189899,39.184372,10498.105828,13.240566,0.000313,0.001671


In [11]:
html = CPU_df.render()
text_file = open("CPU_Computer_Hardware.html", "w")
text_file.write(html)
text_file.close()

## Boston Housing

In [12]:
housing_data: pd.DataFrame = pd.read_csv("data/housing.data", sep = r'\s+', header=None)

empty:np.ndarray = np.zeros(10)
housing_df = pd.DataFrame({
    'Model':list(empty), 
    'Search_strategy': list(empty), 
    'test_neg_mean_absolute_error': list(empty), 
    'test_neg_mean_squared_error': list(empty), 
    'test_neg_median_absolute_error': list(empty),
    'train_neg_mean_absolute_error': list(empty),
    'train_neg_mean_squared_error': list(empty),
    'train_neg_median_absolute_error': list(empty),
    'fit_time': list(empty),
    'score_time': list(empty)
})

housing_x: np.ndarray = housing_data.iloc[:, :-1].values
housing_y: np.ndarray = housing_data.iloc[:, -1].values

housing_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


- #### LiniarRegression

In [13]:
model = LinearRegression()
param_dict:dict = {"fit_intercept":[True, False], "normalize":[True, False], "n_jobs":[-1, None, 3, 5]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_x, housing_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression = LinearRegression(**best_param)
scores1:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores1, 'LinearRegression', 'GridSearchCV', 0)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(housing_x, housing_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression= LinearRegression(**best_param)
scores2:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores2, 'LinearRegression', 'RandomizedSearchCV', 1)

print("The intermediate data frame looks like this:")
housing_df.head(n=10)

Best parameters using GridSearchCV:  {'fit_intercept': False, 'n_jobs': -1, 'normalize': True}
Best parameters using RansomizedSearchCV:  {'normalize': False, 'n_jobs': 3, 'fit_intercept': False}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.001587,0.0
1,LinearRegression,RandomizedSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.0016,0.0
2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### DecisionTreeRegressor

In [14]:
model = DecisionTreeRegressor()
param_dict:dict = {'criterion':['mse', 'friedman_mse', 'mae'],'splitter':['best', 'random'], 'max_depth': [1, 2, 3, 4, 5],
              'min_samples_leaf':[1, 2, 3, 4, 5], 'max_features':[None, 'auto', 'log2', 'sqrt'],
                 'max_leaf_nodes':[None, 10, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_x, housing_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor = DecisionTreeRegressor(**best_param)
scores1:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores1, 'DecisionTreeRegressor', 'GridSearchCV', 2)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(housing_x, housing_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor= DecisionTreeRegressor(**best_param)
scores2:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores2, 'DecisionTreeRegressor', 'RandomizedSearchCV', 3)

print("The intermediate data frame looks like this:")
housing_df.head(n=10)

Best parameters using GridSearchCV:  {'criterion': 'mae', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': 50, 'min_samples_leaf': 5, 'splitter': 'random'}
Best parameters using RansomizedSearchCV:  {'splitter': 'best', 'min_samples_leaf': 3, 'max_leaf_nodes': None, 'max_features': 'auto', 'max_depth': 2, 'criterion': 'mae'}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.001587,0.0
1,LinearRegression,RandomizedSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-4.634023,-44.760334,-3.42,-3.147866,-24.074492,-2.085,0.005289,3.7e-05
3,DecisionTreeRegressor,RandomizedSearchCV,-4.284681,-39.285383,-2.84,-3.407105,-26.106434,-2.37,0.005271,0.001582
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### LogisticRegression

In [15]:
model = LogisticRegression()
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(housing_y)
param_dict:dict = {'penalty': ['l1', 'l2', 'elasticnet'], 'solver':['liblinear', 'saga'], 'max_iter':[20]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_x, training_scores_encoded)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression = LogisticRegression(**best_param)

scores1:dict = cross_validate(model_best, housing_x, training_scores_encoded, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores1, 'LogisticRegression', 'GridSearchCV', 4)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(housing_x, training_scores_encoded)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression= LogisticRegression(**best_param)
scores2:dict = cross_validate(model_best, housing_x, training_scores_encoded, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores2, 'LogisticRegression', 'RandomizedSearchCV', 5)

print("The intermediate data frame looks like this:")
housing_df.head(n=10)

Best parameters using GridSearchCV:  {'max_iter': 20, 'penalty': 'l2', 'solver': 'liblinear'}
Best parameters using RansomizedSearchCV:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 20}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.001587,0.0
1,LinearRegression,RandomizedSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-4.634023,-44.760334,-3.42,-3.147866,-24.074492,-2.085,0.005289,3.7e-05
3,DecisionTreeRegressor,RandomizedSearchCV,-4.284681,-39.285383,-2.84,-3.407105,-26.106434,-2.37,0.005271,0.001582
4,LogisticRegression,GridSearchCV,-26.387808,-1339.661464,-19.5,-14.851706,-666.526964,-7.2,0.17962,0.000158
5,LogisticRegression,RandomizedSearchCV,-26.387808,-1339.661464,-19.5,-14.851706,-666.526964,-7.2,0.167613,0.000801
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### ARDRegression

In [16]:
model = ARDRegression()
param_dict:dict = {'tol':[1e-3, 1e-4, 1e-5], 'alpha_1':[1e-6, 1e-8, 1e-9], "fit_intercept":[True, False], "normalize":[True, False]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_x, housing_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression = ARDRegression(**best_param)
scores1:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores1, 'ARDRegression', 'GridSearchCV', 6)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(housing_x, housing_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression= ARDRegression(**best_param)
scores2:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores2, 'ARDRegression', 'RandomizedSearchCV', 7)

print("The intermediate data frame looks like this:")
housing_df.head(n=10)

Best parameters using GridSearchCV:  {'alpha_1': 1e-06, 'fit_intercept': True, 'normalize': True, 'tol': 0.001}
Best parameters using RansomizedSearchCV:  {'tol': 1e-05, 'normalize': True, 'fit_intercept': True, 'alpha_1': 1e-09}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.001587,0.0
1,LinearRegression,RandomizedSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-4.634023,-44.760334,-3.42,-3.147866,-24.074492,-2.085,0.005289,3.7e-05
3,DecisionTreeRegressor,RandomizedSearchCV,-4.284681,-39.285383,-2.84,-3.407105,-26.106434,-2.37,0.005271,0.001582
4,LogisticRegression,GridSearchCV,-26.387808,-1339.661464,-19.5,-14.851706,-666.526964,-7.2,0.17962,0.000158
5,LogisticRegression,RandomizedSearchCV,-26.387808,-1339.661464,-19.5,-14.851706,-666.526964,-7.2,0.167613,0.000801
6,ARDRegression,GridSearchCV,-4.169117,-36.692803,-3.210045,-3.206735,-20.826219,-2.349749,0.0052,0.0002
7,ARDRegression,RandomizedSearchCV,-4.169121,-36.692972,-3.21001,-3.206737,-20.826228,-2.349751,0.0124,0.0006
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### Epsilon-SupportVectorRegression

In [17]:
model = SVR()
param_dict:dict = {'kernel': ['rbf', 'liniar'], 'gamma': ['scale', 'auto'], 'C': [1, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_x, housing_y)
best_param :dict= grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR = SVR(**best_param)
scores1:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores1, 'SVR', 'GridSearchCV', 8)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(housing_x, housing_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR= SVR(**best_param)
scores2:dict = cross_validate(model_best, housing_x, housing_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(housing_df, scores2, 'SVR', 'RandomizedSearchCV', 9)

print("The intermediate data frame looks like this:")
housing_df.head(n=10)

Best parameters using GridSearchCV:  {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
Best parameters using RansomizedSearchCV:  {'kernel': 'rbf', 'gamma': 'scale', 'C': 50}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.001587,0.0
1,LinearRegression,RandomizedSearchCV,-3.872881,-34.422842,-2.623381,-3.268663,-23.311412,-2.273831,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,-4.634023,-44.760334,-3.42,-3.147866,-24.074492,-2.085,0.005289,3.7e-05
3,DecisionTreeRegressor,RandomizedSearchCV,-4.284681,-39.285383,-2.84,-3.407105,-26.106434,-2.37,0.005271,0.001582
4,LogisticRegression,GridSearchCV,-26.387808,-1339.661464,-19.5,-14.851706,-666.526964,-7.2,0.17962,0.000158
5,LogisticRegression,RandomizedSearchCV,-26.387808,-1339.661464,-19.5,-14.851706,-666.526964,-7.2,0.167613,0.000801
6,ARDRegression,GridSearchCV,-4.169117,-36.692803,-3.210045,-3.206735,-20.826219,-2.349749,0.0052,0.0002
7,ARDRegression,RandomizedSearchCV,-4.169121,-36.692972,-3.21001,-3.206737,-20.826228,-2.349751,0.0124,0.0006
8,SVR,GridSearchCV,-5.629406,-65.433652,-4.014465,-4.244997,-48.953711,-2.541244,0.007299,0.001398
9,SVR,RandomizedSearchCV,-5.629406,-65.433652,-4.014465,-4.244997,-48.953711,-2.541244,0.0064,0.001603


In [18]:
housing_df.iloc[:, 2:] = np.abs(housing_df.iloc[:, 2:])
housing_df = housing_df.rename(columns={'test_neg_mean_absolute_error': 'test_mean_absolute_error', 'test_neg_mean_squared_error': 'test_mean_squared_error', 
                                            'test_neg_median_absolute_error': 'test_median_absolute_error', 'train_neg_mean_absolute_error': 'train_mean_absolute_error', 
                                            'train_neg_mean_squared_error': 'train_mean_squared_error', 'train_neg_median_absolute_error': 'train_median_absolute_error'})
housing_df = housing_df.head(10).style.highlight_max(color = 'pink').highlight_min(color = 'lightblue')
housing_df

Unnamed: 0,Model,Search_strategy,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,3.872881,34.422842,2.623381,3.268663,23.311412,2.273831,0.001587,0.0
1,LinearRegression,RandomizedSearchCV,3.872881,34.422842,2.623381,3.268663,23.311412,2.273831,0.0016,0.0
2,DecisionTreeRegressor,GridSearchCV,4.634023,44.760334,3.42,3.147866,24.074492,2.085,0.005289,3.7e-05
3,DecisionTreeRegressor,RandomizedSearchCV,4.284681,39.285383,2.84,3.407105,26.106434,2.37,0.005271,0.001582
4,LogisticRegression,GridSearchCV,26.387808,1339.661464,19.5,14.851706,666.526964,7.2,0.17962,0.000158
5,LogisticRegression,RandomizedSearchCV,26.387808,1339.661464,19.5,14.851706,666.526964,7.2,0.167613,0.000801
6,ARDRegression,GridSearchCV,4.169117,36.692803,3.210045,3.206735,20.826219,2.349749,0.0052,0.0002
7,ARDRegression,RandomizedSearchCV,4.169121,36.692972,3.21001,3.206737,20.826228,2.349751,0.0124,0.0006
8,SVR,GridSearchCV,5.629406,65.433652,4.014465,4.244997,48.953711,2.541244,0.007299,0.001398
9,SVR,RandomizedSearchCV,5.629406,65.433652,4.014465,4.244997,48.953711,2.541244,0.0064,0.001603


In [19]:
html = housing_df.render()
text_file = open("Boston_Housing.html", "w")
text_file.write(html)
text_file.close()

## Wisconsin Breast Cancer

1. [Wisconsin Breast Cancer](http://www.dcc.fc.up.pt/~ltorgo/Regression/DataSets.html); cautati in panelul din stanga Wisconsin Breast Cancer si urmati pasii din "My personal Notes"

My personal Notes :
- I've removed the four instances with unknown values of the last attribute
- I've exchanged the attribute position of attributes n.3 (Time) and n.35 (Lymph node).
- I've removed the attribute outcome as it is the class attribute if the problem is treated as a classification one.

In [20]:
cancer_data: pd.DataFrame = pd.read_csv("data/r_wpbc.data", sep = ',', header=None)

empty = np.zeros(10)
cancer_df = pd.DataFrame({
    'Model':list(empty), 
    'Search_strategy': list(empty), 
    'test_neg_mean_absolute_error': list(empty), 
    'test_neg_mean_squared_error': list(empty), 
    'test_neg_median_absolute_error': list(empty),
    'train_neg_mean_absolute_error': list(empty),
    'train_neg_mean_squared_error': list(empty),
    'train_neg_median_absolute_error': list(empty),
    'fit_time': list(empty),
    'score_time': list(empty)
})

cancer_x: np.ndarray = cancer_data.iloc[:, :-1].values
cancer_y: np.ndarray = cancer_data.iloc[:, -1].values

cancer_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,5,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,0.07055,0.1865,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,31
1,2,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,61
2,0,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,0.0818,0.2333,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,116
3,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,123
4,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,27


- #### LiniarRegression

In [21]:
model = LinearRegression()
param_dict:dict = {"fit_intercept":[True, False], "normalize":[True, False], "n_jobs":[-1, None, 3, 5]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(cancer_x, cancer_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression = LinearRegression(**best_param)
scores1:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores1, 'LinearRegression', 'GridSearchCV', 0)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(cancer_x, cancer_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression= LinearRegression(**best_param)
scores2:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores2, 'LinearRegression', 'RandomizedSearchCV', 1)

print("The intermediate data frame looks like this:")
cancer_df.head(n=10)

Best parameters using GridSearchCV:  {'fit_intercept': False, 'n_jobs': -1, 'normalize': True}
Best parameters using RansomizedSearchCV:  {'normalize': False, 'n_jobs': None, 'fit_intercept': False}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.000799,0.001
1,LinearRegression,RandomizedSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.0006,0.0006
2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### DecisionTreeRegressor

In [22]:
model = DecisionTreeRegressor()
param_dict:dict = {'criterion':['mse', 'friedman_mse', 'mae'],'splitter':['best', 'random'], 'max_depth': [1, 2, 3, 4, 5],
              'min_samples_leaf':[1, 2, 3, 4, 5], 'max_features':[None, 'auto', 'log2', 'sqrt'],
                 'max_leaf_nodes':[None, 10, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(cancer_x,cancer_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor = DecisionTreeRegressor(**best_param)
scores1:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores1, 'DecisionTreeRegressor', 'GridSearchCV', 2)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(cancer_x,cancer_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor= DecisionTreeRegressor(**best_param)
scores2:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores2, 'DecisionTreeRegressor', 'RandomizedSearchCV', 3)

print("The intermediate data frame looks like this:")
cancer_df.head(n=10)

Best parameters using GridSearchCV:  {'criterion': 'mse', 'max_depth': 3, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_samples_leaf': 4, 'splitter': 'random'}
Best parameters using RansomizedSearchCV:  {'splitter': 'best', 'min_samples_leaf': 2, 'max_leaf_nodes': None, 'max_features': 'log2', 'max_depth': 3, 'criterion': 'mse'}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.000799,0.001
1,LinearRegression,RandomizedSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.0006,0.0006
2,DecisionTreeRegressor,GridSearchCV,-32.322305,-1431.96616,-31.326843,-26.294028,-994.314616,-23.676712,0.0,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-32.42722,-1545.602888,-30.863507,-22.89815,-781.273803,-20.040061,0.000445,0.0
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### Logistic Regression

In [23]:
model = LogisticRegression()
param_dict:dict = {'penalty': ['l1', 'l2', 'elasticnet'], 'solver':['liblinear', 'saga'], 'max_iter':[20]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(cancer_x, cancer_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression = LogisticRegression(**best_param)

scores1:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores1, 'LogisticRegression', 'GridSearchCV', 4)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(cancer_x, cancer_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression= LogisticRegression(**best_param)
scores2:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores2, 'LogisticRegression', 'RandomizedSearchCV', 5)

print("The intermediate data frame looks like this:")
cancer_df.head(n=10)

Best parameters using GridSearchCV:  {'max_iter': 20, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters using RansomizedSearchCV:  {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 20}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.000799,0.001
1,LinearRegression,RandomizedSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.0006,0.0006
2,DecisionTreeRegressor,GridSearchCV,-32.322305,-1431.96616,-31.326843,-26.294028,-994.314616,-23.676712,0.0,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-32.42722,-1545.602888,-30.863507,-22.89815,-781.273803,-20.040061,0.000445,0.0
4,LogisticRegression,GridSearchCV,-35.588664,-2051.189744,-29.9,-18.76641,-1116.609454,-2.1,0.204179,0.002812
5,LogisticRegression,RandomizedSearchCV,-35.704049,-2044.748313,-29.9,-19.449586,-1170.414963,-1.7,0.215726,0.0016
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### ARDRegression

In [24]:
model = ARDRegression()
param_dict:dict = {'tol':[1e-3, 1e-4, 1e-5], 'alpha_1':[1e-6, 1e-8, 1e-9], "fit_intercept":[True, False], "normalize":[True, False]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(cancer_x, cancer_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression = ARDRegression(**best_param)
scores1:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores1, 'ARDRegression', 'GridSearchCV', 6)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(cancer_x, cancer_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression= ARDRegression(**best_param)
scores2:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores2, 'ARDRegression', 'RandomizedSearchCV', 7)

print("The intermediate data frame looks like this:")
cancer_df.head(n=10)

Best parameters using GridSearchCV:  {'alpha_1': 1e-06, 'fit_intercept': True, 'normalize': True, 'tol': 1e-05}
Best parameters using RansomizedSearchCV:  {'tol': 1e-05, 'normalize': True, 'fit_intercept': True, 'alpha_1': 1e-08}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.000799,0.001
1,LinearRegression,RandomizedSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.0006,0.0006
2,DecisionTreeRegressor,GridSearchCV,-32.322305,-1431.96616,-31.326843,-26.294028,-994.314616,-23.676712,0.0,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-32.42722,-1545.602888,-30.863507,-22.89815,-781.273803,-20.040061,0.000445,0.0
4,LogisticRegression,GridSearchCV,-35.588664,-2051.189744,-29.9,-18.76641,-1116.609454,-2.1,0.204179,0.002812
5,LogisticRegression,RandomizedSearchCV,-35.704049,-2044.748313,-29.9,-19.449586,-1170.414963,-1.7,0.215726,0.0016
6,ARDRegression,GridSearchCV,-29.654808,-1280.39198,-27.601408,-24.177728,-858.073904,-21.924193,0.048207,0.0016
7,ARDRegression,RandomizedSearchCV,-29.654808,-1280.39198,-27.601408,-24.177728,-858.073905,-21.924192,0.044912,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### Epsilon-SupportVectorRegression

In [25]:
model = SVR()
param_dict:dict = {'kernel': ['rbf', 'liniar'], 'gamma': ['scale', 'auto'], 'C': [1, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(cancer_x, cancer_y)
best_param :dict= grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR = SVR(**best_param)
scores1:dict = cross_validate(model_best, cancer_x,cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores1, 'SVR', 'GridSearchCV', 8)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(cancer_x, cancer_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR= SVR(**best_param)
scores2:dict = cross_validate(model_best, cancer_x, cancer_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(cancer_df, scores2, 'SVR', 'RandomizedSearchCV', 9)

print("The intermediate data frame looks like this:")
cancer_df.head(n=10)

Best parameters using GridSearchCV:  {'C': 50, 'gamma': 'auto', 'kernel': 'rbf'}
Best parameters using RansomizedSearchCV:  {'kernel': 'rbf', 'gamma': 'auto', 'C': 40}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.000799,0.001
1,LinearRegression,RandomizedSearchCV,-29.727065,-1339.66409,-28.793519,-22.279819,-745.178079,-19.407585,0.0006,0.0006
2,DecisionTreeRegressor,GridSearchCV,-32.322305,-1431.96616,-31.326843,-26.294028,-994.314616,-23.676712,0.0,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-32.42722,-1545.602888,-30.863507,-22.89815,-781.273803,-20.040061,0.000445,0.0
4,LogisticRegression,GridSearchCV,-35.588664,-2051.189744,-29.9,-18.76641,-1116.609454,-2.1,0.204179,0.002812
5,LogisticRegression,RandomizedSearchCV,-35.704049,-2044.748313,-29.9,-19.449586,-1170.414963,-1.7,0.215726,0.0016
6,ARDRegression,GridSearchCV,-29.654808,-1280.39198,-27.601408,-24.177728,-858.073904,-21.924193,0.048207,0.0016
7,ARDRegression,RandomizedSearchCV,-29.654808,-1280.39198,-27.601408,-24.177728,-858.073905,-21.924192,0.044912,0.0
8,SVR,GridSearchCV,-31.762854,-1389.105988,-29.679207,-1.77957,-34.030023,-0.100191,0.004061,0.001599
9,SVR,RandomizedSearchCV,-31.811718,-1400.151172,-29.790628,-3.5751,-91.771481,-0.100108,0.003183,0.001612


In [26]:
cancer_df.iloc[:, 2:] = np.abs(cancer_df.iloc[:, 2:])
cancer_df = cancer_df.rename(columns={'test_neg_mean_absolute_error': 'test_mean_absolute_error', 'test_neg_mean_squared_error': 'test_mean_squared_error', 
                                            'test_neg_median_absolute_error': 'test_median_absolute_error', 'train_neg_mean_absolute_error': 'train_mean_absolute_error', 
                                            'train_neg_mean_squared_error': 'train_mean_squared_error', 'train_neg_median_absolute_error': 'train_median_absolute_error'})
cancer_df = cancer_df.head(10).style.highlight_max(color = 'pink').highlight_min(color = 'lightblue')
cancer_df

Unnamed: 0,Model,Search_strategy,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,29.727065,1339.66409,28.793519,22.279819,745.178079,19.407585,0.000799,0.001
1,LinearRegression,RandomizedSearchCV,29.727065,1339.66409,28.793519,22.279819,745.178079,19.407585,0.0006,0.0006
2,DecisionTreeRegressor,GridSearchCV,32.322305,1431.96616,31.326843,26.294028,994.314616,23.676712,0.0,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,32.42722,1545.602888,30.863507,22.89815,781.273803,20.040061,0.000445,0.0
4,LogisticRegression,GridSearchCV,35.588664,2051.189744,29.9,18.76641,1116.609454,2.1,0.204179,0.002812
5,LogisticRegression,RandomizedSearchCV,35.704049,2044.748313,29.9,19.449586,1170.414963,1.7,0.215726,0.0016
6,ARDRegression,GridSearchCV,29.654808,1280.39198,27.601408,24.177728,858.073904,21.924193,0.048207,0.0016
7,ARDRegression,RandomizedSearchCV,29.654808,1280.39198,27.601408,24.177728,858.073905,21.924192,0.044912,0.0
8,SVR,GridSearchCV,31.762854,1389.105988,29.679207,1.77957,34.030023,0.100191,0.004061,0.001599
9,SVR,RandomizedSearchCV,31.811718,1400.151172,29.790628,3.5751,91.771481,0.100108,0.003183,0.001612


In [27]:
html = cancer_df.render()
text_file = open("Winsconsin_Breast_Cancer.html", "w")
text_file.write(html)
text_file.close()

## Communities and Crime

1. [Communities and Crime](http://archive.ics.uci.edu/ml/datasets/communities+and+crime); stergeti primele 5 dimensiuni si trasaturile cu missing values.

In [28]:
communities_data: pd.DataFrame = pd.read_csv("data/communities.data", sep = ',', header=None)
communities_data: pd.DataFrame = communities_data.drop([1, 2, 3, 4, 5], axis = 1)
communities_data: pd.DataFrame = communities_data.replace('?', np.nan)
communities_data: pd.DataFrame = communities_data.dropna()

empty = np.zeros(10)
communities_df = pd.DataFrame({
    'Model':list(empty), 
    'Search_strategy': list(empty), 
    'test_neg_mean_absolute_error': list(empty), 
    'test_neg_mean_squared_error': list(empty), 
    'test_neg_median_absolute_error': list(empty),
    'train_neg_mean_absolute_error': list(empty),
    'train_neg_mean_squared_error': list(empty),
    'train_neg_median_absolute_error': list(empty),
    'fit_time': list(empty),
    'score_time': list(empty)
})

communities_x: np.ndarray = communities_data.iloc[:, :-1].values
communities_y: np.ndarray = communities_data.iloc[:, -1].values

cancer_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,5,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,0.07055,0.1865,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,31
1,2,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,61
2,0,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,0.0818,0.2333,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,116
3,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,123
4,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,27


- #### LiniarRegression

In [29]:
model = LinearRegression()
param_dict:dict = {"fit_intercept":[True, False], "normalize":[True, False], "n_jobs":[-1, None, 3, 5]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(communities_x, communities_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression = LinearRegression(**best_param)
scores1:dict = cross_validate(model_best, communities_x, communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores1, 'LinearRegression', 'GridSearchCV', 0)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(communities_x, communities_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._base.LinearRegression= LinearRegression(**best_param)
scores2:dict = cross_validate(model_best, communities_x, communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores2, 'LinearRegression', 'RandomizedSearchCV', 1)

print("The intermediate data frame looks like this:")
communities_df.head(n=10)

Best parameters using GridSearchCV:  {'fit_intercept': False, 'n_jobs': -1, 'normalize': True}
Best parameters using RansomizedSearchCV:  {'normalize': False, 'n_jobs': 3, 'fit_intercept': False}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.001601,0.001599
1,LinearRegression,RandomizedSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.003215,0.0
2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### DecisionTreeRegressor

In [30]:
model = DecisionTreeRegressor()
param_dict:dict = {'criterion':['mse', 'friedman_mse', 'mae'],'splitter':['best', 'random'], 'max_depth': [1, 2, 3, 4, 5],
              'min_samples_leaf':[1, 2, 3, 4, 5], 'max_features':[None, 'auto', 'log2', 'sqrt'],
                 'max_leaf_nodes':[None, 10, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(communities_x,communities_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor = DecisionTreeRegressor(**best_param)
scores1:dict = cross_validate(model_best, communities_x, communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores1, 'DecisionTreeRegressor', 'GridSearchCV', 2)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(communities_x,communities_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.tree._classes.DecisionTreeRegressor= DecisionTreeRegressor(**best_param)
scores2:dict = cross_validate(model_best, communities_x, communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores2, 'DecisionTreeRegressor', 'RandomizedSearchCV', 3)

print("The intermediate data frame looks like this:")
communities_df.head(n=10)

Best parameters using GridSearchCV:  {'criterion': 'friedman_mse', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': 40, 'min_samples_leaf': 2, 'splitter': 'random'}
Best parameters using RansomizedSearchCV:  {'splitter': 'random', 'min_samples_leaf': 3, 'max_leaf_nodes': 50, 'max_features': 'auto', 'max_depth': 2, 'criterion': 'friedman_mse'}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.001601,0.001599
1,LinearRegression,RandomizedSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.003215,0.0
2,DecisionTreeRegressor,GridSearchCV,-0.156831,-0.038833,-0.138354,-0.135794,-0.029513,-0.110221,0.001263,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-0.167432,-0.042155,-0.148895,-0.15193,-0.03637,-0.129427,0.001617,0.002763
4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### Logistic Regression

In [31]:
model = LogisticRegression()
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(communities_y)
param_dict:dict = {'penalty': ['l1', 'l2', 'elasticnet'], 'solver':['liblinear', 'saga'], 'max_iter':[20]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(communities_x, training_scores_encoded)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression = LogisticRegression(**best_param)

scores1:dict = cross_validate(model_best, communities_x, training_scores_encoded, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores1, 'LogisticRegression', 'GridSearchCV', 4)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(communities_x, training_scores_encoded)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._logistic.LogisticRegression= LogisticRegression(**best_param)
scores2:dict = cross_validate(model_best, communities_x, training_scores_encoded, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores2, 'LogisticRegression', 'RandomizedSearchCV', 5)

print("The intermediate data frame looks like this:")
communities_df.head(n=10)

Best parameters using GridSearchCV:  {'max_iter': 20, 'penalty': 'l2', 'solver': 'liblinear'}
Best parameters using RansomizedSearchCV:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 20}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.001601,0.001599
1,LinearRegression,RandomizedSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.003215,0.0
2,DecisionTreeRegressor,GridSearchCV,-0.156831,-0.038833,-0.138354,-0.135794,-0.029513,-0.110221,0.001263,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-0.167432,-0.042155,-0.148895,-0.15193,-0.03637,-0.129427,0.001617,0.002763
4,LogisticRegression,GridSearchCV,-15.545833,-468.06002,-10.7,-7.693015,-228.85875,0.0,0.237926,0.001105
5,LogisticRegression,RandomizedSearchCV,-15.545833,-468.06002,-10.7,-7.693015,-228.85875,0.0,0.243926,0.0
6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### ARDRegression

In [32]:
model = ARDRegression()
param_dict:dict = {'tol':[1e-3, 1e-4, 1e-5], 'alpha_1':[1e-6, 1e-8, 1e-9], "fit_intercept":[True, False], "normalize":[True, False]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(communities_x, communities_y)
best_param:dict = grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression = ARDRegression(**best_param)
scores1:dict = cross_validate(model_best, communities_x, communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores1, 'ARDRegression', 'GridSearchCV', 6)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(communities_x, communities_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.linear_model._bayes.ARDRegression= ARDRegression(**best_param)
scores2:dict = cross_validate(model_best, communities_x, communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores2, 'ARDRegression', 'RandomizedSearchCV', 7)

print("The intermediate data frame looks like this:")
communities_df.head(n=10)

Best parameters using GridSearchCV:  {'alpha_1': 1e-06, 'fit_intercept': True, 'normalize': True, 'tol': 0.001}
Best parameters using RansomizedSearchCV:  {'tol': 1e-05, 'normalize': True, 'fit_intercept': True, 'alpha_1': 1e-08}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.001601,0.001599
1,LinearRegression,RandomizedSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.003215,0.0
2,DecisionTreeRegressor,GridSearchCV,-0.156831,-0.038833,-0.138354,-0.135794,-0.029513,-0.110221,0.001263,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-0.167432,-0.042155,-0.148895,-0.15193,-0.03637,-0.129427,0.001617,0.002763
4,LogisticRegression,GridSearchCV,-15.545833,-468.06002,-10.7,-7.693015,-228.85875,0.0,0.237926,0.001105
5,LogisticRegression,RandomizedSearchCV,-15.545833,-468.06002,-10.7,-7.693015,-228.85875,0.0,0.243926,0.0
6,ARDRegression,GridSearchCV,-0.129531,-0.028001,-0.108245,-0.107551,-0.019105,-0.086297,0.193907,0.001309
7,ARDRegression,RandomizedSearchCV,-0.129534,-0.028002,-0.108247,-0.107554,-0.019106,-0.086248,0.540537,0.001004
8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- #### Epsilon-SupportVectorRegression

In [33]:
model = SVR()
param_dict:dict = {'kernel': ['rbf', 'liniar'], 'gamma': ['scale', 'auto'], 'C': [1, 20, 30, 40, 50]}
grid_search:sklearn.model_selection._search.GridSearchCV = GridSearchCV(estimator=model, param_grid=param_dict, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(communities_x, communities_y)
best_param :dict= grid_search.best_params_
print("Best parameters using GridSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR = SVR(**best_param)
scores1:dict = cross_validate(model_best, communities_x,communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores1, 'SVR', 'GridSearchCV', 8)

randomized_search:sklearn.model_selection._search.RandomizedSearchCV = RandomizedSearchCV(estimator=model, param_distributions=param_dict, cv=5, scoring='neg_mean_squared_error')
randomized_search.fit(communities_x, communities_y)
best_param:dict= randomized_search.best_params_
print("Best parameters using RansomizedSearchCV: ", best_param)
model_best:sklearn.svm._classes.SVR= SVR(**best_param)
scores2:dict = cross_validate(model_best,communities_x, communities_y, cv=5, scoring=['neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error'], return_train_score=True)
make_df(communities_df, scores2, 'SVR', 'RandomizedSearchCV', 9)

print("The intermediate data frame looks like this:")
communities_df.head(n=10)

Best parameters using GridSearchCV:  {'C': 40, 'gamma': 'scale', 'kernel': 'rbf'}
Best parameters using RansomizedSearchCV:  {'kernel': 'rbf', 'gamma': 'scale', 'C': 20}
The intermediate data frame looks like this:


Unnamed: 0,Model,Search_strategy,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_median_absolute_error,train_neg_mean_absolute_error,train_neg_mean_squared_error,train_neg_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.001601,0.001599
1,LinearRegression,RandomizedSearchCV,-0.162619,-0.043365,-0.137603,-0.085307,-0.012237,-0.068108,0.003215,0.0
2,DecisionTreeRegressor,GridSearchCV,-0.156831,-0.038833,-0.138354,-0.135794,-0.029513,-0.110221,0.001263,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,-0.167432,-0.042155,-0.148895,-0.15193,-0.03637,-0.129427,0.001617,0.002763
4,LogisticRegression,GridSearchCV,-15.545833,-468.06002,-10.7,-7.693015,-228.85875,0.0,0.237926,0.001105
5,LogisticRegression,RandomizedSearchCV,-15.545833,-468.06002,-10.7,-7.693015,-228.85875,0.0,0.243926,0.0
6,ARDRegression,GridSearchCV,-0.129531,-0.028001,-0.108245,-0.107551,-0.019105,-0.086297,0.193907,0.001309
7,ARDRegression,RandomizedSearchCV,-0.129534,-0.028002,-0.108247,-0.107554,-0.019106,-0.086248,0.540537,0.001004
8,SVR,GridSearchCV,-0.12318,-0.025479,-0.095667,-0.099719,-0.015987,-0.098766,0.007644,0.002236
9,SVR,RandomizedSearchCV,-0.123438,-0.02611,-0.096672,-0.106104,-0.018735,-0.099629,0.008028,0.004726


In [34]:
communities_df.iloc[:, 2:] = np.abs(communities_df.iloc[:, 2:])
communities_df = communities_df.rename(columns={'test_neg_mean_absolute_error': 'test_mean_absolute_error', 'test_neg_mean_squared_error': 'test_mean_squared_error', 
                                            'test_neg_median_absolute_error': 'test_median_absolute_error', 'train_neg_mean_absolute_error': 'train_mean_absolute_error', 
                                            'train_neg_mean_squared_error': 'train_mean_squared_error', 'train_neg_median_absolute_error': 'train_median_absolute_error'})
communities_df = communities_df.head(10).style.highlight_max(color = 'pink').highlight_min(color = 'lightblue')
communities_df

Unnamed: 0,Model,Search_strategy,test_mean_absolute_error,test_mean_squared_error,test_median_absolute_error,train_mean_absolute_error,train_mean_squared_error,train_median_absolute_error,fit_time,score_time
0,LinearRegression,GridSearchCV,0.162619,0.043365,0.137603,0.085307,0.012237,0.068108,0.001601,0.001599
1,LinearRegression,RandomizedSearchCV,0.162619,0.043365,0.137603,0.085307,0.012237,0.068108,0.003215,0.0
2,DecisionTreeRegressor,GridSearchCV,0.156831,0.038833,0.138354,0.135794,0.029513,0.110221,0.001263,0.0
3,DecisionTreeRegressor,RandomizedSearchCV,0.167432,0.042155,0.148895,0.15193,0.03637,0.129427,0.001617,0.002763
4,LogisticRegression,GridSearchCV,15.545833,468.06002,10.7,7.693015,228.85875,0.0,0.237926,0.001105
5,LogisticRegression,RandomizedSearchCV,15.545833,468.06002,10.7,7.693015,228.85875,0.0,0.243926,0.0
6,ARDRegression,GridSearchCV,0.129531,0.028001,0.108245,0.107551,0.019105,0.086297,0.193907,0.001309
7,ARDRegression,RandomizedSearchCV,0.129534,0.028002,0.108247,0.107554,0.019106,0.086248,0.540537,0.001004
8,SVR,GridSearchCV,0.12318,0.025479,0.095667,0.099719,0.015987,0.098766,0.007644,0.002236
9,SVR,RandomizedSearchCV,0.123438,0.02611,0.096672,0.106104,0.018735,0.099629,0.008028,0.004726


In [35]:
html = communities_df.render()
text_file = open("Communities_and_Crime.html", "w")
text_file.write(html)
text_file.close()

# Documentatie

[Documentatie](./Documentatie.ipynb)