In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn import metrics
import warnings

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


In [2]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.drop(['depth', 'table', 'x', 'y', 'z'], axis=1, inplace=True)

In [4]:
df = pd.get_dummies(df, drop_first=True)

In [5]:
df.head()

Unnamed: 0,carat,price,cut_Premium,cut_Very Good,cut_Good,cut_Fair,color_E,color_F,color_G,color_H,color_I,color_J,clarity_VVS1,clarity_VVS2,clarity_VS1,clarity_VS2,clarity_SI1,clarity_SI2,clarity_I1
0,0.23,326,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
1,0.21,326,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,0.23,327,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,0.29,334,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
4,0.31,335,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [6]:
df['carat'] = np.log(1 + df['carat'])
df['price'] = np.log(1 + df['price'])

In [7]:
df.head()

Unnamed: 0,carat,price,cut_Premium,cut_Very Good,cut_Good,cut_Fair,color_E,color_F,color_G,color_H,color_I,color_J,clarity_VVS1,clarity_VVS2,clarity_VS1,clarity_VS2,clarity_SI1,clarity_SI2,clarity_I1
0,0.207014,5.78996,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
1,0.19062,5.78996,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,0.207014,5.793014,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3,0.254642,5.814131,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
4,0.270027,5.817111,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [8]:
X_cols = [col for col in df.columns if col != 'price']
X = df[X_cols]
y = df['price']

In [9]:
print(metrics.get_scorer_names())

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 

In [11]:
params = {
    'learning_rate': ['constant'],
    'eta0': np.logspace(-4,-1,4),
    'max_iter': np.logspace(0,3,10),
    'loss': ['squared_error', 'epsilon_insensitive'],
    'penalty': ['elasticnet'], 
    'alpha': np.logspace(-3,3,15),
    'l1_ratio': np.linspace(0,1,11),
}
model = linear_model.SGDRegressor()
grid = model_selection.GridSearchCV(
    model, 
    param_grid=params,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5
)
grid.fit(X, y)



In [12]:
grid.best_params_

{'alpha': 0.001,
 'eta0': 0.1,
 'l1_ratio': 0.9,
 'learning_rate': 'constant',
 'loss': 'squared_error',
 'max_iter': 1000.0,
 'penalty': 'elasticnet'}

In [13]:
model = linear_model.SGDRegressor(**grid.best_params_)
model.fit(X, y)
y_pred = model.predict(X)

print(metrics.mean_squared_error(y, y_pred))

0.04642573315982719
