In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd
import math

### Data Summary:
* carat　：Carat weight of the diamond
* cut　：Describe cut quality of the diamond. Quality in increasing order Fair, Good, Very Good, Premium, Ideal
* color　：Color of the diamond, with D being the best and J the worst
* clarity　：How obvious inclusions are within the diamond:(in order from best to worst, FL = flawless, I3= level 3 inclusions) FL,IF, VVS1,
* depth　：depth %　The height of a diamond, measured from the culet to the table, divided by its average girdle diameter 
（total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)）
* table　：table　% The width of the diamond's table expressed as a percentage of its average diameter
* price　：the price of the diamond
* x　：length mm
* y　：width mm
* z　：depth in mm

In [81]:
# loading data
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/回歸/diamonds.csv')

In [5]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,10,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [82]:
# validate data
data.info() # 53940 * 10
print('-' * 40) 
data.isna().any() # no need to deal with missing values
data = data.iloc[:,1:]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5   depth       53940 non-null  float64
 6   table       53940 non-null  float64
 7   price       53940 non-null  int64  
 8   x           53940 non-null  float64
 9   y           53940 non-null  float64
 10  z           53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB
----------------------------------------


In [86]:
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [8]:
data = data[ data['x'] > 1 ]

In [9]:
corr_metrics = data.iloc[:,[0,4,5,6,7,8,9]].corr()
corr_metrics.style.background_gradient()
# by this table, consider throw away depth/table two variables

Unnamed: 0,carat,depth,table,price,x,y,z
carat,1.0,0.028221,0.181658,0.92161,0.977765,0.953989,0.955933
depth,0.028221,1.0,-0.2957,-0.01067,-0.025097,-0.029141,0.095357
table,0.181658,-0.2957,1.0,0.127165,0.19613,0.18453,0.151599
price,0.92161,-0.01067,0.127165,1.0,0.887227,0.867872,0.863913
x,0.977765,-0.025097,0.19613,0.887227,1.0,0.974933,0.970661
y,0.953989,-0.029141,0.18453,0.867872,0.974933,1.0,0.952149
z,0.955933,0.095357,0.151599,0.863913,0.970661,0.952149,1.0


In [10]:
corr_metrics = data.iloc[:,[0,4,5,6,7,8,9]].describe().corr()
corr_metrics.style.background_gradient()

Unnamed: 0,carat,depth,table,price,x,y,z
carat,1.0,0.999999,0.999999,0.947382,1.0,1.0,1.0
depth,0.999999,1.0,1.0,0.947519,0.999999,0.999999,0.999999
table,0.999999,1.0,1.0,0.947624,0.999999,1.0,1.0
price,0.947382,0.947519,0.947624,1.0,0.947392,0.947664,0.947525
x,1.0,0.999999,0.999999,0.947392,1.0,1.0,1.0
y,1.0,0.999999,1.0,0.947664,1.0,1.0,1.0
z,1.0,0.999999,1.0,0.947525,1.0,1.0,1.0


In [11]:
# preprocessing: for linear model, transform data to scaler form 
from sklearn.preprocessing import MinMaxScaler

data[['carat', 'depth', 'table', 'x', 'y', 'z', 'price']]
scaler = MinMaxScaler()

data[['carat', 'depth', 'table', 'x', 'y', 'z', 'price']] = scaler.fit_transform(data[['carat', 'depth', 'table', 'x', 'y', 'z', 'price']])
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.006237,Ideal,E,SI2,0.513889,0.230769,0.0,0.031384,0.005433,0.076415
1,0.002079,Premium,E,SI1,0.466667,0.346154,0.0,0.022825,0.002898,0.072642
2,0.006237,Good,E,VS1,0.386111,0.423077,5.4e-05,0.045649,0.007063,0.072642
3,0.018711,Premium,I,VS2,0.538889,0.288462,0.000433,0.067047,0.00996,0.082704
4,0.022869,Good,J,SI2,0.563889,0.288462,0.000487,0.087019,0.012133,0.086478


In [12]:
X = data.loc[:,['carat', 'depth', 'table', 'x', 'y', 'z']]
Y = data.loc[:,'price']
len(X)
len(Y)

53932

In [13]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline


# Select evaluation metric
metric = 'neg_mean_squared_error'

# Select models
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = KFold(5)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=metric)
    results.append(cv_results)
    names.append(name)
    msg = "%s model: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Without considering catergorical variables, Linear regression has the best performance

LR model: -0.010237 (0.012348)
LASSO model: -0.055983 (0.062337)
EN model: -0.055983 (0.062337)
KNN model: -0.018510 (0.021631)
CART model: -0.021772 (0.021512)
SVR model: -0.017821 (0.022768)


In [20]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# consider VIF test
# VIF dataframe
column_pair = ['depth','table','carat','x','y','z']
X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)

  feature         VIF
0   depth  183.036483
1   table   34.323580
2   carat   56.893441
3       x  247.298685
4       y   87.663388
5       z  505.350505


In [21]:
vif_data = pd.DataFrame()
column_pair = ['depth','table','carat','x','y']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)

  feature         VIF
0   depth   29.193289
1   table   31.531110
2   carat   55.761942
3       x  171.101736
4       y   85.715456


In [22]:
vif_data = pd.DataFrame()
column_pair = ['depth','table','carat','y']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)

  feature        VIF
0   depth  28.867910
1   table  30.198617
2   carat  27.741500
3       y  45.473998


In [23]:
vif_data = pd.DataFrame()
column_pair = ['depth','table','carat']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)

  feature        VIF
0   depth  28.024084
1   table  29.086011
2   carat   2.684723


In [24]:
vif_data = pd.DataFrame()
column_pair = ['depth','carat']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)

  feature       VIF
0   depth  2.581261
1   carat  2.581261


In [25]:
# find exact parameters of linear regression
import statsmodels.formula.api as smf
# use unprocess data 
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/回歸/diamonds.csv')
data = data[data['x'] > 1]
results = smf.ols('price ~ carat+depth+table+x+y+z ' , data = data).fit()
# print(results.params)
# pred = results.predict(data)
print(results.summary())

results = smf.ols('price ~ carat+depth ' , data = data).fit()
# pred1 = results.predict(data)
print(results.summary())

# R-squared form 0.86 to 0.851, so we don't need this many variables to predict the price


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                 5.507e+04
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        06:02:08   Log-Likelihood:            -4.7073e+05
No. Observations:               53932   AIC:                         9.415e+05
Df Residuals:                   53925   BIC:                         9.415e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2.159e+04    450.303     47.954      0.0

In [None]:
# plot depth and carat with price to see if we can do some transformation
import matplotlib.pyplot as plt
plt.scatter(data['depth'] , data['price'], alpha = 0.6)

In [None]:
plt.scatter(data['carat'] , data['price'], alpha = 0.6)

In [None]:
plt.scatter( np.log2(data['carat']) , np.log2(data['price']), alpha = 0.6)
# looks like we can use log for transformation

In [88]:
# consider quadratic
data['c2'] = data['carat']**2
data['d2'] = data['depth']**2
data['logd'] = np.log2(data['depth']) 
data['logc'] = np.log2(data['carat']) 
data['logp'] = np.log2(data['price'])

results = smf.ols('price ~ carat+depth+c2+d2 ' , data = data).fit()
print(results.summary()) # 0.855
results = smf.ols('logp ~ carat+depth+c2+d2+logc+logd ' , data = data).fit()
print(results.summary()) # 0.938

# After take log to data, we have some improve on R-squared

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.855
Model:                            OLS   Adj. R-squared:                  0.855
Method:                 Least Squares   F-statistic:                 7.970e+04
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        06:55:27   Log-Likelihood:            -4.7164e+05
No. Observations:               53940   AIC:                         9.433e+05
Df Residuals:                   53935   BIC:                         9.433e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -1.418e+05   4383.810    -32.337      0.0

In [46]:
vif_data = pd.DataFrame()
column_pair = ['depth','carat','c2','d2','logc','logd']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)
# not singnificant on VIF-test

  feature           VIF
0   depth  3.468777e+06
1   carat  8.827680e+02
2      c2  1.061139e+02
3      d2  6.462211e+05
4    logc  9.903700e+01
5    logd  1.122402e+06


In [138]:
# see if there is a colinear problem
vif_data = pd.DataFrame()
column_pair = ['carat','c2','d2','logc','logd']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)
# not singnificant on VIF-test

  feature          VIF
0   carat   882.557461
1      c2   106.099848
2      d2   619.799816
3    logc    98.928422
4    logd  1423.657582


In [139]:
# see if there is a colinear problem
vif_data = pd.DataFrame()
column_pair = ['carat','c2','d2','logc']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)
# not singnificant on VIF-test

  feature         VIF
0   carat  381.148986
1      c2   51.488785
2      d2  261.068439
3    logc   42.642247


In [141]:
# see if there is a colinear problem
vif_data = pd.DataFrame()
column_pair = ['c2','d2','logc']

X = data[column_pair].fillna(0)
vif_data = pd.DataFrame()
vif_data["feature"] = column_pair


# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
print(vif_data)
# not singnificant on VIF-test

  feature       VIF
0      c2  5.333910
1      d2  7.490456
2    logc  4.647904


In [142]:
results = smf.ols('logp ~ c2+d2+logc ' , data = data).fit()
print(results.summary()) # 0.934

# multicollinearity

                            OLS Regression Results                            
Dep. Variable:                   logp   R-squared:                       0.934
Model:                            OLS   Adj. R-squared:                  0.934
Method:                 Least Squares   F-statistic:                 2.561e+05
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        08:11:41   Log-Likelihood:                -23618.
No. Observations:               53940   AIC:                         4.724e+04
Df Residuals:                   53936   BIC:                         4.728e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     13.2139      0.035    375.794      0.0

In [149]:
results = smf.ols('logp ~ logc ' , data = data).fit()
print(results.summary()) # 0.934

                            OLS Regression Results                            
Dep. Variable:                   logp   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                 7.510e+05
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        08:18:38   Log-Likelihood:                -24194.
No. Observations:               53940   AIC:                         4.839e+04
Df Residuals:                   53938   BIC:                         4.841e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     12.1888      0.002   6190.896      0.0

In [87]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [96]:
sum((data['carat'] > 0.5) & (data['carat'] < 1))
#sum(data['carat'] < 0.5)

15948

In [99]:
# consider logistic regression in categorical variables:(clarity cut color)
data.loc[(data['carat'] > 0.5) & (data['carat'] < 1)].loc[:,['price','clarity']].groupby('clarity').mean()


Unnamed: 0_level_0,price
clarity,Unnamed: 1_level_1
I1,1612.505882
IF,3066.409253
SI1,2522.668511
SI2,2470.981209
VS1,2692.208937
VS2,2507.3169
VVS1,2944.524966
VVS2,2750.727127


In [107]:
data.loc[(data['carat'] < 0.5 ) & (data['carat'] > 0.2)].loc[:,['price','clarity']].groupby('clarity').mean()
#smaller on l1, bigger on IF、VVS1、VVS2

Unnamed: 0_level_0,price
clarity,Unnamed: 1_level_1
I1,576.230769
IF,999.28502
SI1,682.264003
SI2,611.875244
VS1,789.575709
VS2,758.487362
VVS1,927.181593
VVS2,835.142582


In [123]:
#use one hot coding for creating dummy variables
dummy_cla = pd.get_dummies(data['clarity'])
fr = [data, dummy_cla]
data = pd.concat(fr , axis = 1)

In [100]:
data.loc[(data['carat'] > 0.5) & (data['carat'] < 1)].loc[:,['price','cut']].groupby('cut').mean()

Unnamed: 0_level_0,price
cut,Unnamed: 1_level_1
Fair,2434.795
Good,2658.440373
Ideal,2502.379632
Premium,2562.238052
Very Good,2689.398674


In [108]:
data.loc[(data['carat'] < 0.5 ) & (data['carat'] > 0.2)].loc[:,['price','cut']].groupby('cut').mean()
#not very significant on price

Unnamed: 0_level_0,price
cut,Unnamed: 1_level_1
Fair,894.575221
Good,694.35079
Ideal,831.091667
Premium,818.374442
Very Good,699.029809


In [179]:
dummy_cut = pd.get_dummies(data['cut'])
fr = [data, dummy_cut]
data = pd.concat(fr , axis = 1)

In [102]:
data.loc[(data['carat'] > 0.5) & (data['carat'] < 1)].loc[:,['price','color']].groupby('color').mean()


Unnamed: 0_level_0,price
color,Unnamed: 1_level_1
D,2786.208316
E,2640.894626
F,2640.710369
G,2498.078998
H,2544.046844
I,2350.022607
J,2039.926217


In [124]:
data.loc[(data['carat'] < 0.5) & (data['carat'] > 0.2)].loc[:,['price','color']].groupby('color').mean()
#smaller on J

Unnamed: 0_level_0,price
color,Unnamed: 1_level_1
D,846.102179
E,829.030722
F,835.602398
G,809.687132
H,700.854419
I,644.230346
J,596.126147


In [125]:
dummy_col = pd.get_dummies(data['color'])
fr = [data, dummy_col]
data = pd.concat(fr , axis = 1)

In [150]:
results = smf.ols('logp ~ logc+I1+IF+SI1+SI2+VS1+VS2+VVS1+VVS2 ' , data = data).fit()
print(results.summary()) # 0.965

                            OLS Regression Results                            
Dep. Variable:                   logp   R-squared:                       0.965
Model:                            OLS   Adj. R-squared:                  0.965
Method:                 Least Squares   F-statistic:                 1.879e+05
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        08:19:12   Log-Likelihood:                -6391.4
No. Observations:               53940   AIC:                         1.280e+04
Df Residuals:                   53931   BIC:                         1.288e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.8951      0.002   6066.426      0.0

In [151]:
results = smf.ols('logp ~ logc+D+E+F+G+H+I+J ' , data = data).fit()
print(results.summary())
# R-squared improved : 0.945

                            OLS Regression Results                            
Dep. Variable:                   logp   R-squared:                       0.945
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                 1.333e+05
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        08:20:01   Log-Likelihood:                -18689.
No. Observations:               53940   AIC:                         3.739e+04
Df Residuals:                   53932   BIC:                         3.747e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.6375      0.002   6550.189      0.0

In [185]:
data.rename(columns = {'Very Good':'Very_Good'}, inplace = True)


In [187]:
results = smf.ols('logp ~ logc+Fair+Good+Ideal+Premium+Very_Good' , data = data).fit()
print(results.summary())

# not very significant

                            OLS Regression Results                            
Dep. Variable:                   logp   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.937
Method:                 Least Squares   F-statistic:                 1.607e+05
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        08:48:48   Log-Likelihood:                -22486.
No. Observations:               53940   AIC:                         4.498e+04
Df Residuals:                   53934   BIC:                         4.504e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.0893      0.002   4835.551      0.0

In [152]:
results = smf.ols('logp ~ logc+D+E+F+G+H+I+J+I1+IF+SI1+SI2+VS1+VS2+VVS1+VVS2 ' , data = data).fit()
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                   logp   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                 2.048e+05
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        08:20:46   Log-Likelihood:                 10582.
No. Observations:               53940   AIC:                        -2.113e+04
Df Residuals:                   53925   BIC:                        -2.100e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      9.6571      0.001   8014.061      0.0

In [178]:
# for simplifying model, use less variables
results = smf.ols('logp ~ logc+J+I+D+I1+SI1+SI2+VVS1+IF ' , data = data).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                   logp   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.977
Method:                 Least Squares   F-statistic:                 2.292e+05
Date:                Wed, 01 Jun 2022   Prob (F-statistic):               0.00
Time:                        08:37:53   Log-Likelihood:                 4662.1
No. Observations:               53940   AIC:                            -9302.
Df Residuals:                   53929   BIC:                            -9204.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     12.5604      0.002   5541.065      0.0

#Conclusion:

We got a final model for predicting price of diamond, but as we can see,
almost all interpretable ratio fall on the same variable: logc.So try  transformation on this dataset is very important.

So if you want to estimate price of a diamond, weight is more important than
all the other elements.
