In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the diamonds table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="diamonds.csv")

In [3]:
# Show the freshly loaded frame. In the recorded output the columns are
# carat, cut, color, clarity and price.
data

Unnamed: 0,carat,cut,color,clarity,price
0,0.51,Premium,F,VS1,1749.0
1,2.25,Fair,G,I1,7069.0
2,0.70,Very Good,E,VS2,2757.0
3,0.47,Good,F,VS1,1243.0
4,0.30,Ideal,G,VVS1,789.0
...,...,...,...,...,...
49995,0.71,Ideal,H,VVS1,2918.0
49996,0.43,Ideal,G,VVS2,1056.0
49997,1.14,Premium,G,VS2,6619.0
49998,1.01,Premium,E,VS2,6787.0


In [4]:
# Per-column dtype and non-null counts. In the recorded output carat (49,994)
# and price (49,997) fall short of the 50,000 rows, i.e. both contain missing
# values; cut, color and clarity are object (string) columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    49994 non-null  float64
 1   cut      50000 non-null  object 
 2   color    50000 non-null  object 
 3   clarity  50000 non-null  object 
 4   price    49997 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.9+ MB


In [5]:
# Summary statistics of the numeric columns. In the recorded output the price
# maximum is 8.34e10 and the mean is about 1.67e6 while the median is 2,403,
# which points to at least one corrupt price record.
data.describe()

Unnamed: 0,carat,price
count,49994.0,49997.0
mean,0.798602,1672039.0
std,0.474653,372987300.0
min,0.2,326.0
25%,0.4,948.0
50%,0.7,2403.0
75%,1.04,5331.0
max,5.01,83400000000.0


In [6]:
# Discard, in place, every row that has a missing value in any column.
data.dropna(axis="index", how="any", inplace=True)

In [7]:
# Row label 10 is removed by hand. The summary statistics recorded on either
# side of this cell show the maximum price falling from 8.34e10 to 18,823, so
# this is presumably the single corrupted price record -- TODO confirm against
# the CSV.
# errors="ignore" keeps the cell safe to re-run: without it a second execution
# raises KeyError because label 10 has already been dropped.
OUTLIER_ROW_LABELS = [10]
data.drop(index=OUTLIER_ROW_LABELS, inplace=True, errors="ignore")

In [8]:
# Re-check the frame after cleaning. In the recorded output 49,990 rows remain
# and every column is fully populated.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49990 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    49990 non-null  float64
 1   cut      49990 non-null  object 
 2   color    49990 non-null  object 
 3   clarity  49990 non-null  object 
 4   price    49990 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.3+ MB


In [9]:
# Summary statistics after cleaning. In the recorded output price now ranges
# from 326 to 18,823 with a mean of about 3,939.
data.describe()

Unnamed: 0,carat,price
count,49990.0,49990.0
mean,0.798612,3939.211922
std,0.474666,3996.073267
min,0.2,326.0
25%,0.4,948.0
50%,0.7,2403.0
75%,1.04,5330.0
max,5.01,18823.0


In [10]:
# One-hot encode the string columns (cut, color, clarity). drop_first=True
# omits one level per feature so that a feature's indicator columns do not sum
# to a constant, which would be perfectly collinear with an intercept term.
# dtype is pinned to uint8 so the indicators are 0/1 integers regardless of the
# installed pandas version; pandas 2.x would otherwise produce bool columns.
data = pd.get_dummies(data, drop_first=True, dtype=np.uint8)

In [11]:
# Show the encoded frame. In the recorded output it has carat, price and 17
# 0/1 indicator columns (cut_*, color_*, clarity_*).
data

Unnamed: 0,carat,price,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,color_G,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.51,1749.0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0
1,2.25,7069.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0.70,2757.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
3,0.47,1243.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,0.30,789.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.71,2918.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
49996,0.43,1056.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
49997,1.14,6619.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
49998,1.01,6787.0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [12]:
# Feature matrix: every column except the target.
x = data.drop(columns=["price"])

In [13]:
# Target vector: the price column as a Series.
y = data.loc[:, "price"]

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. shuffle is passed as a real bool because scikit-learn releases
# that validate parameter types reject the integer 1 here.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123, shuffle=True
)

In [15]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and training are chained; the trailing expression
# displays the fitted estimator as before.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [16]:
# Predict prices for the held-out rows.
y_pred = model.predict(X=X_test)

In [17]:
# Fitted coefficients, presumably one per column of x in column order (carat
# first, then the cut_*, color_* and clarity_* indicators) -- verify with
# x.columns. The recorded output has 18 values.
model.coef_

array([ 8890.89513413,   670.00235301,  1005.86526154,   877.30530976,
         864.05074815,  -208.46094528,  -308.07015211,  -506.72654791,
        -977.94033506, -1443.63924969, -2322.01604463,  5490.14194039,
        3618.79898589,  2649.36615938,  4567.16606595,  4262.77428607,
        5103.87764874,  4991.70385262])

In [18]:
# Predicted prices for the hold-out rows.
# NOTE(review): the recorded output contains a prediction of about 5.19, so
# the linear model can produce implausibly low prices for some diamonds.
y_pred

array([2.10622224e+03, 4.49053343e+03, 6.11229032e+02, ...,
       2.08326585e+03, 5.18506606e+00, 6.78239327e+03])

In [19]:
# Actual prices of the hold-out rows (9,998 in the recorded output), shown for
# a visual comparison with y_pred.
y_test

759      2147.0
33617    4081.0
27169    1021.0
47835    6921.0
28924     766.0
          ...  
2668      945.0
31986    1764.0
39454    1591.0
40705     728.0
34847    4852.0
Name: price, Length: 9998, dtype: float64

In [20]:
# Mean squared error of the predictions on the hold-out set.
mean_squared_error(y_true=y_test, y_pred=y_pred)

1332503.950379086

In [21]:
# Coefficient of determination (R^2) of the predictions on the hold-out set.
r2_score(y_true=y_test, y_pred=y_pred)

0.9153343302134074