In [112]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
diamonds = sns.load_dataset('diamonds')
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [None]:
# check for missing values
missing_values = diamonds.isnull().sum()
missing_values

Unnamed: 0,0
carat,0
cut,0
color,0
clarity,0
depth,0
table,0
price,0
x,0
y,0
z,0


In [None]:
# identify and remove outliers
Q1 = diamonds[['carat', 'x', 'y', 'z', 'depth', 'table', 'price']].quantile(0.25)
Q3 = diamonds[['carat', 'x', 'y', 'z', 'depth', 'table', 'price']].quantile(0.75)

IQR = Q3 - Q1

diamonds  = diamonds[~((diamonds[['carat', 'x', 'y', 'z', 'depth', 'table', 'price']] < (Q1 - 1.5 * IQR)) |
                               (diamonds[['carat', 'x', 'y', 'z', 'depth', 'table', 'price']] > (Q3 + 1.5 * IQR))).any(axis=1)]

diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [None]:
# mapping ordinal categorical variables to numerical values
diamonds = diamonds.assign(
    cut=diamonds['cut'].map({
        'Ideal': 5,
        'Premium': 4,
        'Very Good': 3,
        'Good': 2,
        'Fair': 1
    }),
    color=diamonds['color'].map({
        'D': 7,
        'E': 6,
        'F': 5,
        'G': 4,
        'H': 3,
        'I': 2,
        'J': 1
    }),
    clarity=diamonds['clarity'].map({
        'IF': 9,
        'VVS1': 8,
        'VVS2': 7,
        'VS1': 6,
        'VS2': 5,
        'IF': 4,
        'SI1': 3,
        'SI2': 2,
        'I1': 1
    })
)

diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,5,6,2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,4,6,3,59.8,61.0,326,3.89,3.84,2.31
3,0.29,4,2,5,62.4,58.0,334,4.20,4.23,2.63
4,0.31,2,1,2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,3,1,7,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,5,7,3,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,2,7,3,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,3,7,3,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,4,3,2,61.0,58.0,2757,6.15,6.12,3.74


In [None]:
# training the model
X_train, X_test, y_train, y_test = train_test_split(diamonds.drop(columns = ['price']), diamonds.price, test_size = 0.2)

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# predicting on the train and test sets
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [None]:
# evaluation
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("R^2 Score on Training Set:", r2_train)
print("R^2 Score on Test Set:", r2_test)

R^2 Score on Training Set: 0.9108900089505836
R^2 Score on Test Set: 0.9110561286681519
