In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pickle

In [48]:
# Read the diamonds table from a CSV file addressed relative to the working directory
DATA_FILE = "Diamonds_data.csv"
dataset = pd.read_csv(DATA_FILE)

In [49]:
# Replace every categorical column with indicator (dummy) columns.
# drop_first=True omits the first level of each column, so that level
# becomes the implicit baseline (all of its group's dummies are zero).
categorical_columns = ['cut', 'color', 'clarity']
dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

In [50]:
# Feature matrix: numeric measurements first, then the dummy columns grouped
# by the categorical variable they came from. The order defined here is the
# order the model is trained on.
numeric_columns = ['carat', 'x', 'y', 'z']
cut_columns = ['cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good']
color_columns = ['color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J']
clarity_columns = ['clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
                   'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']
feature_columns = numeric_columns + cut_columns + color_columns + clarity_columns
independent = dataset[feature_columns]

# Target vector: the price column
dependent = dataset['price']

In [51]:
# Hold out one third of the rows as the test set; random_state pins the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=1/3,
    random_state=0,
)

In [52]:
# Create the linear regression estimator and fit it on the training split
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [53]:
# Report the fitted parameters: one coefficient per feature column, plus the intercept
coefficient_report = f"Model Coefficients: {regressor.coef_}"
intercept_report = f"Model Intercept: {regressor.intercept_}"
print(coefficient_report)
print(intercept_report)

Model Coefficients: [11038.40873374  -833.33771345    14.28542328  -196.78519119
   637.10065502  1002.06568644   876.46581786   842.86684852
  -181.38673074  -237.76969703  -458.43385826  -954.76440583
 -1432.98274117 -2342.2730957   5351.52435664  3661.7880653
  2694.11530536  4595.61405581  4269.85500754  5032.46450084
  4958.36042266]
Model Intercept: -3771.465586949258


In [54]:
# Predict prices for the held-out test rows
y_pred = regressor.predict(X=X_test)

In [55]:
# Score the test-set predictions against the true prices with R^2
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R^2 Score: {r2}")

R^2 Score: 0.9202478734535089


In [56]:
# Save the trained model to a file using pickle
filename = "final_model_multilinear_diamond_price.sav"
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, even if pickling raises, instead of leaving the handle open
# until it happens to be garbage-collected.
with open(filename, "wb") as model_file:
    pickle.dump(regressor, model_file)

In [57]:
# Helper that loads the pickled model and predicts the price of a single diamond
def predict_price(carat, cut, color, clarity, x, y, z,
                  model_path="final_model_multilinear_diamond_price.sav"):
    """Predict the price of one diamond with the pickled regression model.

    The single-row input is laid out in the column order used for training:
    ``carat, x, y, z`` followed by the dummy columns for cut, color and
    clarity, and it carries those column names.

    Parameters
    ----------
    carat, x, y, z
        Numeric features, passed to the model unchanged.
    cut, color, clarity
        Category labels. Each is compared against the encoded levels listed
        in the body; a label that matches none of them yields an all-zero
        group, i.e. it is treated like the level that was dropped by
        ``drop_first=True`` during encoding. Labels are NOT validated, so a
        misspelled label silently falls into that baseline case.
    model_path : str, optional
        Path of the pickled model. Defaults to the file written by the
        notebook's save cell.

    Returns
    -------
    The first element of ``model.predict`` for the one-row input.
    """
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    # Levels that have a dummy column in the training data, per variable.
    encoded_levels = (
        ('cut', cut, ['Good', 'Ideal', 'Premium', 'Very Good']),
        ('color', color, ['E', 'F', 'G', 'H', 'I', 'J']),
        ('clarity', clarity, ['IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']),
    )

    feature_names = ['carat', 'x', 'y', 'z']
    feature_values = [carat, x, y, z]
    for prefix, selected, levels in encoded_levels:
        for level in levels:
            feature_names.append(f"{prefix}_{level}")
            feature_values.append(1 if selected == level else 0)

    # A named one-row frame (rather than a bare array) matches the feature
    # names the model was fitted with, which avoids scikit-learn's
    # feature-name warning and makes the column order explicit.
    input_features = pd.DataFrame([feature_values], columns=feature_names)

    return model.predict(input_features)[0]

In [58]:
# Example usage: feature values describing one diamond
carat_input, x_input, y_input, z_input = 0.7, 5.73, 5.75, 3.55
cut_input, color_input, clarity_input = 'Ideal', 'E', 'VS1'

In [59]:
# Run the example diamond through the saved model and show the estimate
predicted_price = predict_price(
    carat=carat_input,
    cut=cut_input,
    color=color_input,
    clarity=clarity_input,
    x=x_input,
    y=y_input,
    z=z_input,
)
print(f"Predicted Price: {predicted_price}")

Predicted Price: 3980.2421952966097




In [60]:
# Example usage with original data: feature values for the comparison diamond
original_carat_input, original_x_input, original_y_input, original_z_input = 0.23, 3.95, 3.98, 2.43
original_cut_input, original_color_input, original_clarity_input = 'Ideal', 'E', 'SI2'

In [61]:
# Predict the comparison diamond and print the hard-coded reference price next to it.
# NOTE: a linear model's output is not bounded below, so the estimate can be negative.
original_predicted_price = predict_price(
    carat=original_carat_input,
    cut=original_cut_input,
    color=original_color_input,
    clarity=original_clarity_input,
    x=original_x_input,
    y=original_y_input,
    z=original_z_input,
)
print(f"Original Predicted Price: {original_predicted_price}")
print(f"Expected  Price 326")

Original Predicted Price: -1430.8533151639012
Expected  Price 326


