In [107]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score
import pickle

In [108]:
# Load the dataset
dataset = pd.read_csv("Diamonds_data.csv")

In [109]:
dataset

Unnamed: 0,carat,cut,color,clarity,price,x,y,z
0,0.23,Ideal,E,SI2,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,2757,6.15,6.12,3.74


In [110]:
# Perform one-hot encoding on categorical variables
dataset = pd.get_dummies(dataset, dtype=int, drop_first=True)

In [111]:
dataset.columns

Index(['carat', 'price', 'x', 'y', 'z', 'cut_Good', 'cut_Ideal', 'cut_Premium',
       'cut_Very Good', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I',
       'color_J', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')

In [112]:
# Define independent and dependent variables
independent = dataset[['carat', 'x', 'y', 'z', 'cut_Good', 'cut_Ideal', 'cut_Premium',
                       'cut_Very Good', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I',
                       'color_J', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
                       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']]
dependent = dataset['price']

In [113]:
independent



Unnamed: 0,carat,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,color_F,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,3.95,3.98,2.43,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0.21,3.89,3.84,2.31,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0.23,4.05,4.07,2.31,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0.29,4.20,4.23,2.63,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0.31,4.34,4.35,2.75,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,5.75,5.76,3.50,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
53936,0.72,5.69,5.75,3.61,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
53937,0.70,5.66,5.68,3.56,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
53938,0.86,6.15,6.12,3.74,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [114]:
dependent

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53940, dtype: int64

In [115]:
# Split into training set and test set
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=1/3, random_state=0)

In [116]:
#standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [117]:
X_train

array([[-1.0095679 , -1.15241392, -1.14784879, ..., -0.54259547,
        -0.27003777,  3.12358852],
       [ 0.23367786,  0.35264161,  0.39097671, ...,  1.84299363,
        -0.27003777, -0.3201446 ],
       [-0.20883334,  0.09437764,  0.03652814, ..., -0.54259547,
        -0.27003777, -0.3201446 ],
       ...,
       [-1.0095679 , -1.2325648 , -1.16513896, ..., -0.54259547,
        -0.27003777, -0.3201446 ],
       [-0.81992024, -0.82290472, -0.81933548, ..., -0.54259547,
        -0.27003777, -0.3201446 ],
       [ 0.23367786,  0.4506038 ,  0.39097671, ..., -0.54259547,
        -0.27003777, -0.3201446 ]])

In [123]:
# Define the different kernel options
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
results = {}

# Loop through each kernel, train the model, and calculate the R^2 score
for kernel in kernels:
    regressor = SVR(kernel=kernel, C=100)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[kernel] = r2
    print(f"R^2 Score for kernel='{kernel}': {r2}")

# Print the results
print("\nComparison of R^2 scores for different kernels:")
for kernel, r2 in results.items():
    print(f"Kernel: {kernel}, R^2 Score: {r2}")

R^2 Score for kernel='linear': 0.9027794778812165
R^2 Score for kernel='poly': 0.890138496512053
R^2 Score for kernel='rbf': 0.9446368588944677
R^2 Score for kernel='sigmoid': -9.736710822648726

Comparison of R^2 scores for different kernels:
Kernel: linear, R^2 Score: 0.9027794778812165
Kernel: poly, R^2 Score: 0.890138496512053
Kernel: rbf, R^2 Score: 0.9446368588944677
Kernel: sigmoid, R^2 Score: -9.736710822648726


In [124]:
# Save the model to a file using pickle
regressor = SVR(kernel='rbf', C=100)
regressor.fit(X_train, y_train)
filename="final_model_svm_diamond_price.sav"
pickle.dump(regressor,open(filename,"wb"))

In [125]:
# Example predictions
def predict_price(carat, cut, color, clarity, x, y, z):
    filename="final_model_svm_diamond_price.sav"
    model=pickle.load(open(filename,"rb"))
    # One-hot encode categorical inputs based on the model's categories
    cut_categories = ['Good', 'Ideal', 'Premium', 'Very Good']
    color_categories = ['E', 'F', 'G', 'H', 'I', 'J']
    clarity_categories = ['IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']

    cut_encoded = [1 if cut == category else 0 for category in cut_categories]
    color_encoded = [1 if color == category else 0 for category in color_categories]
    clarity_encoded = [1 if clarity == category else 0 for category in clarity_categories]

    # Prepare input features
    input_features = [carat, x, y, z] + cut_encoded + color_encoded + clarity_encoded
    input_features_array = np.array(input_features).reshape(1, -1)

    # Predict using the model
    return model.predict(input_features_array)[0]

In [126]:
# Example usage:
carat_input = 0.7
cut_input = 'Ideal'
color_input = 'E'
clarity_input = 'VS1'
x_input = 5.73
y_input = 5.75
z_input = 3.55
predicted_price = predict_price(carat_input, cut_input, color_input, clarity_input, x_input, y_input, z_input)
print(f"Predicted Price: {predicted_price}")

Predicted Price: 11135.635368112704


In [127]:
# Example usage with original data:
original_carat_input = 0.23
original_cut_input = 'Ideal'
original_color_input = 'E'
original_clarity_input = 'SI2'
original_x_input = 3.95
original_y_input = 3.98
original_z_input = 2.43
original_predicted_price = predict_price(original_carat_input, original_cut_input, original_color_input, original_clarity_input, original_x_input, original_y_input, original_z_input)
print(f"Original Predicted Price: {original_predicted_price}")
print(f"Expected  Price 326")

Original Predicted Price: 14298.284520816958
Expected  Price 326
