In [1]:
import pandas as pd
df = pd.read_csv("diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
df["cut"].astype("category").cat.codes
#good for classification
#we will not use it

1        2
2        3
3        1
4        3
5        1
        ..
53936    2
53937    1
53938    4
53939    3
53940    2
Length: 53940, dtype: int8

In [4]:
cut_class_dict = {"Fair":1, "Good":2, "Very Good":3, "Premium":4, "Ideal":5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}
#meaningful values in order

df['cut']=df['cut'].map(cut_class_dict)
df['clarity']=df['clarity'].map(clarity_dict)
df['color']=df['color'].map(color_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [5]:
import sklearn
from sklearn import svm, preprocessing
#better to shuffle the date, especially if is sorted in any way.
#it looks like the dataset is ordered by price
df = sklearn.utils.shuffle(df)
#feature set:
x = df.drop("price",axis=1).values #drop the column (axis=1)
#scaling the data:
#bring the range of all the values to the range of the model
x=preprocessing.scale(x)
#labels (price):
y =df['price'].values
#df['price'].head()

test_size=200

x_train = x[:-test_size]
y_train = y[:-test_size]

x_test = x[-test_size:]
y_test = y[-test_size:]

#classifier
clf = svm.SVR(kernel="linear")
clf.fit(x_train, y_train)

SVR(kernel='linear')

In [6]:
x

array([[-0.8395232 , -1.70527938, -0.23855468, ..., -0.96381268,
        -0.90579121, -0.83426413],
       [-0.88171668,  0.98147332,  1.52502147, ..., -0.96381268,
        -0.93205805, -0.9334575 ],
       [ 3.21105117, -0.80969515, -1.41427211, ...,  2.51289674,
         2.3688092 ,  2.59499247],
       ...,
       [ 1.48111837,  0.08588908,  0.34930404, ...,  1.49662783,
         1.43195828,  1.29130814],
       [-0.3332014 , -0.80969515,  1.52502147, ..., -0.23281224,
        -0.18783069, -0.05488763],
       [-1.05049061,  0.98147332, -1.41427211, ..., -1.23125187,
        -1.23850462, -1.18852618]])

In [7]:
clf.score(x_test,y_test)

0.8674618265514162

In [8]:
x_test

array([[-0.60745904, -0.80969515,  1.52502147, ..., -0.58048318,
        -0.52929972, -0.50834305],
       [ 0.531765  , -2.60086361,  0.34930404, ...,  0.5873346 ,
         0.52137421,  0.80951176],
       [ 1.10137702, -1.70527938, -2.00213083, ...,  1.08655441,
         1.09048926,  1.2487967 ],
       ...,
       [ 1.48111837,  0.08588908,  0.34930404, ...,  1.49662783,
         1.43195828,  1.29130814],
       [-0.3332014 , -0.80969515,  1.52502147, ..., -0.23281224,
        -0.18783069, -0.05488763],
       [-1.05049061,  0.98147332, -1.41427211, ..., -1.23125187,
        -1.23850462, -1.18852618]])

In [9]:
for X,y in zip(x_test, y_test):
    #print(f"X is {X}")
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 1541.8053782089341, Actual: 1569
Model: 4950.03528818104, Actual: 5186
Model: 6400.621076371512, Actual: 4573
Model: 3650.8341504084674, Actual: 2661
Model: 6078.707901799224, Actual: 6471
Model: 1510.923210848815, Actual: 1353
Model: 12484.545690457757, Actual: 14556
Model: 4854.554162565658, Actual: 4899
Model: 1639.0182625730195, Actual: 1589
Model: 375.35455531621074, Actual: 707
Model: 842.0155037884988, Actual: 807
Model: 3788.324976884848, Actual: 3307
Model: 161.19995178050885, Actual: 552
Model: 5489.450919435323, Actual: 7415
Model: 1445.69867993842, Actual: 1001
Model: 1511.8197437391696, Actual: 1693
Model: 818.8175305448108, Actual: 836
Model: 1017.5587660022552, Actual: 999
Model: -21.53955111614414, Actual: 526
Model: 2343.635283594317, Actual: 1708
Model: 13853.522376491972, Actual: 14108
Model: 1373.8806730690967, Actual: 1304
Model: 345.26074390364056, Actual: 788
Model: 3132.170119983729, Actual: 2585
Model: 2403.7601692516623, Actual: 2453
Model: 535.63285340

In [10]:
clf = svm.SVR(kernel="rbf")
clf.fit(x_train, y_train)

SVR()

In [11]:
clf.score(x_test,y_test)

0.5477869114871553

In [12]:
for X,y in zip(x_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 1393.003393733414, Actual: 1569
Model: 4006.1299137440787, Actual: 5186
Model: 4940.688753511315, Actual: 4573
Model: 3534.706310684136, Actual: 2661
Model: 5860.931927218635, Actual: 6471
Model: 1488.5067118889815, Actual: 1353
Model: 5998.1108811123395, Actual: 14556
Model: 4499.525589388678, Actual: 4899
Model: 1873.0824849083997, Actual: 1589
Model: 1041.452041042619, Actual: 707
Model: 762.2498858136546, Actual: 807
Model: 3482.1421068909954, Actual: 3307
Model: 1400.7020935715384, Actual: 552
Model: 4688.453304046844, Actual: 7415
Model: 1317.487104889017, Actual: 1001
Model: 1263.5746949200209, Actual: 1693
Model: 970.9141993106728, Actual: 836
Model: 799.3796632455669, Actual: 999
Model: 559.3326837580262, Actual: 526
Model: 2502.6284652656886, Actual: 1708
Model: 6173.942816203966, Actual: 14108
Model: 1495.5333361161652, Actual: 1304
Model: 1002.1505090866599, Actual: 788
Model: 2965.5495110639367, Actual: 2585
Model: 2443.2638452244078, Actual: 2453
Model: 968.2763835