In [2]:
import pandas as pd

# Read the diamonds table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="datasets/diamonds.csv", index_col=0)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# List the distinct labels in the "cut" column, in order of first appearance.
pd.unique(df["cut"])

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [4]:
# Ordinal rank for every cut label: the labels are listed in ascending rank
# order and numbered from 1.
cut_class_dict = {
    grade: rank
    for rank, grade in enumerate(
        ("Fair", "Good", "Very Good", "Premium", "Ideal"), start=1
    )
}

In [5]:
# List the distinct labels in the "clarity" column, in order of first appearance.
pd.unique(df["clarity"])

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [6]:
# Ordinal rank for every clarity label, listed in ascending rank order and
# numbered from 1. The table covers more labels than the "clarity" column
# currently contains (I3, I2 and FL do not appear in the data).
clarity_dict = {
    grade: rank
    for rank, grade in enumerate(
        ("I3", "I2", "I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF", "FL"),
        start=1,
    )
}

# Ordinal rank for every colour letter: "J" maps to 1 and "D" maps to 7.
color_dict = {letter: rank for rank, letter in enumerate("JIHGFED", start=1)}

In [7]:
# Replace the three categorical columns with their ordinal ranks.
#
# Series.map() silently produces NaN for any label that is missing from its
# mapping, and running this cell a second time would map the already-encoded
# integers (which are not keys of the dictionaries) to NaN as well. Each column
# is therefore checked before it is overwritten, which makes the cell safe to
# re-run and turns an unexpected label into an explicit error.
ordinal_encodings = {
    "cut": cut_class_dict,
    "clarity": clarity_dict,
    "color": color_dict,
}
for column, mapping in ordinal_encodings.items():
    if df[column].isin(list(mapping.values())).all():
        continue  # column already holds ranks from a previous run of this cell
    encoded = df[column].map(mapping)
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped):
        raise ValueError(
            f"column {column!r} contains values with no ordinal rank: {list(unmapped)}"
        )
    df[column] = encoded
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [12]:
import sklearn
from sklearn.linear_model import SGDRegressor
from sklearn import svm, preprocessing

# Shuffle the rows so that any ordering present in the file cannot bias a
# positional train/test split. The seed is fixed so that the row order, and
# therefore every score computed from it, is the same on each
# Restart & Run All.
SHUFFLE_SEED = 42
df = sklearn.utils.shuffle(df, random_state=SHUFFLE_SEED)

# Features are every column except the target "price", standardised column by
# column with preprocessing.scale().
# NOTE(review): the scaling statistics are computed over all rows, including
# the ones later held out for testing — consider fitting a scaler on the
# training rows only.
X = preprocessing.scale(df.drop("price", axis=1).values)
y = df["price"].values

In [13]:
# Display the feature matrix returned by preprocessing.scale().
X

array([[-1.0926841 ,  0.98147332, -0.23855468, ..., -1.40063002,
        -1.34357201, -1.30189003],
       [-0.8395232 , -0.80969515, -0.23855468, ..., -0.9459834 ,
        -0.90579121, -0.90511654],
       [-0.20662095, -1.70527938,  0.93716275, ..., -0.10800729,
        -0.13529699,  0.03013526],
       ...,
       [ 1.92414994,  0.98147332, -0.8264134 , ...,  1.7373231 ,
         1.72964923,  1.67391115],
       [-0.62855578,  0.98147332,  0.34930404, ..., -0.5626539 ,
        -0.52929972, -0.49417257],
       [ 0.86931286, -1.70527938, -1.41427211, ...,  0.86368842,
         0.81030954,  1.05040995]])

In [14]:
# Hold out the last `test_size` rows for evaluation; everything before them
# is used for training.
test_size = 200
train_rows = slice(None, -test_size)
test_rows = slice(-test_size, None)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# Fit a support-vector regressor with a linear kernel on the training rows.
# fit() is the last expression so the fitted estimator is shown as cell output.
clf = svm.SVR(kernel="linear")
clf.fit(X=X_train, y=y_train)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [15]:
# Evaluate the currently fitted model on the held-out rows via its score() method.
clf.score(X=X_test, y=y_test)

0.8636153020651703

In [16]:
# Spot-check the first ten held-out rows: model prediction next to the true price.
# The loop targets are deliberately not named X / y: for-loop targets stay bound
# in the notebook namespace after the loop ends, so reusing those names would
# replace the full feature matrix and target vector with a single row and a
# single price, breaking any later (re-)run of cells that read X or y.
for features, actual_price in zip(X_test[:10], y_test[:10]):
    print(f"model predicts {clf.predict([features])[0]}, real value: {actual_price}")

model predicts 5345.338452971712, real value: 4829
model predicts 4220.596334800687, real value: 4092
model predicts 2174.4855745865943, real value: 1690
model predicts 827.0021099679225, real value: 747
model predicts 6798.273760936092, real value: 5546
model predicts 613.1315121839598, real value: 598
model predicts 1886.6115912646853, real value: 1576
model predicts 6711.339699641458, real value: 7276
model predicts 2103.4521154602257, real value: 1785
model predicts 441.64504767771086, real value: 706


In [17]:
# Refit on the same training rows with an RBF kernel. This rebinds `clf`, so
# the previously fitted estimator is no longer reachable under that name.
# fit() is the last expression so the fitted estimator is shown as cell output.
clf = svm.SVR(kernel="rbf")
clf.fit(X=X_train, y=y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [18]:
# Evaluate the currently fitted model on the held-out rows via its score() method.
clf.score(X=X_test, y=y_test)

0.5876682287046402

In [19]:
# Spot-check the first ten held-out rows: model prediction next to the true price.
# The loop targets are deliberately not named X / y: for-loop targets stay bound
# in the notebook namespace after the loop ends, so reusing those names would
# replace the full feature matrix and target vector with a single row and a
# single price, breaking any later (re-)run of cells that read X or y.
for features, actual_price in zip(X_test[:10], y_test[:10]):
    print(f"model predicts {clf.predict([features])[0]}, real value: {actual_price}")

model predicts 5090.322344650553, real value: 4829
model predicts 3962.4643856552793, real value: 4092
model predicts 2194.8576050177944, real value: 1690
model predicts 830.8867100048928, real value: 747
model predicts 5746.399814772028, real value: 5546
model predicts 1030.5008871341283, real value: 598
model predicts 1525.0070398272237, real value: 1576
model predicts 5513.008619670565, real value: 7276
model predicts 1781.2760056015939, real value: 1785
model predicts 349.7584905166368, real value: 706
