Objective: Feed all of the diamond dataset values through a regression model with the goal of predicting prices of these diamonds. #regression

In [1]:
import pandas as pd

df = pd.read_csv("datasets/diamonds.csv", index_col=0) #dont generate duplicate indexes
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

Assign arbitrary code to unique values (eg. cuts)

In [3]:
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}


df["cut"] = df['cut'].map(cut_class_dict)
df["clarity"] = df['clarity'].map(clarity_dict)
df["color"] = df['color'].map(color_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [4]:
import sklearn
from sklearn.linear_model import SGDRegressor

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
y = df["price"].values

In [5]:
test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

In [6]:
clf = SGDRegressor(max_iter=1000)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))

-126959524.25045304


In [7]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

36885153.19058943 18572
59319642.19429064 1999
-3213780.869050026 5221
78020118.20477104 434
18770555.13153553 832
7570097.341078758 6295
18906571.2821455 5767
34754757.59892559 1101
73179181.97574139 12224
53574067.79190016 846


In [8]:
from sklearn import svm

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

-0.13284344056897934


In [9]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

2559.3094125462885 18572
2409.4460119870487 1999
2492.9275549266135 5221
2284.7839527918745 434
2313.503061014993 832
2455.6275915380297 6295
2422.7531895599595 5767
2396.995960812321 1101
2501.682718478425 12224
2306.324116795473 846


In [10]:
clf = SGDRegressor(max_iter=10000)

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

-126574565.96111627
-52979267.791023254 18572
-65512597.20752096 1999
-27612744.939785004 5221
-79245827.02051687 434
-41092050.74426508 832
-35717002.78983736 6295
-38192605.896811485 5767
-51199871.194293976 1101
-72986580.55026913 12224
-61319794.46011877 846


In [11]:
import sklearn
from sklearn import svm, preprocessing

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
X = preprocessing.scale(X)
y = df["price"].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(f"model predicts {clf.predict([X])[0]}, real value: {y}")

0.5521628352895042
model predicts 1400.6307368494154, real value: 1418
model predicts 6041.964586985125, real value: 6294
model predicts 4414.398476865016, real value: 5906
model predicts 716.216073234853, real value: 828
model predicts 6624.415680352927, real value: 8549
model predicts 1382.7041874883355, real value: 1232
model predicts 4210.888306832347, real value: 4398
model predicts 4970.77611790473, real value: 14666
model predicts 6220.3587169427865, real value: 14715
model predicts 1235.082760339175, real value: 478
