#### Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVR

#### Importing Data

In [2]:
# Column groups of the modelling table; later cells combine them to choose
# which columns are removed before fitting.

# Trait stems that appear both as "<stem>_score" and as "<stem>_rarity".
_trait_stems = [
    "1of1", "watch", "background", "beard", "chain", "eyes",
    "hat_over_headphones", "hat_under_headphones", "headphones",
    "long_hair", "mouth", "shirt", "short_hair", "smoke",
]

# Per-trait score columns, followed by the type score and the overall rarity score.
score_cols = [f"{stem}_score" for stem in _trait_stems] + ["type_score", "rarity_score"]

# Per-trait rarity columns.
# NOTE(review): there is no "type_rarity" entry, so dropping `rarity_cols`
# leaves a `type_rarity` column (if the table has one) in place — confirm
# that this is intended.
rarity_cols = [f"{stem}_rarity" for stem in _trait_stems]

# Raw trait columns (the table preview shows string values such as
# "sub rose" / "blue" in some of them).
feature_cols = [
    '1/1', '4:20 watch', 'background', 'beard', 'chain', 'eyes',
    'hat over headphones', 'hat under headphones', 'headphones',
    'long hair', 'mouth', 'shirt', 'short hair', 'smoke', 'type',
]

# Identifier, party and price columns that are never used as model inputs.
non_cols = ['date', 'token_id', 'payment', 'buyer', 'seller', 'price', 'priceUsd']

In [3]:
# Load the modelling table from the project-relative data directory and
# preview its first rows.
dfv2 = pd.read_csv('./data/model_data_v2.csv')
dfv2.head()

Unnamed: 0,date,token_id,payment,buyer,seller,price,priceUsd,1/1,4:20 watch,background,...,30d_avg_price,average_price,floor_price,last_10_avg,last_10_max,last_10_min,last_25_avg,last_25_max,last_25_min,last_sale_price
0,2021-11-30 21:35:15+00:00,1848,ETH,0xdd6d3324c75a5ae1480ef9f3394702dc28adb65e,0x97da20dc2bd46174da79d71a385b7248527fbee6,0.349,1623.037992,0,sub rose,blue,...,2.510842,1.147349,2.79,1833.707393,2325.269329,1581.183144,2232.053932,4650.538658,1581.183144,0.34
1,2021-11-30 21:35:15+00:00,6531,ETH,0xc2690edde996e5c56c9b7fafcdbdf257a780446b,0x7c121489e50e6672bbb59cc4c3fc86eaa8fb8364,0.35,1627.68853,0,sub black,blue,...,2.510842,1.147349,2.79,1819.755777,2325.269329,1581.183144,2111.139927,4418.011725,1581.183144,0.349
2,2021-11-30 21:35:15+00:00,4147,ETH,0xa7856843b4298a55a23a902ced79cc63c880d0d7,0x9e9b7dc5a0cfa97e3cc984d836d969ac13746951,0.385,1790.457383,0,sub red,red,...,2.510842,1.147349,2.79,1792.317599,2325.269329,1581.183144,2054.403355,4418.011725,1581.183144,0.35
3,2021-11-30 21:35:15+00:00,6176,ETH,0xd6a4eb571e6812d72f86076e9d2f902fef288278,0xf50131d7d2b5239fe1e934658fe3f6131532a437,0.45,2092.742396,0,0,blue,...,2.510842,1.147349,2.79,1769.064905,2325.269329,1581.183144,2052.54314,4418.011725,1581.183144,0.385
4,2021-11-30 21:35:15+00:00,7061,ETH,0x2ab5cf2e5b79a96bf70302de6f799c80e2a4b7cf,0xdf3fb2a51568f84c2fbafea5f8ad039b8d931b12,0.4,1860.215463,0,0,green,...,2.510842,1.147349,2.79,1785.341791,2325.269329,1581.183144,1998.596891,4418.011725,1581.183144,0.45


#### Best Models

In [4]:
# Optimal Support Vector Machine Regressor Model
#
# Model inputs are whatever columns remain once the identifier/price columns
# (non_cols), the raw trait columns (feature_cols) and the per-trait rarity
# columns (rarity_cols) are removed; the target is the `priceUsd` column.
X = dfv2.drop(columns=[*non_cols, *feature_cols, *rarity_cols])
y = dfv2['priceUsd']

# Default hold-out proportion, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions.
z = StandardScaler().fit(X_train)
Z_train = z.transform(X_train)
Z_test = z.transform(X_test)

# Polynomial-kernel SVR fitted on the standardised training rows.
svr = SVR(kernel='poly', C=0.01, gamma=1).fit(Z_train, y_train)

# Displayed as (train score, test score).
(svr.score(Z_train, y_train), svr.score(Z_test, y_test))

(0.8625250995842414, 0.8632503439566752)

In [5]:
# Predict with the fitted SVR on the standardised hold-out rows.
preds = svr.predict(Z_test)

In [6]:
# Root-mean-squared error of the SVR predictions on the hold-out rows.
# The square root is taken explicitly: the `squared` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so `squared=False` raises TypeError on current releases.
mean_squared_error(y_test, preds) ** 0.5

1789.924022165694

In [7]:
# Mean absolute error of the SVR predictions on the hold-out rows.
mean_absolute_error(y_test, preds)

901.5117147292058

In [8]:
# R^2 of the SVR predictions on the hold-out rows.
r2_score(y_test, preds)

0.8632503439566752

In [9]:
# Residual table for the SVR: attach the predictions to the hold-out rows,
# bring the identifier and actual prices back in from dfv2 by row index, and
# rank the rows by absolute error (largest misses first).
#
# This reads the X_test / preds produced by the SVR cells, so it has to run
# before the linear-regression cell rebinds X_test with a different split.
#
# Built as one chain of copies (assign / a single merge / .loc) rather than
# three separate merges followed by column assignment on a column subset,
# which can raise SettingWithCopyWarning on pandas without Copy-on-Write.
svr_df = (
    X_test
    .assign(preds=preds)
    # Inner join on the row index, exactly as the individual merges did.
    .merge(dfv2[['token_id', 'price', 'priceUsd']], left_index=True, right_index=True)
    .loc[:, ['token_id', 'rarity_score', 'last_sale_price', 'price', 'priceUsd', 'preds']]
    # Signed residual: actual minus predicted.
    .assign(resids=lambda frame: frame['priceUsd'] - frame['preds'])
    .assign(abs_resids=lambda frame: frame['resids'].abs())
    .sort_values(by='abs_resids', ascending=False)
)
svr_df.head(10)

Unnamed: 0,token_id,rarity_score,last_sale_price,price,priceUsd,preds,resids,abs_resids
2864,1079,712.388097,0.216,2.5,11770.143827,-20964.44808,32734.591906,32734.591906
17060,904,3356.067783,2.25,50.0,126373.806442,96813.671363,29560.135079,29560.135079
17473,6812,171.372734,2.89,10.0,25365.038948,6343.350804,19021.688144,19021.688144
19440,8821,13.876604,4.0,10.5,27885.885009,9178.813607,18707.071403,18707.071403
19494,8024,114.590344,3.4,14.0,35519.998986,17977.38965,17542.609336,17542.609336
19476,2650,28.213427,3.495,8.5,22085.66773,5101.562984,16984.104746,16984.104746
17328,1146,91.38856,1.948275,8.0,20292.031158,4393.530954,15898.500204,15898.500204
19701,2992,89.764287,3.2,10.0,24391.72687,9545.646446,14846.080424,14846.080424
19749,1247,103.365804,3.4,9.0,23876.383968,9610.971147,14265.412821,14265.412821
19753,905,852.082798,3.0,50.0,132555.351359,118757.332636,13798.018724,13798.018724


In [10]:
# Best Linear Regression Model
#
# Uses the same column selection and target as the SVR cell, but a 50/50
# shuffled split and no feature scaling.
# Note: this rebinds X, y, X_train, X_test, y_train and y_test, so any cell
# that reads those names afterwards sees this split, not the SVR one.
X = dfv2.drop(columns=[*non_cols, *feature_cols, *rarity_cols])
y = dfv2['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, shuffle=True, random_state=42
)

# Ordinary least squares fitted on the raw (unscaled) training columns.
lr = LinearRegression().fit(X_train, y_train)

# Displayed as (train score, test score).
(lr.score(X_train, y_train), lr.score(X_test, y_test))

(0.8063627074474381, 0.8059624297484691)

In [11]:
# Mean 10-fold cross-validated score of the linear model over the full X, y.
# NOTE(review): with an integer `cv` the folds are presumably contiguous,
# unshuffled slices of the table, unlike the shuffled hold-out split above —
# that may be why this mean sits well below the hold-out score; confirm which
# evaluation is the intended one.
scores = cross_val_score(lr, X, y, cv=10)
scores.mean()

0.5978231820909435

In [12]:
# Predict with the fitted linear model on the (unscaled) hold-out rows.
y_preds = lr.predict(X_test)

In [13]:
# Root-mean-squared error of the linear-regression predictions on the
# hold-out rows. The square root is taken explicitly: the `squared` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in
# 1.6, so `squared=False` raises TypeError on current releases.
mean_squared_error(y_test, y_preds) ** 0.5

2046.7184077579416

In [14]:
# Mean absolute error of the linear-regression predictions on the hold-out rows.
mean_absolute_error(y_test, y_preds)

697.512718283176

In [15]:
# R^2 of the linear-regression predictions on the hold-out rows.
r2_score(y_test, y_preds)

0.8059624297484691

In [16]:
# Residual table for the linear regression: attach the predictions to the
# hold-out rows, bring the identifier and actual prices back in from dfv2 by
# row index, and rank the rows by absolute error (largest misses first).
#
# Built as one chain of copies (assign / a single merge / .loc) rather than
# three separate merges followed by column assignment on a column subset,
# which can raise SettingWithCopyWarning on pandas without Copy-on-Write.
lr_df = (
    X_test
    .assign(predictions=y_preds)
    # Inner join on the row index, exactly as the individual merges did.
    .merge(dfv2[['token_id', 'price', 'priceUsd']], left_index=True, right_index=True)
    .loc[:, ['token_id', 'rarity_score', 'last_sale_price', 'price', 'priceUsd', 'predictions']]
    # Signed residual: actual minus predicted.
    .assign(residual=lambda frame: frame['priceUsd'] - frame['predictions'])
    .assign(abs_residual=lambda frame: frame['residual'].abs())
    .sort_values(by='abs_residual', ascending=False)
)
lr_df.head(10)

Unnamed: 0,token_id,rarity_score,last_sale_price,price,priceUsd,predictions,residual,abs_residual
19753,905,852.082798,3.0,50.0,132555.351359,37402.644781,95152.706578,95152.706578
17060,904,3356.067783,2.25,50.0,126373.806442,56605.946659,69767.859783,69767.859783
17805,2983,473.910244,3.4,22.0,55803.085685,19665.837329,36137.248357,36137.248357
379,4707,3442.308819,0.22,3.5,16153.89727,46998.831656,-30844.934386,30844.934386
18860,4657,127.165984,7.99,20.0,50381.478942,20246.768842,30134.710099,30134.710099
19544,2210,299.295651,2.5,15.0,37845.702467,8481.979344,29363.723123,29363.723123
16112,7161,363.428213,2.3,15.98,40351.763883,13629.738576,26722.025307,26722.025307
15949,372,477.524645,1.8,15.0,37877.125047,12700.226751,25176.898296,25176.898296
15706,8662,752.776034,1.65,20.0,50502.833396,25350.627328,25152.206068,25152.206068
15191,9651,473.11335,0.93,15.0,37877.125047,13056.471918,24820.653129,24820.653129


In [17]:
# One row per model input column, holding its fitted linear-regression
# coefficient in a single 'coefs' column.
coef_df = pd.Series(lr.coef_, index=list(X.columns), name='coefs').to_frame()
coef_df

Unnamed: 0,coefs
type_rarity,2088.204
1of1_score,-4.807479
watch_score,-5.659738
background_score,-825.4833
beard_score,2.896938
chain_score,2.447727
eyes_score,-18.71362
hat_over_headphones_score,21.16199
hat_under_headphones_score,-2.867071
headphones_score,-1.661495


In [18]:
# Persist the analysis tables next to the input data.
# The two residual tables are written without their row index; the
# coefficient table keeps its index, which holds the input column names.
svr_df.to_csv('./data/svr_analysis.csv', index=False)
lr_df.to_csv('./data/lr_analysis.csv', index=False)
coef_df.to_csv('./data/lr_coefficients.csv')