#### Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import SVR

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1, l2, l1_l2

In [2]:
# Load the transaction data and convert the 'date' strings to pandas datetimes.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a row
# index that was saved into the CSV — confirm whether it should be kept as data.
df = pd.read_csv('./data/nft_all.csv').assign(
    date=lambda sales: pd.to_datetime(sales['date'])
)
df.head()

Unnamed: 0,date,token_id,payment,buyer,seller,price,priceUsd,1/1,4:20 watch,background,...,hat_over_headphones_score,hat_under_headphones_score,headphones_score,long_hair_score,mouth_score,shirt_score,short_hair_score,smoke_score,type_score,rarity_score
0,2022-02-19 05:46:20+00:00,37,ETH,0x6365398036d4ef045254fce2e525ac59760b186b,0xe6fe14a30631b22d3f78d9ec7869eb62d6e73daa,3.3,8370.462853,0,sub red,blue,...,0.0,0.0,36.49635,0.0,4.844961,33.112583,28.735632,0.0,1.690331,117.931694
1,2022-02-24 02:15:15+00:00,37,ETH,0xcb4652bb0396880c3b2609d5a37e19df73180ede,0x6365398036d4ef045254fce2e525ac59760b186b,3.75,9653.748962,0,sub red,blue,...,0.0,0.0,36.49635,0.0,4.844961,33.112583,28.735632,0.0,1.690331,117.931694
2,2022-02-07 06:38:47+00:00,69,ETH,0xc441eb1f41b06826db885e81486b086294a925fd,0xdb6cfff6c0955676bfa8ce795d5beffbab793298,6.9,17423.477522,0,0,green,...,0.0,0.0,2.532287,0.0,1.260081,16.366612,45.248869,2.677376,1.690331,76.219962
3,2022-02-01 03:59:04+00:00,140,ETH,0x8279648470eb92cbcd00ceb8ca30c2adfac20740,0xdb6cfff6c0955676bfa8ce795d5beffbab793298,10.69,32893.590421,pop mfer,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0
4,2022-02-10 20:54:29+00:00,140,ETH,0x0c7598889d8ee611e94566d5cac5cc7c62f516da,0xc55db9017f9fa92f514050fb64439a2cac4ec458,12.0,30301.700037,pop mfer,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0


#### Subsetting Columns for Modeling Purposes

In [3]:
# Names of the '<trait>_score' columns, plus 'rarity_score' as the final entry
# (later cells rely on it being last when slicing with score_cols[:-1]).
score_cols = [
    f"{prefix}_score"
    for prefix in (
        "1of1", "watch", "background",
        "beard", "chain", "eyes",
        "hat_over_headphones", "hat_under_headphones", "headphones",
        "long_hair", "mouth", "shirt",
        "short_hair", "smoke", "type",
        "rarity",
    )
]

In [4]:
# Names of the '<trait>_rarity' columns. Unlike score_cols, this list has no 'type'
# entry and no overall total.
rarity_cols = [
    f"{prefix}_rarity"
    for prefix in (
        "1of1", "watch", "background",
        "beard", "chain", "eyes",
        "hat_over_headphones", "hat_under_headphones", "headphones",
        "long_hair", "mouth", "shirt",
        "short_hair", "smoke",
    )
]

In [5]:
# Raw trait columns (the preview shows label values such as 'blue' or 'sub red').
# They are excluded from every model's feature matrix below.
feature_cols = [
    '1/1',
    '4:20 watch',
    'background',
    'beard',
    'chain',
    'eyes',
    'hat over headphones',
    'hat under headphones',
    'headphones',
    'long hair',
    'mouth',
    'shirt',
    'short hair',
    'smoke',
    'type',
]

In [6]:
# Read the collection stats as a Series: the first CSV column becomes the index, and the
# one remaining column is squeezed from a single-column DataFrame into a Series.
# read_csv's `squeeze=` keyword was deprecated in pandas 1.4 and removed in 2.0, so the
# squeeze is done on the returned frame instead.
stats = pd.read_csv('./data/stats.csv', index_col=0).squeeze('columns')
stats

one_day_volume                262.080800
one_day_change                 -0.012042
one_day_sales                  81.000000
one_day_average_price           3.235565
seven_day_volume             4460.061519
seven_day_change               -0.436667
seven_day_sales              1082.000000
seven_day_average_price         4.122053
thirty_day_volume           19705.088064
thirty_day_change              15.286376
thirty_day_sales             7848.000000
thirty_day_average_price        2.510842
total_volume                23444.929392
total_sales                 20434.000000
total_supply                10020.000000
count                       10020.000000
num_owners                   5072.000000
average_price                   1.147349
num_reports                     7.000000
market_cap                  41302.972660
floor_price                     2.790000
Name: 0, dtype: float64

In [7]:
# Attach selected collection-level stats to the transaction frame.
# Maps the new df column name to the entry of `stats` it is copied from.
# NOTE: each stat is a single number, so the assignment broadcasts it to every row and
# each of these columns holds one constant value.
stat_sources = {
    '1d_avg_price': 'one_day_average_price',
    '7d_avg_price': 'seven_day_average_price',
    '30d_avg_price': 'thirty_day_average_price',
    'average_price': 'average_price',
    'floor_price': 'floor_price',
}
for new_col, stat_name in stat_sources.items():
    df[new_col] = stats[stat_name]

# Column names added above, in insertion order.
stats_cols = list(stat_sources)

In [8]:
# Columns that are never used as model inputs: transaction metadata, the parties
# involved, and the prices themselves ('priceUsd' is the regression target).
non_cols = [
    'date',
    'token_id',
    'payment',
    'buyer',
    'seller',
    'price',
    'priceUsd',
]

In [9]:
# Order the transactions chronologically (oldest first). Row order matters later, when
# duplicates per token are dropped with keep='last'.
df = df.sort_values('date', ascending=True)
df.shape

(19784, 58)

In [10]:
# Persist the date-sorted modeling frame; the DataFrame index is not written.
df.to_csv(path_or_buf='./data/model_data_v1.csv', index=False)

#### Linear Regression

In [28]:
# Preliminary linear baseline on all transactions.
# Features are whatever remains after excluding the metadata/price columns, the raw trait
# columns and the per-trait rarity columns; the target is the USD price.
# NOTE(review): selecting by exclusion also keeps any column missing from these three
# lists (the data preview shows an 'Unnamed: 0' column) — confirm that is intended.
X = df.drop(columns=[*non_cols, *feature_cols, *rarity_cols])
y = df['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LinearRegression().fit(X_train, y_train)

# (train R^2, test R^2)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.22292488207207473, 0.2610555532491463)

In [29]:
# Same linear model on standardized features: StandardScaler barely changes the scores.
# NOTE(review): that is expected — ordinary least squares with an intercept should be
# essentially insensitive to feature standardization, so the gap is numerical noise.

# The scaler is fitted on the training split only and then applied to both splits.
z = StandardScaler().fit(X_train)
Z_train, Z_test = z.transform(X_train), z.transform(X_test)

lr = LinearRegression().fit(Z_train, y_train)

# (train R^2, test R^2)
lr.score(Z_train, y_train), lr.score(Z_test, y_test)

(0.22292544743440268, 0.2612702939046466)

In [26]:
# Build a frame limited to the most recent transaction for each token_id.
#
# keep='last' only means "most recent" when rows are in chronological order, so the sort
# is done here instead of relying on another cell having run first. mergesort is stable:
# a frame that is already sorted by date passes through unchanged.
# drop_duplicates returns a new DataFrame, so `df` itself is left untouched and no
# explicit copy is needed.
df_lim = (
    df.sort_values(by='date', kind='mergesort')
      .drop_duplicates(subset='token_id', keep='last')
)
df_lim.shape

(8983, 58)

In [14]:
# Persist the one-transaction-per-token frame; the DataFrame index is not written.
df_lim.to_csv(path_or_buf='./data/model_single_txns.csv', index=False)

In [15]:
# df_lim (one transaction per token): the train and test R^2 scores end up closer
# together, but the model underperforms the one fitted on all transactions.
X = df_lim.drop(columns=[*non_cols, *feature_cols, *rarity_cols])
y = df_lim['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LinearRegression().fit(X_train, y_train)

# (train R^2, test R^2)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.24891829453098935, 0.23323534387284495)

In [16]:
# Here the score columns are dropped, so the rarity columns are used instead.
# Finding: using the score columns leads to better performance compared to using the
# rarity columns.
X = df_lim.drop(columns=[*non_cols, *feature_cols, *score_cols])
y = df_lim['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LinearRegression().fit(X_train, y_train)

# (train R^2, test R^2)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.2032307703417925, 0.20203787808327411)

In [17]:
# score_cols[:-1] drops every score column except the last one, 'rarity_score', so the
# features are the rarity columns plus the total rarity_score.
# Finding: this performs better than using the score columns instead of the rarity
# columns, and better than using the rarity columns alone.
X = df_lim.drop(columns=[*non_cols, *feature_cols, *score_cols[:-1]])
y = df_lim['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LinearRegression().fit(X_train, y_train)

# (train R^2, test R^2)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.24698474690138683, 0.2339769369370236)

#### Support Vector Machine

In [18]:
# Linear support vector regressor on standardized features — performs poorly.
# Fitted on the best-scoring column set so far: the score columns and the stats columns.

X = df.drop(columns=non_cols+feature_cols+rarity_cols)
y = df['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scaler statistics come from the training split only, then are applied to both splits.
z=StandardScaler()
Z_train = z.fit_transform(X_train)
Z_test = z.transform(X_test)

# Seeded like the other stochastic estimators in this notebook: LinearSVR's solver uses
# random numbers, so an unseeded fit is not guaranteed to reproduce between runs.
svr = LinearSVR(random_state=42)
svr.fit(Z_train, y_train)

# (train R^2, test R^2)
svr.score(Z_train, y_train), svr.score(Z_test, y_test)

(0.018896219266142156, 0.10551028031300891)

In [19]:
# LinearSVR only uses a linear kernel, so try SVR, whose default kernel is 'rbf'.
# NOTE(review): R^2 is negative on both splits. The target is an unscaled USD price and
# SVR is left at its defaults, which may be the cause — confirm before ruling SVR out.
X = df.drop(columns=[*non_cols, *feature_cols, *rarity_cols])
y = df['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# The scaler is fitted on the training split only and then applied to both splits.
z = StandardScaler().fit(X_train)
Z_train, Z_test = z.transform(X_train), z.transform(X_test)

svr = SVR().fit(Z_train, y_train)

# (train R^2, test R^2)
svr.score(Z_train, y_train), svr.score(Z_test, y_test)

(-0.15959392217453505, -0.14697128183047625)

#### Neighbors, Tree, Ensemble Methods

In [20]:
# Shared setup for the models below: all transactions, split into train/test, with both
# raw (X_*) and standardized (Z_*) versions of the features.
X = df.drop(columns=[*non_cols, *feature_cols, *rarity_cols])
y = df['priceUsd']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# The scaler is fitted on the training split only and then applied to both splits.
z = StandardScaler().fit(X_train)
Z_train, Z_test = z.transform(X_train), z.transform(X_test)

In [21]:
# k-nearest neighbours with default settings, fitted on the standardized features.
kn = KNeighborsRegressor().fit(Z_train, y_train)

# (train R^2, test R^2)
kn.score(Z_train, y_train), kn.score(Z_test, y_test)

(0.39145020990285095, 0.15592833041599075)

In [22]:
# Single decision tree with default settings, fitted on the unscaled features.
dr = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# (train R^2, test R^2)
dr.score(X_train, y_train), dr.score(X_test, y_test)

(0.6347123017614219, -0.2535874535039875)

In [23]:
# Bagging ensemble with default settings, fitted on the unscaled features.
br = BaggingRegressor(random_state=42).fit(X_train, y_train)

# (train R^2, test R^2)
br.score(X_train, y_train), br.score(X_test, y_test)

(0.5805455125559239, 0.038099980392167776)

In [24]:
# Random forest with default settings, fitted on the unscaled features.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# (train R^2, test R^2)
rf.score(X_train, y_train), rf.score(X_test, y_test)

(0.6057902673009586, 0.05767219190889128)

In [25]:
# AdaBoost with default settings, fitted on the unscaled features.
ada = AdaBoostRegressor(random_state=42).fit(X_train, y_train)

# (train R^2, test R^2)
ada.score(X_train, y_train), ada.score(X_test, y_test)

(0.2245621946007691, 0.15010840618013976)