In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings("ignore")

In [2]:
root_dir = "20230509"
df_rates = pd.read_csv(f"{root_dir}/eth_usd_fx_rates.csv")
df_metadata = pd.read_csv(f"{root_dir}/token_metadata.csv")
df_sales = pd.read_csv(f"{root_dir}/token_sales.csv")

# Exploratory Data Analysis

## ETH Opening prices

In [3]:
df_rates.head()

Unnamed: 0,date,open
0,31/12/2016,8.162
1,01/01/2017,8.018
2,02/01/2017,8.154
3,03/01/2017,8.335
4,04/01/2017,9.639


In [4]:
df_rates.shape

(2263, 2)

In [5]:
df_rates.dtypes

date     object
open    float64
dtype: object

We need to convert our date column into a datetime.

In [6]:
df_rates['date'] = pd.to_datetime(df_rates['date'])
df_rates['date'].min(), df_rates['date'].max()

(Timestamp('2016-12-31 00:00:00'), Timestamp('2023-12-04 00:00:00'))

In [7]:
df_rates['open'].describe()

count    2263.000000
mean     1034.275408
std      1130.476592
min         8.018000
25%       201.005000
50%       429.180000
75%      1642.175000
max      4810.970000
Name: open, dtype: float64

In [8]:
# box plot of opening ETH price since 2017
fig = px.box(df_rates, y="open")
fig.show()

Opening ETH price over time

In [9]:
df_rates.sort_values("date", inplace=True)

In [10]:
# Calculate Moving Average
window_size = 50  # size of the moving window
df_rates['MA'] = df_rates['open'].rolling(window=window_size).mean()

# Calculate Exponential Moving Average
df_rates['EMA'] = df_rates['open'].ewm(span=window_size, adjust=False).mean()

# Sort the dataframe by date
df_rates = df_rates.sort_values(by='date')  

# Create a plotly graph
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=df_rates['date'], y=df_rates['open'], mode='lines', name='Open Price'))
fig.add_trace(go.Scatter(x=df_rates['date'], y=df_rates['MA'], mode='lines', name=f'{window_size}-Day Moving Average'))
fig.add_trace(go.Scatter(x=df_rates['date'], y=df_rates['EMA'], mode='lines', name=f'{window_size}-Day EMA'))

# Show the plot
fig.show()

We can immidietly identify two intresting periods where the opening price of Etherium had an increasing trend followed by a decreasing trend. These periods are between 2018-2019 and 2021 and 2023.

## Token Metadata

In [11]:
df_metadata.head()

Unnamed: 0,Skin Tone,Type,Hair,Eyewear,Mouth,Headwear,Facial Hair,Smoking Device,Other:Earring,Neckwear,Skin Feature,Other:Medical Mask,Other:Clown Nose,Trait Count,rarest_property_name,token_index,rarity_score
0,Medium,Female,Blonde Bob,Green Eye Shadow,,,,,Earring,,,,,5 Trait Count,Hair:Blonde Bob,0,117.11941
1,Darker,Male,Mohawk,,Smile,,,,,,,,,4 Trait Count,Mouth:Smile,1,72.698512
2,Lighter,Female,Wild Hair,,,,,,,,,,,3 Trait Count,Hair:Wild Hair,2,58.332241
3,Darker,Male,Wild Hair,Nerd Glasses,,,,Pipe,,,,,,5 Trait Count,Smoking Device:Pipe,3,78.818332
4,Medium,Male,Wild Hair,Big Shades,,,Goat,,Earring,,,,,6 Trait Count,Facial Hair:Goat,4,91.02535


In [12]:
df_metadata.shape

(10000, 17)

In [13]:
df_metadata.isna().sum()

Skin Tone                  0
Type                       0
Hair                    3788
Eyewear                 3928
Mouth                   7455
Headwear                6458
Facial Hair             6497
Smoking Device          8450
Other:Earring           7541
Neckwear                9627
Skin Feature            9104
Other:Medical Mask      9825
Other:Clown Nose        9788
Trait Count                0
rarest_property_name       0
token_index                0
rarity_score               0
dtype: int64

Let's get idea of the distribution of rarity score

In [14]:
fig = px.box(df_metadata, y="rarity_score")
fig.show()

Let's check the unique values per Trait 

In [15]:
for column in df_metadata.columns[:-4]:
    title = f"Column: {column}"
    print(title)
    print("-"*len(title))
    print(f"Nulls: {df_metadata[column].isna().sum()}")
    print("Unique values: ", df_metadata[column].unique())
    print("\n")

Column: Skin Tone
-----------------
Nulls: 0
Unique values:  ['Medium' 'Darker' 'Lighter' 'Albino' 'Zombie' 'Ape' 'Alien']


Column: Type
------------
Nulls: 0
Unique values:  ['Female' 'Male' 'Zombie' 'Ape' 'Alien']


Column: Hair
------------
Nulls: 3788
Unique values:  ['Blonde Bob' 'Mohawk' 'Wild Hair' 'Half Shaved' nan 'Wild White Hair'
 'Messy Hair' 'Straight Hair Dark' 'Blonde Short' 'Straight Hair Blonde'
 'Stringy Hair' 'Crazy Hair' 'Mohawk Dark' 'Dark Hair' 'Peak Spike'
 'Frumpy Hair' 'Red Mohawk' 'Shaved Head' 'Straight Hair' 'Vampire Hair'
 'Mohawk Thin' 'Purple Hair' 'Clown Hair Green' 'Pigtails' 'Orange Side'
 'Wild Blonde']


Column: Eyewear
---------------
Nulls: 3928
Unique values:  ['Green Eye Shadow' nan 'Nerd Glasses' 'Big Shades' 'Purple Eye Shadow'
 'Clown Eyes Blue' 'Blue Eye Shadow' 'Clown Eyes Green' 'Regular Shades'
 'Small Shades' 'Eye Mask' 'Horned Rim Glasses' 'Classic Shades' 'VR'
 'Eye Patch' '3D Glasses' 'Welding Goggles']


Column: Mouth
-------------
N

In [16]:
df_metadata['Trait Count'].unique()

array(['5 Trait Count', '4 Trait Count', '3 Trait Count', '6 Trait Count',
       '7 Trait Count', '2 Trait Count', '8 Trait Count', '9 Trait Count'],
      dtype=object)

In [17]:

fig = px.bar(df_metadata['Trait Count'].value_counts().to_frame().reset_index(), 
             x='index', y='Trait Count',
             title="Traits Frequency")
fig.show()

Tokens with 4 to 6 traits are the most common

In [18]:
fig = px.bar(df_metadata[["Trait Count", "rarity_score"]].groupby("Trait Count").mean().reset_index(), 
             x='Trait Count', y='rarity_score',
             title="Average rarity score per number of traits")
fig.show()

Tokens that have 2 or 8 trait counts have similar rarity score on average.

In [19]:
fig = px.bar(df_metadata[["rarest_property_name", "Trait Count"]].groupby("rarest_property_name").count().reset_index().sort_values("Trait Count", ascending=False).head(5), 
             x='rarest_property_name', y='Trait Count',
             title="Top 5 most common rare property")
fig.show()

## Token Sales

In [20]:
df_sales.head()

Unnamed: 0,token_index,timestamp,eth,usd,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,544,1498251906,0.01,3.2697,,,,,
1,3134,1498252232,0.01,3.2697,,,,,
2,5056,1498254413,0.1,32.684,,,,,
3,5719,1498255065,0.04,13.0736,,,,,
4,6548,1498255212,0.03,9.8052,,,,,


In [21]:
df_sales.shape

(21329, 9)

In [22]:
df_sales.dtypes

token_index      int64
timestamp        int64
eth            float64
usd            float64
Unnamed: 4     float64
Unnamed: 5     float64
Unnamed: 6     float64
Unnamed: 7     float64
Unnamed: 8     float64
dtype: object

In [23]:
df_sales['timestamp'] = pd.to_datetime(df_sales['timestamp'], unit='s')
df_sales.head()

Unnamed: 0,token_index,timestamp,eth,usd,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,544,2017-06-23 21:05:06,0.01,3.2697,,,,,
1,3134,2017-06-23 21:10:32,0.01,3.2697,,,,,
2,5056,2017-06-23 21:46:53,0.1,32.684,,,,,
3,5719,2017-06-23 21:57:45,0.04,13.0736,,,,,
4,6548,2017-06-23 22:00:12,0.03,9.8052,,,,,


In [24]:
df_sales['timestamp'].min(), df_sales['timestamp'].max()

(Timestamp('2017-06-23 21:05:06'), Timestamp('2023-05-09 01:55:35'))

In [25]:
df_sales.isna().sum()

token_index        0
timestamp          0
eth                0
usd                0
Unnamed: 4     21329
Unnamed: 5     21329
Unnamed: 6     21329
Unnamed: 7     21329
Unnamed: 8     21329
dtype: int64

Will drop empty columns

In [26]:
df_sales.drop(columns=['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'], inplace=True)

In [27]:
df_sales.columns

Index(['token_index', 'timestamp', 'eth', 'usd'], dtype='object')

We need to merge the token sales dataset to with the token metadata dataset so we can get additional features.

As a first step, we will only take the tokens recorded in the sales dataset.

In [28]:
# take unique samples of token index from sales dataset
unique_tokens_in_sales = list(set(df_sales['token_index'].unique()))

# create new metadata dataframe with only tokens in sales dataset
df_metadata_2 = df_metadata[df_metadata['token_index'].isin(unique_tokens_in_sales)]
df_metadata.shape, df_metadata_2.shape

((10000, 17), (6924, 17))

We have 3076 tokens that have not been sold.

In [29]:
df_sales['eth'].describe()

count    2.132900e+04
mean     4.941974e+01
std      9.911308e+01
min      1.000000e-18
25%      9.800000e+00
50%      3.700000e+01
75%      7.000000e+01
max      8.000000e+03
Name: eth, dtype: float64

In [30]:
df_sales['token_index'].value_counts().to_frame().describe()

Unnamed: 0,token_index
count,6924.0
mean,3.080445
std,2.457679
min,1.0
25%,1.0
50%,3.0
75%,4.0
max,34.0


In [31]:
# build distribution plot
fig = ff.create_distplot([df_sales['token_index'].value_counts().to_numpy()], ['distplot'], show_hist=True, show_rug=False, show_curve=False)
fig.show()

25% of tokens have been sold once and 75% of tokens have been sold only 4 times or less.

Let's merge both datasets

In [32]:
# Perform the merge
merged_df = pd.merge(df_metadata_2, df_sales, on='token_index', how='left')
merged_df['date'] = merged_df['timestamp'].dt.date

In [33]:
merged_df.head()

Unnamed: 0,Skin Tone,Type,Hair,Eyewear,Mouth,Headwear,Facial Hair,Smoking Device,Other:Earring,Neckwear,...,Other:Medical Mask,Other:Clown Nose,Trait Count,rarest_property_name,token_index,rarity_score,timestamp,eth,usd,date
0,Medium,Female,Blonde Bob,Green Eye Shadow,,,,,Earring,,...,,,5 Trait Count,Hair:Blonde Bob,0,117.11941,2017-06-23 22:56:29,0.98,320.313,2017-06-23
1,Medium,Female,Blonde Bob,Green Eye Shadow,,,,,Earring,,...,,,5 Trait Count,Hair:Blonde Bob,0,117.11941,2017-07-07 18:39:12,1.6,394.544,2017-07-07
2,Medium,Female,Blonde Bob,Green Eye Shadow,,,,,Earring,,...,,,5 Trait Count,Hair:Blonde Bob,0,117.11941,2018-11-30 07:32:35,25.0,2936.5,2018-11-30
3,Darker,Male,Mohawk,,Smile,,,,,,...,,,4 Trait Count,Mouth:Smile,1,72.698512,2017-06-26 21:23:26,0.42,102.7404,2017-06-26
4,Darker,Male,Mohawk,,Smile,,,,,,...,,,4 Trait Count,Mouth:Smile,1,72.698512,2019-04-06 05:21:51,31.0,5118.72,2019-04-06


In [34]:
merged_df['years'] = merged_df['timestamp'].dt.year
merged_df[['Type' ,'years', 'eth']][(merged_df['years']>=2021) & (merged_df['years']<=2022)].groupby(["years", "Type"]).apply(lambda x: x.sort_values('years').tail(1))[['eth']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,eth
years,Type,Unnamed: 2_level_1,Unnamed: 3_level_1
2021,Alien,16281,4200.0
2021,Ape,17256,140.0
2021,Female,21328,110.0
2021,Male,21324,9e-06
2021,Zombie,21325,99.99
2022,Alien,11700,8000.0
2022,Ape,11165,2501.0
2022,Female,21320,78.0
2022,Male,21301,74.95
2022,Zombie,17978,825.0


In [35]:
merged_df['years'] = merged_df['timestamp'].dt.year
merged_df[['Type' ,'years', 'eth']][(merged_df['years']>=2018) & (merged_df['years']<=2019)].groupby(["years", "Type"]).apply(lambda x: x.sort_values('years').tail(1))[['eth']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,eth
years,Type,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,Alien,16280,12.0
2018,Ape,17893,3.99
2018,Female,21310,0.1
2018,Male,21167,0.1
2018,Zombie,18704,2.5
2019,Female,21306,0.45
2019,Male,21278,0.5
2019,Zombie,19532,5.0


We saw earlier that during the periods of 2018-2019 and 2021-2022, when Ethereum prices were experiencing a fluctuating trend of increasing and then decreasing, an interesting observation was made regarding the prices of tokens. Despite the overall volatility of the cryptocurrency market, it was noticed that the prices of NFTs tended to surge higher. This phenomenon can be attributed to the fact that Ethereum blockchain, being the primary platform for NFT creation and transactions, contributed to the increased valuation of these tokens as Ethereum's own price movements influenced the broader NFT market. Thus, while Ethereum prices experienced volatility, the NFT market demonstrated resilience and saw substantial appreciation in value.

# Data preprocessing and Cleaning

In this section, our methodology will involve a step-by-step approach to building and improving our predictive model. Initially, we will establish a baseline model using a merged dataframe, considering only the existing features. To capture temporal patterns, we will create lagged values by extracting the most recent sale price for each token and attempt to predict it. This baseline model serves as a starting point for subsequent experiments.

In the following experiments, we will continue working with the dataset we obtained. Our next step will be to incorporate lagged values for Ethereum's opening price. For the second experiment, these lagged values will be merged with the existing dataset, serving as additional features. This expanded feature set will provide the model with more information to make predictions.

For our third and final experiment, we will construct a time-series forecasting model using the dataset containing Ethereum's opening prices. The predictions generated by this forecasting model will then be utilized as input features for the valuation prediction model. By leveraging the insights obtained from the time-series forecasting model, we aim to enhance the accuracy and performance of our final predictive model.

By following this systematic approach, we can progressively refine our model, incorporating relevant features and leveraging temporal patterns in the data to improve the accuracy of our valuations.

In [36]:
# list features
merged_df.columns

Index(['Skin Tone', 'Type', 'Hair', 'Eyewear', 'Mouth', 'Headwear',
       'Facial Hair', 'Smoking Device', 'Other:Earring', 'Neckwear',
       'Skin Feature', 'Other:Medical Mask', 'Other:Clown Nose', 'Trait Count',
       'rarest_property_name', 'token_index', 'rarity_score', 'timestamp',
       'eth', 'usd', 'date', 'years'],
      dtype='object')

In [71]:
# taking last price of each token
last_price_df = merged_df.sort_values(by=['date'], ascending=True)
last_price_df = merged_df.drop_duplicates(subset=['token_index'], keep='last')

In [72]:
# we will drop columns that are no use to us
last_price_df = last_price_df.drop(columns=["usd", "date", "timestamp"])
last_price_df

Unnamed: 0,Skin Tone,Type,Hair,Eyewear,Mouth,Headwear,Facial Hair,Smoking Device,Other:Earring,Neckwear,Skin Feature,Other:Medical Mask,Other:Clown Nose,Trait Count,rarest_property_name,token_index,rarity_score,eth,years
2,Medium,Female,Blonde Bob,Green Eye Shadow,,,,,Earring,,,,,5 Trait Count,Hair:Blonde Bob,0,117.119410,25.000000,2018
5,Darker,Male,Mohawk,,Smile,,,,,,,,,4 Trait Count,Mouth:Smile,1,72.698512,60.000000,2020
8,Darker,Female,,,Hot Lipstick,Pilot Helmet,,Pipe,,,,,,5 Trait Count,Headwear:Pilot Helmet,14,239.465714,2.500000,2018
9,Lighter,Male,Peak Spike,,,,,,,,,,,3 Trait Count,Hair:Peak Spike,33,68.015914,88.800000,2017
12,Lighter,Female,Straight Hair,Big Shades,,,,,,,,,,4 Trait Count,Hair:Straight Hair,53,93.656590,129.990000,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21321,Medium,Male,Vampire Hair,Regular Shades,,,Shadow Beard,,,,,,,5 Trait Count,Hair:Vampire Hair,9987,113.190823,64.000000,2023
21322,Darker,Female,,,Hot Lipstick,Pilot Helmet,,Cigarette,,,,,,5 Trait Count,Headwear:Pilot Helmet,9989,218.325800,0.300000,2017
21324,Albino,Male,Mohawk,Horned Rim Glasses,,,,,,,,,,4 Trait Count,Hair:Mohawk,9990,55.655401,0.000009,2021
21325,Zombie,Zombie,,,,Cap Forward,Front Beard,,,,,,,4 Trait Count,Skin Tone:Zombie,9997,306.081831,99.990000,2021


In [73]:
# check the nulls
last_price_df.isna().sum()

Skin Tone                  0
Type                       0
Hair                    2638
Eyewear                 2667
Mouth                   5211
Headwear                4453
Facial Hair             4367
Smoking Device          5859
Other:Earring           5131
Neckwear                6671
Skin Feature            6254
Other:Medical Mask      6801
Other:Clown Nose        6778
Trait Count                0
rarest_property_name       0
token_index                0
rarity_score               0
eth                        0
years                      0
dtype: int64

In [74]:
last_price_df['eth'].describe()

count    6.924000e+03
mean     6.442043e+01
std      1.594048e+02
min      2.000000e-10
25%      1.493250e+01
50%      5.300000e+01
75%      8.291250e+01
max      8.000000e+03
Name: eth, dtype: float64

In [75]:
fig = px.box(last_price_df, y="eth")
fig.show()

We will replace nulls with the string "-" which will indicate that the token doesn't have that trait.

In [76]:
last_price_df = last_price_df.fillna("-")
last_price_df

Unnamed: 0,Skin Tone,Type,Hair,Eyewear,Mouth,Headwear,Facial Hair,Smoking Device,Other:Earring,Neckwear,Skin Feature,Other:Medical Mask,Other:Clown Nose,Trait Count,rarest_property_name,token_index,rarity_score,eth,years
2,Medium,Female,Blonde Bob,Green Eye Shadow,-,-,-,-,Earring,-,-,-,-,5 Trait Count,Hair:Blonde Bob,0,117.119410,25.000000,2018
5,Darker,Male,Mohawk,-,Smile,-,-,-,-,-,-,-,-,4 Trait Count,Mouth:Smile,1,72.698512,60.000000,2020
8,Darker,Female,-,-,Hot Lipstick,Pilot Helmet,-,Pipe,-,-,-,-,-,5 Trait Count,Headwear:Pilot Helmet,14,239.465714,2.500000,2018
9,Lighter,Male,Peak Spike,-,-,-,-,-,-,-,-,-,-,3 Trait Count,Hair:Peak Spike,33,68.015914,88.800000,2017
12,Lighter,Female,Straight Hair,Big Shades,-,-,-,-,-,-,-,-,-,4 Trait Count,Hair:Straight Hair,53,93.656590,129.990000,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21321,Medium,Male,Vampire Hair,Regular Shades,-,-,Shadow Beard,-,-,-,-,-,-,5 Trait Count,Hair:Vampire Hair,9987,113.190823,64.000000,2023
21322,Darker,Female,-,-,Hot Lipstick,Pilot Helmet,-,Cigarette,-,-,-,-,-,5 Trait Count,Headwear:Pilot Helmet,9989,218.325800,0.300000,2017
21324,Albino,Male,Mohawk,Horned Rim Glasses,-,-,-,-,-,-,-,-,-,4 Trait Count,Hair:Mohawk,9990,55.655401,0.000009,2021
21325,Zombie,Zombie,-,-,-,Cap Forward,Front Beard,-,-,-,-,-,-,4 Trait Count,Skin Tone:Zombie,9997,306.081831,99.990000,2021


In [78]:
last_price_df.columns

Index(['Skin Tone', 'Type', 'Hair', 'Eyewear', 'Mouth', 'Headwear',
       'Facial Hair', 'Smoking Device', 'Other:Earring', 'Neckwear',
       'Skin Feature', 'Other:Medical Mask', 'Other:Clown Nose', 'Trait Count',
       'rarest_property_name', 'token_index', 'rarity_score', 'eth', 'years'],
      dtype='object')

In [79]:
# we will drop columns that are no use to us
last_price_df = last_price_df.drop(columns=["years"])
last_price_df

Unnamed: 0,Skin Tone,Type,Hair,Eyewear,Mouth,Headwear,Facial Hair,Smoking Device,Other:Earring,Neckwear,Skin Feature,Other:Medical Mask,Other:Clown Nose,Trait Count,rarest_property_name,token_index,rarity_score,eth
2,Medium,Female,Blonde Bob,Green Eye Shadow,-,-,-,-,Earring,-,-,-,-,5 Trait Count,Hair:Blonde Bob,0,117.119410,25.000000
5,Darker,Male,Mohawk,-,Smile,-,-,-,-,-,-,-,-,4 Trait Count,Mouth:Smile,1,72.698512,60.000000
8,Darker,Female,-,-,Hot Lipstick,Pilot Helmet,-,Pipe,-,-,-,-,-,5 Trait Count,Headwear:Pilot Helmet,14,239.465714,2.500000
9,Lighter,Male,Peak Spike,-,-,-,-,-,-,-,-,-,-,3 Trait Count,Hair:Peak Spike,33,68.015914,88.800000
12,Lighter,Female,Straight Hair,Big Shades,-,-,-,-,-,-,-,-,-,4 Trait Count,Hair:Straight Hair,53,93.656590,129.990000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21321,Medium,Male,Vampire Hair,Regular Shades,-,-,Shadow Beard,-,-,-,-,-,-,5 Trait Count,Hair:Vampire Hair,9987,113.190823,64.000000
21322,Darker,Female,-,-,Hot Lipstick,Pilot Helmet,-,Cigarette,-,-,-,-,-,5 Trait Count,Headwear:Pilot Helmet,9989,218.325800,0.300000
21324,Albino,Male,Mohawk,Horned Rim Glasses,-,-,-,-,-,-,-,-,-,4 Trait Count,Hair:Mohawk,9990,55.655401,0.000009
21325,Zombie,Zombie,-,-,-,Cap Forward,Front Beard,-,-,-,-,-,-,4 Trait Count,Skin Tone:Zombie,9997,306.081831,99.990000


In [80]:
# feature columns
feature_columns = last_price_df.columns[:-1].to_list()

# output column
output_column = last_price_df.columns[-1]

feature_columns, output_column

(['Skin Tone',
  'Type',
  'Hair',
  'Eyewear',
  'Mouth',
  'Headwear',
  'Facial Hair',
  'Smoking Device',
  'Other:Earring',
  'Neckwear',
  'Skin Feature',
  'Other:Medical Mask',
  'Other:Clown Nose',
  'Trait Count',
  'rarest_property_name',
  'token_index',
  'rarity_score'],
 'eth')

In [85]:
X, y = last_price_df[feature_columns], last_price_df[output_column]

Let's process our features so they can be ready to be digested into a ML algorithm.

In [86]:
# lets transform number of traits to int
X['Trait Count'] = X['Trait Count'].apply(lambda x: int(x.split(" Trait Count")[0]))

In [87]:
def encode_categorical(df, column):
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(columns=[column])
    return df

In [88]:
# categorical encoding to categorical features
for column in X.columns.to_list():
    if column not in ['rarity_score', 'Trait Count']:
        X = encode_categorical(X, column)

In [89]:
X.head()

Unnamed: 0,Trait Count,rarity_score,Skin Tone_Albino,Skin Tone_Alien,Skin Tone_Ape,Skin Tone_Darker,Skin Tone_Lighter,Skin Tone_Medium,Skin Tone_Zombie,Type_Alien,...,token_index_9976,token_index_9979,token_index_9980,token_index_9981,token_index_9982,token_index_9987,token_index_9989,token_index_9990,token_index_9997,token_index_9998
2,5,117.11941,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,72.698512,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,5,239.465714,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,68.015914,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,4,93.65659,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


One last step is dividing our dataset into training and valdiation sets. We will choose a 70/30 split.

In [90]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets, with 70% for training and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Machine learning

In [91]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
import numpy as np

In [92]:
def calculate_regression_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    explained_variance = explained_variance_score(y_true, y_pred)

    metrics = {
        'Mean Absolute Error': mae,
        'Root Mean Squared Error': rmse,
        'R-squared': r2,
        'Explained Variance Score': explained_variance
    }

    for metric, value in metrics.items():
        print(metric + ':', value)

In [93]:
models = [LinearRegression(),  GradientBoostingRegressor(), KNeighborsRegressor()]
model_names = ['Linear Regression', 'Gradient Boosting Regressor', 'KNeighbors Regressor']

In [94]:
for model, name  in zip(models, model_names):
    print(name)
    print("-"*len(name))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    calculate_regression_metrics(y_true=y_test, y_pred=y_pred)

    print("\n")

Linear Regression
-----------------
Mean Absolute Error: 50.05808102005191
Root Mean Squared Error: 127.65060307738221
R-squared: -1.1438803997185265
Explained Variance Score: -1.1425638645190972


Gradient Boosting Regressor
---------------------------
Mean Absolute Error: 46.190146686730316
Root Mean Squared Error: 94.53588392168545
R-squared: -0.17583989275242806
Explained Variance Score: -0.17583913857529043


KNeighbors Regressor
--------------------
Mean Absolute Error: 53.89135213324658
Root Mean Squared Error: 140.52601262191254
R-squared: -1.598174157892971
Explained Variance Score: -1.5929031999916576




All models performed pourly on the testing set. Lets try a few more things like data standardization and normalization.

In [95]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def normalize_and_standardize(df_train, df_test, column):
    # Create the scalers
    min_max_scaler = MinMaxScaler()
    standard_scaler = StandardScaler()

    # Reshape the data to fit the scaler
    train_data = df_train[column].values.reshape(-1, 1)
    test_data = df_test[column].values.reshape(-1, 1)

    # Fit and transform on the training data
    train_data_min_max_scaled = min_max_scaler.fit_transform(train_data)
    train_data_standard_scaled = standard_scaler.fit_transform(train_data)

    # Transform the test data based on the fitted scaler from the training data
    test_data_min_max_scaled = min_max_scaler.transform(test_data)
    test_data_standard_scaled = standard_scaler.transform(test_data)

    # Replace original columns in the dataframes
    df_train[column+'_min_max_scaled'] = train_data_min_max_scaled
    df_train[column+'_standard_scaled'] = train_data_standard_scaled
    df_test[column+'_min_max_scaled'] = test_data_min_max_scaled
    df_test[column+'_standard_scaled'] = test_data_standard_scaled

    return df_train, df_test


In [96]:
X_train, X_test = normalize_and_standardize(X_train, X_test, column='rarity_score')
X_train, X_test = normalize_and_standardize(X_train, X_test, column='Trait Count')
X_train, X_test = X_train.drop(columns=["rarity_score_min_max_scaled"]), X_test.drop(columns=["rarity_score_min_max_scaled"])
X_train, X_test = X_train.drop(columns=["Trait Count_min_max_scaled"]), X_test.drop(columns=["Trait Count_min_max_scaled"])


for model, name  in zip(models, model_names):
    print(name)
    print("-"*len(name))
    model.fit(X_train.drop(columns=['rarity_score', 'Trait Count']), y_train)
    y_pred = model.predict(X_test.drop(columns=['rarity_score', 'Trait Count']))
    calculate_regression_metrics(y_true=y_test, y_pred=y_pred)

    print("\n")

Linear Regression
-----------------
Mean Absolute Error: 50.114613846917194
Root Mean Squared Error: 128.79496486551562
R-squared: -1.1824916048729461
Explained Variance Score: -1.1811304111639176


Gradient Boosting Regressor
---------------------------
Mean Absolute Error: 46.19861603778371
Root Mean Squared Error: 94.07564701979338
R-squared: -0.16441888333440224
Explained Variance Score: -0.16441854287584157


KNeighbors Regressor
--------------------
Mean Absolute Error: 52.299316277477594
Root Mean Squared Error: 138.6993716017625
R-squared: -1.5310679164128596
Explained Variance Score: -1.5281013508872396




In [97]:
X_train, X_test = normalize_and_standardize(X_train, X_test, column='rarity_score')
X_train, X_test = normalize_and_standardize(X_train, X_test, column='Trait Count')
X_train, X_test = X_train.drop(columns=["rarity_score_standard_scaled"]), X_test.drop(columns=["rarity_score_standard_scaled"])
X_train, X_test = X_train.drop(columns=["Trait Count_standard_scaled"]), X_test.drop(columns=["Trait Count_standard_scaled"])

for model, name  in zip(models, model_names):
    print(name)
    print("-"*len(name))
    model.fit(X_train.drop(columns=['rarity_score', 'Trait Count']), y_train)
    y_pred = model.predict(X_test.drop(columns=['rarity_score', 'Trait Count']))
    calculate_regression_metrics(y_true=y_test, y_pred=y_pred)

    print("\n")

Linear Regression
-----------------
Mean Absolute Error: 49.34892066661637
Root Mean Squared Error: 114.99814801695959
R-squared: -0.7399489430687873
Explained Variance Score: -0.7391530905965553


Gradient Boosting Regressor
---------------------------
Mean Absolute Error: 46.231161543820306
Root Mean Squared Error: 97.15766759112282
R-squared: -0.2419639007213099
Explained Variance Score: -0.2419543744367052


KNeighbors Regressor
--------------------
Mean Absolute Error: 50.86268885725006
Root Mean Squared Error: 119.9080455273347
R-squared: -0.8916964994112857
Explained Variance Score: -0.8903201346042937




The decision to apply standard scaling and normalization was made with the expectation that it would improve the model's ability to learn and generalize patterns within the data. Unfortunately, the results revealed that the added preprocessing steps seemed to not have any impact of significance on the model's performance.

# Providing Valuations

We will utilize our linear regression model to generate valuations without data normalization, as this approach has demonstrated superior performance and accuracy.

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [105]:
[elem for elem in X.columns if 'scaled' in elem]

[]

In [107]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [124]:
cols_drop = [elem for elem in X_test.columns if 'token_index' not in elem]
X_test = X_test.drop(columns=cols_drop)

In [127]:
# Convert dummy variables back to a single column
X_test_single = X_test.idxmax(axis=1).to_frame().rename({0:"token_index"}, axis=1)['token_index'].apply(lambda x: int(x.split("_")[-1])).to_frame()
X_test_single.head()

Unnamed: 0,token_index
1352,1252
11659,5807
9815,4967
3968,2477
5674,3217


In [129]:
X_test_single['valuations'] = y_pred
X_test_single.head()

Unnamed: 0,token_index,valuations
1352,1252,64.315538
11659,5807,55.200164
9815,4967,60.131748
3968,2477,85.43048
5674,3217,40.496121


These are the valuations in ETH for each token

In [130]:
X_test_single.to_csv("nft_valuations.csv")