In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import datetime
from scipy import stats
import requests
from requests import get
from bs4 import BeautifulSoup
import xgboost as xgb
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from numpy import absolute
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
df_tr = pd.read_csv('transaction.csv', header = None, index_col = 0)
df_tr.columns = ['type', 'from', 'to', 'date', 'punk_id', 'amount_in_eth', 'amount_in_dol']
df_tr = df_tr[ ['punk_id'] + [ col for col in df_tr.columns if col != 'punk_id' ] ]
df_tr.head()

Unnamed: 0_level_0,punk_id,type,from,to,date,amount_in_eth,amount_in_dol
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,Bid Withdrawn,0x983ace,,2021-05-10,250.0,1030000.0
1,0,Bid,0x983ace,,2021-04-11,250.0,537615.0
2,0,Bid Withdrawn,0xd7510a,,2021-03-03,100.0,160973.0
3,0,Bid,0xd7510a,,2021-02-20,100.0,188897.0
4,0,Bid Withdrawn,natealex,,2020-09-16,69.0,25532.0


In [3]:
df_tr['punk_id'] = df_tr['punk_id'].astype('category')
df_tr['type'] = df_tr['type'].astype('category')
df_tr['from'] = df_tr['from'].astype(str)
df_tr['to'] = df_tr['to'].astype(str)
df_tr['date'] = pd.to_datetime(df_tr['date'], format = '%Y.%m.%d')

In [4]:
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].astype(str)
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('Y', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('Z', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('M', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('T', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].str.replace('B', '')
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].replace(',', '', regex=True)
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].astype(float)

df_tr['amount_in_dol'] = df_tr['amount_in_dol'].astype(str)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('Y', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('B', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('T', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('P', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].str.replace('Z', '')
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].replace('<', '', regex=True)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].replace(',', '', regex=True)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].astype(float)

In [5]:
df_tr['amount_in_eth'] = df_tr['amount_in_eth'].fillna(0)
df_tr['amount_in_dol'] = df_tr['amount_in_dol'].fillna(0)

In [6]:
df_acc = pd.read_csv('accessories.csv', header=None, sep='\n')
df_acc = df_acc[0].str.split(',', expand=True)
df_acc.drop(0, axis=1, inplace=True)

In [7]:
dict={1:'gender', 
      2:  'acc_1',
      3:'acc_2', 
      4: 'acc_3',
      5:'acc_4',
      6: 'acc_5',
      7:'acc_6',
      8:'acc_7'}
df_acc.rename(columns=dict,
          inplace=True)
  
df_acc.head()

Unnamed: 0,gender,acc_1,acc_2,acc_3,acc_4,acc_5,acc_6,acc_7
0,Female,Green Eye Shadow,Earring,Blonde Bob,,,,
1,Male,Smile,Mohawk,,,,,
2,Female,Wild Hair,,,,,,
3,Male,Wild Hair,Nerd Glasses,Pipe,,,,
4,Male,Big Shades,Wild Hair,Earring,Goat,,,


# Transactions

In [8]:
tr_types = df_tr.groupby(['punk_id', 'type']).size().reset_index().pivot(columns='type', index='punk_id', values=0)
total_bids = tr_types['Bid']
total_sales = tr_types['Sold']

In [9]:
#Get Mean of Bids + Sales for each Punk
bids_sales = df_tr[df_tr['type'].str.contains("Bid") | df_tr['type'].str.contains("Sold")]
bids_sales = bids_sales[~bids_sales.type.str.contains("Withdrawn")] 
bids_sales = bids_sales.drop(['from', 'to', 'date', 'type'], axis=1)
bids_sales = bids_sales.groupby(['punk_id'])['amount_in_eth', 'amount_in_dol'].mean().reset_index()

bids_sales['total_bids'] = total_bids
bids_sales['total_sales'] = total_sales

bids_sales

  bids_sales = bids_sales.groupby(['punk_id'])['amount_in_eth', 'amount_in_dol'].mean().reset_index()


Unnamed: 0,punk_id,amount_in_eth,amount_in_dol,total_bids,total_sales
0,0,29.910455,35933.818182,19,3
1,1,17.956667,8037.466667,12,3
2,2,4.655714,2023.214286,14,0
3,3,3.890000,1304.272727,11,0
4,4,6.250000,12092.166667,6,0
...,...,...,...,...,...
9995,9995,19.016667,44053.333333,3,0
9996,9996,0.525000,716.500000,2,0
9997,9997,69.071818,142668.818182,9,2
9998,9998,27.666667,36981.333333,1,2


# Accessories

In [10]:
# helper functions
def fix_eth(x):
  x=str(x)
  x=x.replace('Ξ', '')
  x=x.replace('<', '')
  x=x.replace(',', '')
  if 'K' in x:
    x=float(x.replace('K', ''))
    x=x*1000
    return x
  return x

In [11]:
page = requests.get("https://www.larvalabs.com/cryptopunks/attributes")
soup = BeautifulSoup(page.text, "html.parser")

tr_hist = soup.find_all('table')
df_acc_att = pd.read_html(str(tr_hist))[1]

df_acc_att = pd.DataFrame(df_acc_att.to_records())
df_acc_att.columns
dict={'index':'index',
      "('Attributes', 'Attribute')":'attributes',
      "('Unnamed: 1_level_0', '#')":'number',
      "('Unnamed: 2_level_0', 'Avail')":'avail',
      "('Unnamed: 3_level_0', 'Avg Sale')":"avg_sale",
      "('Unnamed: 4_level_0', 'Cheapest')":'cheapest',
      "('Unnamed: 5_level_0', 'More Examples')":'more_examples'}
df_acc_att.rename(columns=dict,
          inplace=True)
df_acc_att.columns

df_acc_att.avg_sale=df_acc_att.avg_sale.apply(fix_eth)
df_acc_att.cheapest=df_acc_att.cheapest.apply(fix_eth)

df_acc_att.avg_sale = df_acc_att.avg_sale.astype(np.float32)
df_acc_att.cheapest = df_acc_att.cheapest.astype(np.float32)
df_acc_att.attributes = df_acc_att.attributes.astype('category')
df_acc_att.drop('more_examples', axis=1, inplace=True)
df_acc_att.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   index       87 non-null     int64   
 1   attributes  87 non-null     category
 2   number      87 non-null     int64   
 3   avail       87 non-null     int64   
 4   avg_sale    87 non-null     float32 
 5   cheapest    87 non-null     float32 
dtypes: category(1), float32(2), int64(3)
memory usage: 5.7 KB


In [12]:
list_att = df_acc_att.attributes
g = pd.DataFrame(columns=list_att)

g.rename(columns={'index':'number'},
          inplace=True)

x=g.copy()
x

attributes,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,Buck Teeth,Welding Goggles,Pigtails,Pink With Hat,Top Hat,...,Regular Shades,Horned Rim Glasses,Big Shades,Nerd Glasses,Black Lipstick,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring


In [51]:
for counter in range(df_acc.shape[0]):
  for i in g:
    if df_acc.loc[counter,:].str.contains(i).any():
      # print(counter, ' contains ', i)
      x.loc[counter,i] = 1



x.to_csv('accessories_transformed_full.csv')

In [13]:
acc = pd.read_csv('accessories_transformed_full.csv', index_col = 0)
acc

Unnamed: 0,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,Buck Teeth,Welding Goggles,Pigtails,Pink With Hat,Top Hat,...,Regular Shades,Horned Rim Glasses,Big Shades,Nerd Glasses,Black Lipstick,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [14]:
types = df_acc['gender']
types_dummies = pd.get_dummies(types)
types_dummies

Unnamed: 0,Alien,Ape,Female,Male,Zombie
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0
...,...,...,...,...,...
9995,0,0,1,0,0
9996,0,0,0,1,0
9997,0,0,0,0,1
9998,0,0,1,0,0


# Simple Linear Regression

In [15]:
df = bids_sales.join(acc, on=bids_sales.index, how='left', lsuffix='_', rsuffix='_')
df = df.join(types_dummies, on=bids_sales.index, how='left', lsuffix='_', rsuffix='_')
df

Unnamed: 0,punk_id,amount_in_eth,amount_in_dol,total_bids,total_sales,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,...,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring,Alien,Ape,Female,Male,Zombie
0,0,29.910455,35933.818182,19,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0,0,1,0,0
1,1,17.956667,8037.466667,12,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
2,2,4.655714,2023.214286,14,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0
3,3,3.890000,1304.272727,11,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
4,4,6.250000,12092.166667,6,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,19.016667,44053.333333,3,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0
9996,9996,0.525000,716.500000,2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,1,0
9997,9997,69.071818,142668.818182,9,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1
9998,9998,27.666667,36981.333333,1,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0


In [16]:
df = df.fillna(0) 
df.to_csv('avg_price_reg.csv', mode='w', index=False)

In [17]:
y = df['amount_in_dol']
y.sort_values(ascending=False)

3100    1.105732e+06
2066    9.767923e+05
3443    6.063575e+05
6784    5.983280e+05
7804    5.982928e+05
            ...     
7991    0.000000e+00
3190    0.000000e+00
3191    0.000000e+00
7994    0.000000e+00
2035    0.000000e+00
Name: amount_in_dol, Length: 10000, dtype: float64

In [18]:
X = df.iloc[:,3:]
X = X.astype(int)
X

Unnamed: 0,total_bids,total_sales,Beanie,Choker,Pilot Helmet,Tiara,Orange Side,Buck Teeth,Welding Goggles,Pigtails,...,Mole,Purple Lipstick,Hot Lipstick,Cigarette,Earring,Alien,Ape,Female,Male,Zombie
0,19,3,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,12,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9996,2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
9997,9,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9998,1,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [29]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print('Adjusted Rsquared: {}.'.format(1 - (1-regressor.score(X_train, y_train))*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)))

Adjusted Rsquared: 0.18603463899473727.


In [84]:
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
total_bids,1.902278e+03
total_sales,1.782718e+03
Beanie,3.299662e+04
Choker,7.706662e+01
Pilot Helmet,3.891486e+03
...,...
Alien,-2.366406e+16
Ape,-2.366406e+16
Female,-2.366406e+16
Male,-2.366406e+16


In [85]:
y_pred = regressor.predict(X_test)
df_check = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_check

Unnamed: 0,Actual,Predicted
9394,975.666667,30844.0
898,14417.000000,6252.0
2398,6193.000000,13164.0
5906,0.000000,1552.0
2343,3378.500000,11540.0
...,...,...
4004,23988.250000,11644.0
7375,444.500000,4072.0
9307,15596.000000,2920.0
8394,31983.375000,19120.0


In [86]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 16877.513835913076
Mean Squared Error: 944835568.6110889
Root Mean Squared Error: 30738.177704787395
