In [21]:
import datetime
import os

import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# tf.config.experimental.set_memory_growth()
# tf.test.is_gpu_available() is deprecated in TensorFlow 2.x. Listing the
# physical GPU devices answers the same question; the comparison keeps the
# cell output a plain boolean. The `experimental` alias is the spelling the
# commented-out configuration code in this notebook already uses.
len(tf.config.experimental.list_physical_devices('GPU')) > 0

True

In [22]:
# gpus = tf.config.experimental.list_physical_devices('GPU')
# for gpu in gpus:
#         tf.config.experimental.set_memory_growth(gpu, True)
# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3096)])
# # tf.config.experimental.set_memory_growth()

## Load data

In [23]:
# Load the card table and the price table.
# low_memory=False makes pandas parse each file in one pass and infer every
# column's dtype from the whole column instead of chunk by chunk.
# NOTE(review): the saved output of this cell ends in a warning trace;
# presumably a mixed-dtype warning from read_csv -- confirm it disappears.
data = pd.read_csv('data/cards.csv', low_memory=False)
prices = pd.read_csv('data/prices.csv', low_memory=False)

# Display option: show every column of the cards table when a frame is rendered.
pd.options.display.max_columns = data.shape[1]
# data.describe(include='all')

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Keep only the first price row seen for each card uuid.
is_repeated_uuid = prices.duplicated(subset=['uuid'])
prices_no_duplicates = prices[~is_repeated_uuid]

In [25]:
# prices.head()
# data.head()
# Show every column name of the cards table (displayed as the cell output).
data.columns

Index(['index', 'id', 'artist', 'asciiName', 'borderColor', 'colorIdentity',
       'colorIndicator', 'colors', 'convertedManaCost', 'duelDeck',
       'edhrecRank', 'faceConvertedManaCost', 'flavorName', 'flavorText',
       'frameEffect', 'frameEffects', 'frameVersion', 'hand', 'hasFoil',
       'hasNoDeckLimit', 'hasNonFoil', 'isAlternative', 'isArena', 'isBuyABox',
       'isDateStamped', 'isFullArt', 'isMtgo', 'isOnlineOnly', 'isOversized',
       'isPaper', 'isPromo', 'isReprint', 'isReserved', 'isStarter',
       'isStorySpotlight', 'isTextless', 'isTimeshifted', 'layout',
       'leadershipSkills', 'life', 'loyalty', 'manaCost', 'mcmId', 'mcmMetaId',
       'mtgArenaId', 'mtgoFoilId', 'mtgoId', 'multiverseId', 'name', 'names',
       'number', 'originalText', 'originalType', 'otherFaceIds', 'power',
       'printings', 'purchaseUrls', 'rarity', 'scryfallId',
       'scryfallIllustrationId', 'scryfallOracleId', 'setCode', 'side',
       'subtypes', 'supertypes', 'tcgplayerProduc

In [26]:
# Attach one price row to each card; cards without a price row are dropped
# by the inner join on uuid.
data_merged = data.merge(prices_no_duplicates, how='inner', on='uuid')

In [27]:
# One row per card name: when a name occurs several times, keep the last row.
data_merged = data_merged.drop_duplicates(subset='name', keep='last')

### Include selected features

In [28]:
# Keep only the columns used as model inputs plus the price target.
# .copy() makes the selection an independent frame: later cells assign into
# its columns ('text', 'power', 'life'), and doing that on a selection of the
# merged frame triggers pandas' SettingWithCopyWarning.
SELECTED_COLUMNS = ['name', 'text', 'price', 'convertedManaCost', 'rarity', 'power', 'life', 'colors']
data_merged = data_merged[SELECTED_COLUMNS].copy()
# print(data_merged.dtypes)

In [29]:
# Preview the cards priced above 100.
is_expensive = data_merged['price'] > 100
data_merged.loc[is_expensive, ['name', 'text', 'price', 'convertedManaCost']]

Unnamed: 0,name,text,price,convertedManaCost
5003,Ali from Cairo,Damage that would reduce your life total to le...,304.95,4.0
5006,Bazaar of Baghdad,"{T}: Draw two cards, then discard three cards.",1499.99,0.0
5012,City in a Bottle,Whenever one or more other nontoken permanents...,399.95,2.0
5021,Diamond Valley,"{T}, Sacrifice a creature: You gain life equal...",250.99,0.0
5022,Drop of Honey,"At the beginning of your upkeep, destroy the c...",419.96,1.0
...,...,...,...,...
34102,Ravages of War,Destroy all lands.,179.92,4.0
34182,Zodiac Dragon,When Zodiac Dragon is put into your graveyard ...,243.20,9.0
35944,Grim Tutor,Search your library for a card and put that ca...,231.87,3.0
39121,"Ugin, the Spirit Dragon","[+2]: Ugin, the Spirit Dragon deals 3 damage t...",138.21,8.0


### Convert the text column to strings for the tokenizer

In [30]:
# The tokenizer needs plain strings. Missing text is replaced with an empty
# string first: astype(str) alone turns NaN into the literal word 'nan',
# which the tokenizer would then learn as if it were card text.
data_merged['text'] = data_merged['text'].fillna('').astype(str)

### Tokenization

In [31]:
# --- Tokenizer / padding settings ---------------------------------------

# Padding goes in front of the text ('post' was the earlier alternative);
# over-long texts are cut at the end.
padding_type = 'pre'
trunc_type = 'post'

# Sequence length after padding/truncation.
# Author's note: 95% of cards fit this text length.
# Earlier alternatives that were tried:
#   max_length = data_merged.text.map(lambda x: len(x)).max()
#   max_length = int((max_length * 3) / 4)
#   max_length = 100
max_length = 287

# Vocabulary size (1000 was tried before; author's note: it cannot be too
# long for the LSTM layer).
vocab_size = 1024

In [32]:
# Fit the word index on the card texts, then encode the same texts as
# sequences of word ids. Words outside the vocab_size limit map to '<OOV>'.
card_texts = data_merged['text']
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(card_texts)
sequences = tokenizer.texts_to_sequences(card_texts)

In [33]:
# tokenizer.word_index

### Padding

In [34]:
# Bring every encoded text to exactly max_length ids, using the padding and
# truncation sides chosen in the settings cell.
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)
# padded

### Attach converted mana cost

In [35]:
# Preview converted mana cost as a single-column 2-D array, the layout
# needed to append it to the padded matrix.
data_merged[['convertedManaCost']].to_numpy()

array([[4.],
       [4.],
       [5.],
       ...,
       [5.],
       [2.],
       [5.]])

In [36]:
# Append converted mana cost as one extra column to the right of the token
# ids, and track the new row width in max_length.
cmc_column = data_merged['convertedManaCost'].to_numpy().reshape(-1, 1)
padded = np.concatenate((padded, cmc_column), axis=1)
max_length += 1
# Earlier experiment (mana cost as the only feature):
# padded = data_merged['convertedManaCost'].to_numpy().reshape(-1, 1)
# max_length = 1

### get rarity dummies

In [37]:
# Width of one feature row after the columns appended so far.
padded.shape[1:]

(288,)

In [38]:
# One-hot encode rarity (first level dropped) and append the indicator
# columns to the feature matrix.
# max_length grows by the actual number of indicator columns rather than a
# hard-coded 3, so it stays in step with `padded` whatever rarity levels
# the data contains.
rarity_dummies = pd.get_dummies(data_merged['rarity'], drop_first=True).to_numpy()
padded = np.append(padded, rarity_dummies, axis=1)
max_length = max_length + rarity_dummies.shape[1]

### Power

In [39]:
# Power is parsed as a number; values that cannot be parsed become NaN
# (errors='coerce') and are then replaced by the mean of the parsed values.
# The result is assigned back explicitly: calling fillna(..., inplace=True)
# on data_merged['power'] mutates a temporary column selection, which pandas
# warns about and which does not update the frame under copy-on-write.
power_numeric = pd.to_numeric(data_merged['power'], errors='coerce')
data_merged['power'] = power_numeric.fillna(power_numeric.mean())

In [40]:
# Append power as one more column on the right of the feature matrix.
power_column = data_merged[['power']].to_numpy()
padded = np.concatenate((padded, power_column), axis=1)

In [41]:
# Account for the power column that was appended to the feature matrix.
max_length = max_length + 1

### Life

In [42]:
# Missing life values are replaced by the column mean, then life is appended
# as one more feature column and max_length is bumped to match.
# The filled column is assigned back explicitly: fillna(..., inplace=True) on
# data_merged['life'] mutates a temporary column selection, which pandas
# warns about and which does not update the frame under copy-on-write.
life_filled = data_merged['life'].fillna(data_merged['life'].mean())
data_merged['life'] = life_filled
padded = np.append(padded, life_filled.to_numpy().reshape(-1, 1), axis=1)
max_length += 1

### Preprocess prices

In [20]:
# prices.describe()
# prices.head()

In [43]:
# Regression target: the price column of the merged card table.
prices_filtered = data_merged.loc[:, 'price']

In [44]:
# Target as a NumPy array, aligned row by row with `padded`.
prices_np = np.asarray(prices_filtered)

### train, test, validation split (80, 10, 10)

In [45]:
# 80 / 10 / 10 split: hold out 20% of the rows, then cut the hold-out in half
# into test and validation sets.
# A fixed random_state makes the split identical on every run, so results
# are comparable across kernel restarts.
SPLIT_SEED = 42
X_train, X_holdout, y_train, y_holdout = train_test_split(
    padded, prices_np, test_size=0.2, random_state=SPLIT_SEED
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED
)

### Standardize

In [46]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
y_test = y_scaler.transform(y_test.reshape(-1, 1))
y_val = y_scaler.transform(y_val.reshape(-1, 1))

## Model

In [47]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, 256, input_length=max_length),
    tf.keras.layers.LSTM(256, activation='relu', return_sequences=True),
#     tf.keras.layers.Dense(256),
    tf.keras.layers.Dense(128),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1)
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 293, 256)          262144    
_________________________________________________________________
lstm (LSTM)                  (None, 293, 256)          525312    
_________________________________________________________________
dense (Dense)                (None, 293, 128)          32896     
_________________________________________________________________
dense_1 (Dense)              (None, 293, 64)           8256      
_________________________________________________________________
flatten (Flatten)            (None, 18752)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 18753     
Total params: 847,361
Trainable params: 847,361
Non-trainable params: 0
__________________________________________________

### Tensorboard

In [48]:
# One TensorBoard run directory per training run, named by start time.
# os.path.join builds logs/fit/<timestamp> with the platform's separator; the
# hard-coded "logs\\fit\\" prefix only forms nested directories on Windows.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join("logs", "fit", run_stamp)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch=0)

In [49]:
# Adam with a small learning rate (RMSprop(0.0005) was used before).
# Loss is mean squared error; MAE and MSE are also reported as metrics.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['mae', 'mse'],
)

In [50]:
# Sanity check: width of one training row next to the model's size settings.
(X_train[0].shape, max_length, vocab_size, max_length)

((293,), 293, 1024, 293)

### Train

In [51]:
# Training run: validation metrics are computed on the validation split and
# everything is logged through the TensorBoard callback.
batch_size = 128
epochs = 500
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[tensorboard_callback],
)

Train on 16521 samples, validate on 2066 samples
Epoch 1/500
  128/16521 [..............................] - ETA: 5:12

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  indices[85,287] = -1 is not in [0, 1024)
	 [[node sequential/embedding/embedding_lookup (defined at <ipython-input-51-f9524d83fe1c>:4) ]]
	 [[sequential/embedding/embedding_lookup/_20]]
  (1) Invalid argument:  indices[85,287] = -1 is not in [0, 1024)
	 [[node sequential/embedding/embedding_lookup (defined at <ipython-input-51-f9524d83fe1c>:4) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_2491]

Errors may have originated from an input operation.
Input Source operations connected to node sequential/embedding/embedding_lookup:
 sequential/embedding/embedding_lookup/1621 (defined at C:\Users\Przemyslaw\.conda\envs\mtg\lib\contextlib.py:112)

Input Source operations connected to node sequential/embedding/embedding_lookup:
 sequential/embedding/embedding_lookup/1621 (defined at C:\Users\Przemyslaw\.conda\envs\mtg\lib\contextlib.py:112)

Function call stack:
distributed_function -> distributed_function


### read some value

In [None]:
# Map token id -> word (the tokenizer stores word -> id).
reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}


def decode_sentence(text):
    """Turn one feature row back into readable card text.

    Parameters
    ----------
    text : array-like, 1-D
        One row of the feature matrix (e.g. ``X_train[70]``).

    Returns
    -------
    str
        The words for the row's token ids joined by spaces. Ids with no
        entry in the word index are shown as '?'; id 0 (padding) is skipped.

    Notes
    -----
    * The row is reshaped to 2-D before ``scaler.inverse_transform`` because
      scikit-learn transformers expect 2-D input.
    * If ``scaler`` was fitted on as many columns as the row has, the whole
      row is un-scaled first. If it was fitted on fewer columns, the leading
      columns are taken as raw token ids and the trailing scaler-width
      columns are treated as non-text features and left out.
    * Values are rounded to integers before the lookup: un-scaling returns
      floats, which do not reliably match the integer keys of the word index.
    """
    row = np.asarray(text, dtype=float).reshape(1, -1)
    n_scaled_cols = scaler.mean_.shape[0]
    if n_scaled_cols == row.shape[1]:
        token_values = scaler.inverse_transform(row)[0]
    else:
        token_values = row[0, :row.shape[1] - n_scaled_cols]
    token_ids = np.rint(token_values).astype(int)
    return ' '.join(reverse_word_index.get(int(i), '?') for i in token_ids if i != 0)

def decode_price(val):
    """Convert standardised price value(s) back to the original price scale.

    Parameters
    ----------
    val : scalar or array-like
        Value(s) produced by ``y_scaler`` (e.g. ``y_train[i]``).

    Returns
    -------
    numpy.ndarray
        Un-scaled price(s) with the same shape as ``val``.

    The input is reshaped to a single column before the call because
    scikit-learn transformers expect 2-D input, and the result is reshaped
    back so callers get the shape they passed in.
    """
    scaled = np.asarray(val, dtype=float)
    return y_scaler.inverse_transform(scaled.reshape(-1, 1)).reshape(scaled.shape)

In [None]:
# Among the first 100 training rows, print the index and the un-scaled price
# of every card whose price is above 1.
for i in range(100):
    price = decode_price(y_train[i])
    if not price > 1:
        continue
    print(i)
    print(price)

In [None]:
# Decode one training row back into words as a spot check.
sample_row = X_train[70]
print(decode_sentence(sample_row))

In [None]:
# Look up the rows whose card name contains "Zur the Enchanter".
name_matches = data_merged['name'].str.contains("Zur the Enchanter")
data_merged[name_matches]