This notebook will evaluate the text using TFIDF and some other metrics.

In [1]:
import re
import nltk
import glob
import json
import numpy as np
from basic import *

In [2]:
raw = fetchData("cards.json")
data = cleanData(raw)
colors = list(data.keys())

In [3]:
def tokenizeBucket(listColor): # This takes ONE list, e.g. 'W'
    tokensList = []
    means = [] # This function also gets the mean!!

    for cardDict in listColor:
        if 'oracle_text' in list(cardDict.keys()):
            tokens = nltk.word_tokenize(cardDict['oracle_text'])
            for t in tokens:
                tokensList.append(t)
            means.append(len(tokens))

    print("Mean = " + str(np.mean(means)))

    return tokensList

In [4]:
tokens = [] # will be a list(list(str))

for i, key in enumerate(data.keys()):
    print(colors[i])
    tokens.append(tokenizeBucket(data[key]))

W
Mean = 31.297722705961153
U
Mean = 34.77502984820058
B
Mean = 33.60487641950568
R
Mean = 33.38563423331635
G
Mean = 32.880027619540826
colorless
Mean = 34.478184991274


In [5]:
def print_common_ngrams(tokenlist, ngram, qty):

    tokenlist = [w for w in tokenlist if w not in stoplist] 

    raw = nltk.ngrams(tokenlist, ngram)
    fdist = nltk.FreqDist(raw)
    for pair in fdist.most_common(qty): # print n most common bigrams
        print(pair[0]) # most_common gives list(tuple(tuple(str, str), int)) so this is how we extract the words2
    print()


In [6]:
stoplist = ["{", "}", 'W', 'U', 'B', 'R', 'G', "T", "—", "TK", ",", ":", ".", "'s", "a", "may"]

In [7]:
for i, tl in enumerate(tokens):
    print(colors[i])
    print_common_ngrams(tl, 6, 15)

W
('damage', 'that', 'would', 'be', 'dealt', 'to')
('When', 'CARDNAME', 'enters', 'the', 'battlefield', 'you')
('At', 'the', 'beginning', 'of', 'your', 'upkeep')
('enters', 'the', 'battlefield', 'under', 'your', 'control')
('from', 'your', 'graveyard', 'to', 'the', 'battlefield')
('Flying', 'When', 'CARDNAME', 'enters', 'the', 'battlefield')
('gets', '+1/+1', 'until', 'end', 'of', 'turn')
(')', 'When', 'CARDNAME', 'enters', 'the', 'battlefield')
('on', 'the', 'bottom', 'of', 'your', 'library')
('When', 'CARDNAME', 'enters', 'the', 'battlefield', 'create')
('When', 'CARDNAME', 'enters', 'the', 'battlefield', 'exile')
('all', 'damage', 'that', 'would', 'be', 'dealt')
('the', 'battlefield', 'under', 'its', 'owner', 'control')
('to', 'the', 'battlefield', 'under', 'its', 'owner')
('where', 'X', 'is', 'the', 'number', 'of')

U
('At', 'the', 'beginning', 'of', 'your', 'upkeep')
('on', 'the', 'bottom', 'of', 'your', 'library')
('Flying', 'When', 'CARDNAME', 'enters', 'the', 'battlefield')
('t

In [8]:
def print_collocations(tokenlist, n, f):
    find = nltk.collocations.BigramCollocationFinder.from_words(tokenlist)
    find.apply_freq_filter(f) 
    best = find.nbest(nltk.collocations.BigramAssocMeasures().pmi, 100)

    for pair in best:
        if pair[0][0].upper() == pair[0][0] and pair[1][0].upper() == pair[1][0]:
            best.remove(pair)

    print("** Common Collocations **")
    for i, pair in enumerate(best):
        if i == n:
            break
        print(pair)

    print()

In [9]:
for i, tl in enumerate(tokens):
    print(colors[i])
    print_collocations(tl, 15, 3)

W
** Common Collocations **
('Skipped', 'chapters')
('lasts', 'indefinitely')
('legend', 'rule')
('square', 'brackets')
('Shattered', 'Seraph')
('Spell', 'mastery')
('summoning', 'sickness')
('Basic', 'landcycling')
('Fateful', 'hour')
('counts', 'toward')
('final', 'chapter')
('Totem', 'armor')
('Battle', 'cry')
('Friends', 'forever')
('nor', 'night')

U
** Common Collocations **
('Timing', 'rules')
('Shattered', 'Seraph')
('Spell', 'mastery')
('Totem', 'armor')
('chaos', 'ensues')
('wears', 'off')
('Friends', 'forever')
('counts', 'toward')
('friends', 'forever')
('square', 'brackets')
('Basic', 'landcycling')
('crews', 'Vehicles')
('Secret', 'council')
('Eldrazi', 'Scion')
('done', 'activating')

B
** Common Collocations **
('Basic', 'landcycling')
('Fathomless', 'descent')
('Kher', 'Keep')
('Spell', 'mastery')
('done', 'activating')
('lasts', 'indefinitely')
('square', 'brackets')
('chaos', 'ensues')
('high', 'bid')
('postcombat', 'main')
('precombat', 'main')
('effect', 'lasts')
(

# TF-IDF

In [38]:
# imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import pandas as pd
import glob
import os

In [22]:
# Create card text files

#folder creation
os.makedirs(os.path.join(os.getcwd(), "cards"), exist_ok=True)
cardDir = os.path.join(os.getcwd(), "cards")
for i in colors:
    newDir = os.path.join(cardDir, i)
    os.makedirs(newDir, exist_ok=True)

In [56]:
# make all cards
for color, cardset in data.items():
    
    for card in cardset:
        title = card['name']
        title = re.sub('[!\"\\/:*?\"<>|\"]', "", title)
        title += ".txt"

        path = os.path.join('cards', color, title)
        if os.path.exists(path):
            break # here so you dont waste time on subsequent runs

        desc = card['oracle_text']

        with open(path, 'w+', encoding="utf8") as f:
            f.write(desc)



In [57]:
bigdir = os.path.join(os.getcwd(), 'cards')
dirs = glob.glob(bigdir + "/*") 

In [80]:
models = []

for dir in dirs:
    # get some vars
    list_txts = glob.glob(dir + "/*.txt")
    list_names = [Path(text).stem for text in list_txts]

    # make tdidf model + vectorize
    tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')
    tfidf_vector = tfidf_vectorizer.fit_transform(list_txts)

    # make into df
    tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=list_names, columns=tfidf_vectorizer.get_feature_names_out())
    tfidf_df.stack().reset_index()
    tfidf_df = tfidf_df.stack().reset_index()
    tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})
    top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(15)
    models.append(top_tfidf)

In [81]:
model_b = models[0]
model_colorless = models[1]
model_g = models[2]
model_r = models[3]
model_u = models[4]
model_w = models[5]

In [126]:
colors

['W', 'U', 'B', 'R', 'G', 'colorless']

In [128]:
models = [model_w, model_u, model_b, model_r, model_g, model_colorless] # reassign for printing in the next part

In [129]:
model_u[model_u['term'].str.contains('affinity')].sort_values('tfidf', ascending=False)

Unnamed: 0,document,term,tfidf
7472568,Thought Monitor,affinity,0.666206
5367651,Qumulox,affinity,0.607101
6556362,Somber Hoverguard,affinity,0.607101
7482879,Thoughtcast,affinity,0.591028
3358479,Into Thin Air,affinity,0.499337
6815610,Steelfin Whale,affinity,0.480883
730647,Blinkmoth Infusion,affinity,0.474527
5347029,Quicksilver Behemoth,affinity,0.424388
863217,Broodstar,affinity,0.402339
1069437,Chromescale Drake,affinity,0.385669


In [130]:
filter = model_u.groupby('document').filter(lambda group: 'draw' in group['term'].values and 'card' in group['term'].values)
filter[filter['term'].isin(['draw', 'card'])].shape[0]


1692

In [132]:
def queryTwo(models, w1, w2):
    for i, model in enumerate(models):
        print("Color: " + colors[i])
        filter = model.groupby('document').filter(lambda group: w1 in group['term'].values and w2 in group['term'].values)
        count = filter[filter['term'].isin([w1, w2])].shape[0]
        print(count)


In [147]:
queryTwo(models, 'discard', 'draw')

Color: W
196
Color: U
542
Color: B
252
Color: R
466
Color: G
154
Color: colorless
70
