This notebook evaluates the text using TF-IDF and some other metrics.

In [1]:
import re
import nltk
import glob
import json
import numpy as np
from basic import *

In [2]:
# fetchData / cleanData come from the local `basic` module; presumably they
# load the JSON card dump and group cards into buckets -- confirm there.
raw = fetchData("cards.json")
data = cleanData(raw)

# Bucket names, in the mapping's own iteration order.
colors = list(data)

In [3]:
def tokenizeBucket(listColor):  # This takes ONE list, e.g. 'W'
    """Tokenize the oracle text of every card in one bucket.

    Args:
        listColor: Iterable of card dicts. Cards that have no
            'oracle_text' key are skipped.

    Returns:
        A flat list with the tokens of all cards, in card order, as
        produced by nltk.word_tokenize.

    Side effects:
        Prints "Mean = <value>", the mean number of tokens per card that
        had oracle text ("Mean = nan" when no card had any).
    """
    tokensList = []
    means = []  # token count of each card that has oracle text

    for cardDict in listColor:
        if 'oracle_text' not in cardDict:
            continue
        tokens = nltk.word_tokenize(cardDict['oracle_text'])
        tokensList.extend(tokens)
        means.append(len(tokens))

    # np.mean([]) emits a RuntimeWarning before returning nan; report nan
    # directly so an empty bucket prints the same text without the warning.
    mean = np.mean(means) if means else float("nan")
    print("Mean = " + str(mean))

    return tokensList

In [4]:
# One flat token list per bucket, stored in the same order as `colors`.
tokens = []  # will be a list(list(str))

for key, color in zip(data.keys(), colors):
    print(color)
    bucket_tokens = tokenizeBucket(data[key])
    tokens.append(bucket_tokens)

W
Mean = 31.284996651038178
U
Mean = 34.75746205014498
B
Mean = 33.583333333333336
R
Mean = 33.366955340465275
G
Mean = 32.868634558950454
colorless
Mean = 34.50697836706211


In [5]:
# Tokens dropped by print_common_ngrams before n-grams are built.
# NOTE(review): these look like mana/tap symbol fragments and placeholders
# left over from tokenizing -- confirm against the card data.
stoplist = [
    "{",
    "}",
    "W",
    "U",
    "B",
    "R",
    "G",
    "T",
    "—",
    "TK",
]

In [6]:
def print_common_ngrams(tokenlist, ngram, qty):
    """Print the `qty` most frequent n-grams of length `ngram`.

    Tokens listed in the module-level `stoplist` are removed before the
    n-grams are formed, so an n-gram may span a removed token. Each n-gram
    is printed as a tuple on its own line, followed by one blank line.
    """
    kept = [tok for tok in tokenlist if tok not in stoplist]
    counts = nltk.FreqDist(nltk.ngrams(kept, ngram))

    # most_common yields (ngram_tuple, count) pairs; only the tuple is shown.
    for gram, _count in counts.most_common(qty):
        print(gram)
    print()


In [7]:
# Show the 15 most common 4-grams for each bucket, labelled by its name.
for color, tl in zip(colors, tokens):
    print(color)
    print_common_ngrams(tl, 4, 15)

W
('until', 'end', 'of', 'turn')
('CARDNAME', 'enters', 'the', 'battlefield')
('end', 'of', 'turn', '.')
('enters', 'the', 'battlefield', ',')
('When', 'CARDNAME', 'enters', 'the')
('a', '+1/+1', 'counter', 'on')
('At', 'the', 'beginning', 'of')
('.', 'When', 'CARDNAME', 'enters')
('the', 'beginning', 'of', 'your')
('put', 'a', '+1/+1', 'counter')
('damage', 'that', 'would', 'be')
('that', 'would', 'be', 'dealt')
(',', 'put', 'a', '+1/+1')
('would', 'be', 'dealt', 'to')
('from', 'your', 'graveyard', 'to')

U
('CARDNAME', 'enters', 'the', 'battlefield')
('enters', 'the', 'battlefield', ',')
('When', 'CARDNAME', 'enters', 'the')
('until', 'end', 'of', 'turn')
('end', 'of', 'turn', '.')
('its', 'owner', "'s", 'hand')
('to', 'its', 'owner', "'s")
('At', 'the', 'beginning', 'of')
('owner', "'s", 'hand', '.')
('draw', 'a', 'card', '.')
('ca', "n't", 'be', 'blocked')
('Draw', 'a', 'card', '.')
('the', 'beginning', 'of', 'your')
('of', 'your', 'library', '.')
('.', 'If', 'you', 'do')

B
('unti

In [8]:
def print_collocations(tokenlist, n, filter):
    """Print the `n` best bigram collocations of `tokenlist`, ranked by PMI.

    `filter` is passed to the finder's apply_freq_filter before scoring.
    Output is a "** Common Collocations **" header, one bigram tuple per
    line, then a blank line.
    """
    finder = nltk.collocations.BigramCollocationFinder.from_words(tokenlist)
    finder.apply_freq_filter(filter)

    print("** Common Collocations **")
    scorer = nltk.collocations.BigramAssocMeasures().pmi
    best = finder.nbest(scorer, n)
    for bigram in best:
        print(bigram)
    print()

In [9]:
# Show the 15 best collocations (frequency filter 3) for each bucket.
for color, tl in zip(colors, tokens):
    print(color)
    print_collocations(tl, 15, 3)

W
** Common Collocations **
('Skipped', 'chapters')
('lasts', 'indefinitely')
('legend', 'rule')
('square', 'brackets')
('Rakish', 'Revelers')
('Shattered', 'Seraph')
('Spell', 'mastery')
('summoning', 'sickness')
('Basic', 'landcycling')
('Reinforce', '1—')
('Fateful', 'hour')
('counts', 'toward')
('final', 'chapter')
('Totem', 'armor')
('Awaken', '4—')

U
** Common Collocations **
('Tanazir', 'Quandrix')
('Timing', 'rules')
('Glamorous', 'Outlaw')
('Shattered', 'Seraph')
('Spell', 'mastery')
('Totem', 'armor')
('wears', 'off')
('chaos', 'ensues')
('Basic', 'landcycling')
('Friends', 'forever')
('counts', 'toward')
('friends', 'forever')
('square', 'brackets')
('crews', 'Vehicles')
('Secret', 'council')

B
** Common Collocations **
('Masked', 'Bandits')
('Basic', 'landcycling')
('Fathomless', 'descent')
('Glamorous', 'Outlaw')
('Kher', 'Keep')
('Shattered', 'Seraph')
('Spell', 'mastery')
('done', 'activating')
('lasts', 'indefinitely')
('square', 'brackets')
('chaos', 'ensues')
('high