This notebook will evaluate the text using TFIDF and some other metrics.

In [1]:
import re
import nltk
import glob
import json
import numpy as np
from basic import *

In [2]:
raw = fetchData("cards.json")
data = cleanData(raw)
colors = list(data.keys())

In [3]:
def tokenizeBucket(listColor): # This takes ONE list, e.g. 'W'
    tokensList = []
    means = [] # This function also gets the mean!!

    for cardDict in listColor:
        if 'oracle_text' in list(cardDict.keys()):
            tokens = nltk.word_tokenize(cardDict['oracle_text'])
            for t in tokens:
                tokensList.append(t)
            means.append(len(tokens))

    print("Mean = " + str(np.mean(means)))

    return tokensList

In [4]:
tokens = [] # will be a list(list(str))

for i, key in enumerate(data.keys()):
    print(colors[i])
    tokens.append(tokenizeBucket(data[key]))

W
Mean = 31.297722705961153
U
Mean = 34.77502984820058
B
Mean = 33.60504342017368
R
Mean = 33.38563423331635
G
Mean = 32.880027619540826
colorless
Mean = 34.478184991274


In [5]:
stoplist = ["{", "}", 'W', 'U', 'B', 'R', 'G', "T", "—", "TK"]

In [6]:
def print_common_ngrams(tokenlist, ngram, qty):

    tokenlist = [w for w in tokenlist if w not in stoplist] 

    raw = nltk.ngrams(tokenlist, ngram)
    fdist = nltk.FreqDist(raw)
    for pair in fdist.most_common(qty): # print n most common bigrams
        print(pair[0]) # most_common gives list(tuple(tuple(str, str), int)) so this is how we extract the words2
    print()


In [7]:
for i, tl in enumerate(tokens):
    print(colors[i])
    print_common_ngrams(tl, 4, 15)

W
('until', 'end', 'of', 'turn')
('CARDNAME', 'enters', 'the', 'battlefield')
('end', 'of', 'turn', '.')
('enters', 'the', 'battlefield', ',')
('When', 'CARDNAME', 'enters', 'the')
('a', '+1/+1', 'counter', 'on')
('At', 'the', 'beginning', 'of')
('.', 'When', 'CARDNAME', 'enters')
('the', 'beginning', 'of', 'your')
('put', 'a', '+1/+1', 'counter')
('damage', 'that', 'would', 'be')
('that', 'would', 'be', 'dealt')
(',', 'put', 'a', '+1/+1')
('would', 'be', 'dealt', 'to')
('from', 'your', 'graveyard', 'to')

U
('CARDNAME', 'enters', 'the', 'battlefield')
('enters', 'the', 'battlefield', ',')
('When', 'CARDNAME', 'enters', 'the')
('until', 'end', 'of', 'turn')
('end', 'of', 'turn', '.')
('its', 'owner', "'s", 'hand')
('to', 'its', 'owner', "'s")
('At', 'the', 'beginning', 'of')
('owner', "'s", 'hand', '.')
('draw', 'a', 'card', '.')
('ca', "n't", 'be', 'blocked')
('Draw', 'a', 'card', '.')
('the', 'beginning', 'of', 'your')
('of', 'your', 'library', '.')
('.', 'If', 'you', 'do')

B
('unti

In [8]:
def print_collocations(tokenlist, n, f):
    find = nltk.collocations.BigramCollocationFinder.from_words(tokenlist)
    find.apply_freq_filter(f) 
    best = find.nbest(nltk.collocations.BigramAssocMeasures().pmi, 100)

    for pair in best:
        if bool(re.search('^[A-Z].*', pair[0])) and bool(re.search('^[A-Z].*', pair[1])):
            best.remove(pair)

    print("** Common Collocations **")
    for i, pair in enumerate(best):
        if i == n:
            break
        print(pair)

    print()

In [9]:
for i, tl in enumerate(tokens):
    print(colors[i])
    print_collocations(tl, 15, 3)

W
** Common Collocations **
('Skipped', 'chapters')
('lasts', 'indefinitely')
('legend', 'rule')
('square', 'brackets')
('Shattered', 'Seraph')
('Spell', 'mastery')
('summoning', 'sickness')
('Basic', 'landcycling')
('Fateful', 'hour')
('counts', 'toward')
('final', 'chapter')
('Totem', 'armor')
('Mirrodin', '!')
('Battle', 'cry')
('Friends', 'forever')

U
** Common Collocations **
('Timing', 'rules')
('Shattered', 'Seraph')
('Spell', 'mastery')
('Totem', 'armor')
('chaos', 'ensues')
('wears', 'off')
('Friends', 'forever')
('counts', 'toward')
('friends', 'forever')
('square', 'brackets')
('Basic', 'landcycling')
('crews', 'Vehicles')
('Secret', 'council')
('Eldrazi', 'Scion')
('done', 'activating')

B
** Common Collocations **
('Basic', 'landcycling')
('Fathomless', 'descent')
('Kher', 'Keep')
('Spell', 'mastery')
('done', 'activating')
('lasts', 'indefinitely')
('square', 'brackets')
('chaos', 'ensues')
('high', 'bid')
('postcombat', 'main')
('precombat', 'main')
('effect', 'lasts')


# TF-IDF

In [None]:
# imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import pandas as pd
import glob

'card'

In [None]:
# making tdidf model + vectorizing 
tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')
tfidf_vector = tfidf_vectorizer.fit_transform(list_txts)