This notebook evaluates the card text using TF-IDF and some other metrics (mean token counts, common n-grams, and collocations).

In [None]:
import re
import nltk
import glob
import json
import numpy as np
from basic import *

In [None]:
# Load the card dump and run it through the cleaning helper from `basic`.
# NOTE(review): `data` is used below as a mapping of bucket key -> list of
# card dicts; that shape comes from `cleanData` — confirm in `basic`.
raw = fetchData("cards.json")
data = cleanData(raw)
# Bucket keys, in the mapping's own order.
colors = [*data.keys()]

In [None]:
def tokenizeBucket(listColor): # This takes ONE list, e.g. 'W'
    """Tokenize the oracle text of every card in a single bucket.

    Args:
        listColor: iterable of card dicts. Cards that have no
            'oracle_text' key are skipped.

    Returns:
        One flat list with the tokens of all cards, in card order.

    Side effects:
        Prints "Mean = <x>", where <x> is the mean number of tokens per
        card that had text. When no card has text the mean is reported
        as nan (without triggering numpy's empty-slice warning).
    """
    tokensList = []
    lengths = []  # token count of each card that had text

    for cardDict in listColor:
        # Membership test directly on the dict; no temporary key list.
        if 'oracle_text' not in cardDict:
            continue
        tokens = nltk.word_tokenize(cardDict['oracle_text'])
        tokensList.extend(tokens)
        lengths.append(len(tokens))

    # np.mean([]) warns and yields nan; produce the same nan explicitly.
    mean = np.mean(lengths) if lengths else float("nan")
    print("Mean = " + str(mean))

    return tokensList

In [None]:
# One flat token list per bucket, in the same order as `colors`:
# list(list(str)).
tokens = []

for idx, bucket in enumerate(data.values()):
    print(colors[idx])
    tokens.append(tokenizeBucket(bucket))

In [None]:
def print_common_ngrams(tokenlist, ngram, qty):
    """Print the `qty` most frequent n-grams of size `ngram`.

    Tokens present in the module-level `stoplist` are removed before the
    n-grams are built, so a printed n-gram may span a removed token.
    Each n-gram is printed as a tuple on its own line, followed by one
    blank line.
    """
    kept = list(filter(lambda tok: tok not in stoplist, tokenlist))
    counts = nltk.FreqDist(nltk.ngrams(kept, ngram))

    # most_common yields (ngram_tuple, count) pairs; only the n-gram is shown.
    for gram, _count in counts.most_common(qty):
        print(gram)
    print()


In [None]:
# Tokens dropped before n-grams are counted.
# NOTE(review): the first groups look like mana/tap symbol fragments left
# over from tokenizing strings such as "{W}" or "{T}" — confirm.
stoplist = [
    "{", "}",
    'W', 'U', 'B', 'R', 'G',
    "T",
    "—", "TK",
    ",", ":", ".",
    "'s", "a", "may",
]

In [None]:
# For every bucket: print its label, then its 15 most frequent 6-grams.
for idx, bucket_tokens in enumerate(tokens):
    print(colors[idx])
    print_common_ngrams(bucket_tokens, ngram=6, qty=15)

In [None]:
def print_collocations(tokenlist, n, f):
    """Print up to `n` bigram collocations of `tokenlist`, ranked by PMI.

    Args:
        tokenlist: sequence of string tokens.
        n: maximum number of collocations to print.
        f: minimum frequency passed to the finder's frequency filter.

    The 100 best-scoring bigrams are fetched first; pairs in which both
    words start with a character that is unchanged by ``str.upper()`` are
    then discarded. Note that this predicate matches capitalised words
    but also words starting with a digit or punctuation.
    """
    find = nltk.collocations.BigramCollocationFinder.from_words(tokenlist)
    find.apply_freq_filter(f)
    candidates = find.nbest(nltk.collocations.BigramAssocMeasures().pmi, 100)

    # Build a new list instead of calling remove() on the list being
    # iterated: removing in place shifts the remaining items and makes the
    # loop skip the element right after every removal.
    best = [
        pair
        for pair in candidates
        if not (
            pair[0][0].upper() == pair[0][0]
            and pair[1][0].upper() == pair[1][0]
        )
    ]

    print("** Common Collocations **")
    for i, pair in enumerate(best):
        if i == n:
            break
        print(pair)

    print()

In [None]:
# For every bucket: print its label, then up to 15 collocations whose
# bigrams pass a frequency filter of 3.
for idx, bucket_tokens in enumerate(tokens):
    print(colors[idx])
    print_collocations(bucket_tokens, n=15, f=3)

# TF-IDF

In [None]:
# imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import pandas as pd
import glob
import os

In [None]:
# Root folder whose sub-directories each hold the .txt files of one bucket.
bigdir = os.path.join(os.getcwd(), 'cards')
# glob returns entries in arbitrary, OS-dependent order. Sort them so that
# anything indexing the per-directory results by position is deterministic,
# and keep directories only so a stray file is not treated as a bucket.
dirs = sorted(
    path
    for path in glob.glob(os.path.join(bigdir, "*"))
    if os.path.isdir(path)
)

In [None]:
# One DataFrame per directory in `dirs`, holding each document's 15
# highest-scoring TF-IDF terms (columns: document, term, tfidf).
models = []

# `color_dir` rather than `dir`, so the builtin dir() is not shadowed.
for color_dir in dirs:
    # Documents of this directory; the file stem is used as document name.
    list_txts = glob.glob(os.path.join(color_dir, "*.txt"))
    list_names = [Path(text).stem for text in list_txts]

    # Fit a TF-IDF model on the files themselves (input='filename' makes
    # the vectorizer read each path) with English stop words removed.
    tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')
    tfidf_vector = tfidf_vectorizer.fit_transform(list_txts)

    # Wide matrix (documents x terms) -> long table with one row per
    # (document, term) pair.
    tfidf_df = pd.DataFrame(
        tfidf_vector.toarray(),
        index=list_names,
        columns=tfidf_vectorizer.get_feature_names_out(),
    )
    tfidf_df = tfidf_df.stack().reset_index()
    tfidf_df = tfidf_df.rename(
        columns={0: 'tfidf', 'level_0': 'document', 'level_1': 'term'}
    )

    # Keep the 15 best terms of every document, best first.
    top_tfidf = (
        tfidf_df
        .sort_values(by=['document', 'tfidf'], ascending=[True, False])
        .groupby(['document'])
        .head(15)
    )
    models.append(top_tfidf)

In [None]:
# Name the first six per-directory tables.
# NOTE(review): this positional mapping assumes `models` follows the
# alphabetical order of the bucket directories (b, colorless, g, r, u, w)
# — verify against how `dirs` was built.
(
    model_b,
    model_colorless,
    model_g,
    model_r,
    model_u,
    model_w,
) = (models[pos] for pos in range(6))

In [None]:
# Inspect the column labels of the first per-directory TF-IDF table.
model_b.keys()

In [None]:
# Display the table itself as the notebook cell output.
model_b