In [None]:
import numpy as np
from utilities import *
import numpy as np
from numpy import dot

def norm_vec(v):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : array-like
        The vector to normalise.

    Returns
    -------
    numpy.ndarray
        A new float array ``v / ||v||``. A vector whose norm is zero has no
        direction, so it is returned as an all-zero array instead of being
        divided by zero (which would fill the result with NaNs).
    """
    v = np.asarray(v, dtype=float)
    length = np.linalg.norm(v)
    if length == 0:
        # e.g. a document containing none of the vocabulary words
        return np.zeros_like(v)
    return v / length
# Print numpy floats with three digits of precision.
np.set_printoptions(precision=3)

# NOTE(review): this star import presumably supplies the Matrix and init_printing
# names used in this notebook; it also pulls every other public sympy name into
# the namespace, so confirm nothing defined earlier is shadowed.
from sympy import *
# Enable pretty-printed output for displayed expressions.
init_printing()

<h1>Seasons Represented as Matrices</h1>

Now we are going to do much of what is in Notebook 9 from a slightly different perspective. Namely, we are going to take all of the document vectors for the corpus, and think of them as the columns in one big matrix.

There are a couple of reasons to do this:

1. The less interesting reason: It can make it possible for us to write more compact code.
2. The more interesting reason: Creating matrices in this manner provides new ways to conceptualize what we're doing. These new conceptualizations provide launching points for more advanced algorithms. (We won't get to these more advanced algorithms today).

Bruce: Go back to the slides here

## Get everything ready, pretty much as before

**Load the corpus**

In [None]:
from seasons_module import load_seasons_corpus
# The cells below index the result as seasons_corpus[name][0] (the document's
# words) and seasons_corpus[name][1] (a code compared against "none" and against
# the predicted code).
seasons_corpus = load_seasons_corpus()

**Compile the vocabulary in the usual way.**

In [None]:
# Collect every distinct word that appears in any document of the corpus.
set_vocab = set()
for fname in seasons_corpus.keys():
    set_vocab.update(seasons_corpus[fname][0])

# Read the stop list (one entry per line). The context manager closes the file
# handle even if reading fails.
with open("lists/seasons_stop_list.txt") as f:
    stop_list = set(f.read().split("\n"))

# The vocabulary with stop words removed. It is a set, so it carries no order.
pruned_vocab = set_vocab - stop_list

**Compute the corpus and document frequency for each term.**

In [None]:
# Count the words of each document once, so the loop below is a dictionary
# lookup per (word, document) pair instead of two scans of the whole document.
doc_word_counts = {
    name: nltk.FreqDist(seasons_corpus[name][0]) for name in seasons_corpus.keys()
}

word_fdist = nltk.FreqDist()  # corpus frequencies: total occurrences of each word
doc_fdist = nltk.FreqDist()  # document frequencies: number of documents containing each word
for word in pruned_vocab:
    # Every vocabulary word gets an entry, in vocabulary iteration order.
    word_fdist[word] = 0
    doc_fdist[word] = 0
    for name in seasons_corpus.keys():
        occurrences = doc_word_counts[name][word]
        if occurrences:
            doc_fdist[word] += 1
            word_fdist[word] += occurrences

### Create a very small vocabulary 

Just 10 words, to make it simpler to understand what's going on.

In [None]:
# Keep only the ten words with the highest corpus frequency.
top_terms = word_fdist.most_common(10)
small_vocab = [term for term, _count in top_terms]
print(small_vocab)

**Compute the weighted document vector for each document**

Same as before, but now using our smaller vocabulary.

In [None]:
def tf(tf, df, cf, N):
    """Raw term-frequency weighting: the weight is the term count itself.

    ``df``, ``cf`` and ``N`` are accepted only so that all weighting functions
    share one signature; they are ignored here.
    """
    return tf

def logtf(tf, df, cf, N):
    """Log-scaled term-frequency weighting.

    Returns 0 when the term does not occur (``tf == 0``), otherwise
    ``1 + ln(tf)``. ``df``, ``cf`` and ``N`` are unused.
    """
    if tf == 0:
        return 0
    return 1 + np.log(tf)

def onehot(tf, df, cf, N):
    """Presence/absence weighting: 1 if the term occurs at all, else 0.

    ``df``, ``cf`` and ``N`` are unused.
    """
    return 0 if tf == 0 else 1

def tfidf(tf, df, cf, N):
    """Log-tf times idf weighting: ``(1 + ln(tf)) * ln(N / df)``.

    Parameters
    ----------
    tf : number of times the term occurs in the document.
    df : number of documents that contain the term.
    cf : unused; kept so all weighting functions share one signature.
    N  : value divided by ``df`` inside the logarithm (the callers in this
         notebook pass the number of corpus documents).

    Returns
    -------
    0 when the term does not occur in the document, or when ``df`` is 0
    (the idf is undefined, so the term is given no weight rather than
    dividing by zero); otherwise the tf-idf weight as a float.
    """
    if tf == 0 or df == 0:
        return 0
    # float() keeps N / df a true division even when both are integers.
    return (1 + np.log(tf)) * np.log(float(N) / df)

def compute_vector(words, vocab, df, N, weight_function):
    """Build the normalised weighted vector of one document.

    For each term in ``vocab`` (in order) the term is counted in ``words`` and
    the count is passed to ``weight_function(count, df[term], 0, N)``; the
    corpus-frequency argument is always 0. The resulting weights are returned
    as a numpy array passed through ``norm_vec``.
    """
    weights = [
        weight_function(words.count(term), df[term], 0, N)
        for term in vocab
    ]
    return norm_vec(np.array(weights))

In [None]:
# compute the document vector for each document
doc_vectors = {}
N = len(seasons_corpus.keys())
wf = tf
for fname in seasons_corpus.keys():
    doc_vectors[fname] = compute_vector(seasons_corpus[fname][0], small_vocab, doc_fdist, N, wf)

In [None]:
# Sanity check: the number of document vectors, and the length of one of them
# (one entry per word of small_vocab).
print(len(doc_vectors), len(doc_vectors['angelapre']))

## Create a termxdocument matrix

This is a matrix where every row corresponds to a word in the vocabulary, and every column corresponds to a document.

Another way to say this: Each column in the matrix is the document vector for a document.

In [None]:
# Fix a column order for the documents, then copy each document vector into
# its column: one row per vocabulary word, one column per document.
name_list = list(doc_vectors.keys())
name_index = {name: i for i, name in enumerate(name_list)}
td_matrix = np.zeros([len(small_vocab), len(doc_vectors)])
for name, i in name_index.items():
    td_matrix[:, i] = doc_vectors[name]

In [None]:
# (number of vocabulary words, number of documents)
td_matrix.shape

In [None]:
def round_matrix(the_matrix, prec = 2):
    """Return a rounded copy of ``the_matrix`` for display.

    Parameters
    ----------
    the_matrix : numpy.ndarray
        Array of any number of dimensions.
    prec : int, optional
        Number of decimal places to keep (default 2).

    Returns
    -------
    numpy.ndarray
        A new array with every entry rounded to ``prec`` decimals. The input
        is left untouched, so displaying a matrix never changes the values
        that later computations read from it.
    """
    return np.round(the_matrix, prec)

In [None]:
# Display the term x document matrix, rounded with round_matrix's default precision.
Matrix(round_matrix(td_matrix))

#### Document x document similarity

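We can get the similarity of every pair of documents by multiplying the transpose of the termxdocument matrix by the termxdocument matrix itself. Because each column (document vector) was normalized to unit length, each entry of the product is the cosine similarity between two documents.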

In [None]:
# Entry (i, j) is the dot product of the vectors of documents i and j.
dd_matrix = td_matrix.T.dot(td_matrix)

In [None]:
# (number of documents, number of documents)
dd_matrix.shape

In [None]:
# Display the document x document matrix, rounded with round_matrix's default precision.
Matrix(round_matrix(dd_matrix))

**Load the comparison vectors and build a matrix with them.**

In [None]:
from seasons_module import load_seasons_comparison_files
# The cells below treat comparison_dict[name] as the words of one comparison document.
comparison_dict = load_seasons_comparison_files()

In [None]:
# Compute vectors for the comparison documents
comparison_vectors = {}
for fname in comparison_dict.keys():
    comparison_vectors[fname] = compute_vector(comparison_dict[fname], small_vocab, doc_fdist, N, wf)

In [None]:
# Term x comparison-document matrix: one row per vocabulary word, one column
# per comparison document; cdname_index records each document's column.
cdname_index = {name: i for i, name in enumerate(comparison_vectors.keys())}
ctd_matrix = np.zeros([len(small_vocab), len(comparison_vectors)])
for name, i in cdname_index.items():
    ctd_matrix[:, i] = comparison_vectors[name]

In [None]:
# Display the term x comparison-document matrix, rounded with round_matrix's default precision.
Matrix(round_matrix(ctd_matrix))

In [None]:
# Display the transposed term x document matrix: one row per document.
Matrix(round_matrix(td_matrix.transpose()))

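We can multiply the transpose of the termxdocument matrix by the comparison matrix in order to get the similarities between the student transcripts and the comparison documents. In the result, each row corresponds to a student transcript and each column to a comparison document.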

In [None]:
# Entry (i, j) is the dot product of corpus document i with comparison document j.
code_matrix = td_matrix.T.dot(ctd_matrix)

In [None]:
# Display the document x comparison-document matrix, rounded with round_matrix's default precision.
Matrix(round_matrix(code_matrix))

In [None]:
# Map each column number of code_matrix back to its comparison-document name.
inverted_cdname_index = {column: cdname for cdname, column in cdname_index.items()}

# Label every corpus document with the comparison document whose column holds
# the largest value in that document's row (the first column wins on ties).
student_codes = {}
for name in seasons_corpus.keys():
    similarities = list(code_matrix[name_index[name]])
    best_column = similarities.index(max(similarities))
    student_codes[name] = inverted_cdname_index[best_column]
print(student_codes)

**How similar are our results to the codes assigned by human coders?**

In [None]:
def compute_accuracy():
    """Fraction of coded documents whose predicted code matches the corpus code.

    Reads the module-level ``student_codes`` (predicted code per document) and
    ``seasons_corpus`` (whose ``[name][1]`` entry is the reference code).
    Documents whose reference code is the string "none" are left out of both
    the numerator and the denominator.

    Returns
    -------
    float
        ``number_right / total_possible``, or 0.0 when no document has a
        reference code (instead of dividing by zero).
    """
    labeled = [
        name for name in student_codes.keys()
        if seasons_corpus[name][1] != "none"
    ]
    if not labeled:
        return 0.0
    number_right = sum(
        1 for name in labeled
        if student_codes[name] == seasons_corpus[name][1]
    )
    return 1.0 * number_right / len(labeled)

In [None]:
# Share of coded documents for which the predicted code equals the corpus code.
compute_accuracy()

In [None]:
# Pair up reference and predicted codes for every document that has a
# reference code, then tabulate them against each other.
labeled_names = [
    name for name in student_codes.keys()
    if seasons_corpus[name][1] != "none"
]
gold_list = [seasons_corpus[name][1] for name in labeled_names]
test_list = [student_codes[name] for name in labeled_names]
cm = nltk.ConfusionMatrix(gold_list, test_list)
print(cm)