In [None]:
%matplotlib inline
import numpy as np
from numpy import dot

In [None]:
def norm_vec(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        The vector to normalise.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by ``np.linalg.norm(v)``. When that norm is zero an
        all-zero float array of the same shape is returned instead.
    """
    v = np.asarray(v)
    length = np.linalg.norm(v)
    if length == 0:
        # Dividing by a zero norm would yield NaNs (plus a RuntimeWarning),
        # and every dot product taken with such a vector would be NaN too.
        return np.zeros(v.shape, dtype=float)
    return v / length
np.set_printoptions(precision=3)

In [None]:
from utilities import *

<h2>The Seasons Corpus</h2>

Bruce: Go to slides here to show examples from the data

In [None]:
from seasons_module import load_seasons_corpus

**Load the corpus**

In [None]:
seasons_corpus = load_seasons_corpus()

In [None]:
print(seasons_corpus["angelapre"])

**Compile the vocabulary**

This is every unique word in the corpus.

In [None]:
# Accumulate the word list (item [0]) of every corpus entry into one set.
set_vocab = set()
for entry in seasons_corpus.values():
    set_vocab |= set(entry[0])

Read in a stop list, then remove all of its words from the vocabulary.

In [None]:
# Read the stop list, splitting the file contents on newlines.
# The context manager closes the file handle as soon as the contents are
# read, instead of leaving it open for the rest of the kernel session.
with open("lists/seasons_stop_list.txt") as f:
    stop_list = set(f.read().split("\n"))

In [None]:
# Vocabulary without stop words, kept as a *sorted list*.
# Wrapping the sorted result in set() (as before) threw the ordering away, so
# the dimension order of every document vector depended on string-hash order.
# A list gives each vocabulary word a fixed, reproducible vector position.
pruned_vocab = sorted(w for w in set_vocab if w not in stop_list)

In [None]:
len(pruned_vocab)

**Compute the document vector for each document**

In [None]:
# Raw count vector per corpus entry: one component per word in pruned_vocab,
# holding how often that word occurs in the entry's word list (item [0]).
doc_vectors = {
    fname: np.array([entry[0].count(word) for word in pruned_vocab])
    for fname, entry in seasons_corpus.items()
}

In [None]:
print(doc_vectors["angelapre"])

**Normalize the vectors**

In [None]:
# Replace each stored count vector with its norm_vec()-scaled version.
for fname in list(doc_vectors):
    doc_vectors[fname] = norm_vec(doc_vectors[fname])

In [None]:
print(doc_vectors["angelapre"])

**Compare some pairs of students**

In [None]:
def compare_students(s1, s2):
    """Return the dot product of two document vectors, rounded to 3 places.

    ``s1`` and ``s2`` are keys of the module-level ``doc_vectors`` dict.
    When those vectors have unit length the dot product is their cosine
    similarity.
    """
    first_vec = doc_vectors[s1]
    second_vec = doc_vectors[s2]
    similarity = dot(first_vec, second_vec)
    return round(similarity, 3)

In [None]:
compare_students('alipre', 'vanessapre')

In [None]:
# Tabulate each document's similarity to 'angelapre' next to the code stored
# with that document in the corpus (item [1] of its entry).
tab = ListTable()
tab.append(["name", "similarity", "code"])
for name in doc_vectors:
    similarity_text = str(compare_students(name, 'angelapre'))
    corpus_code = seasons_corpus[name][1]
    tab.append([name, similarity_text, corpus_code])
tab

**Compare to pre-written comparison documents**

In [None]:
from seasons_module import load_seasons_comparison_files
comparison_dict = load_seasons_comparison_files()

In [None]:
# Compute vectors for the comparison documents.
# Each one counts the pruned_vocab words in the comparison document and is
# scaled with norm_vec(), so it can be compared to a student vector by a dot
# product. (The original line was missing the closing parenthesis of the
# norm_vec(...) call and did not parse.)
comparison_vectors = {}
for fname in comparison_dict.keys():
    comparison_vectors[fname] = norm_vec(
        np.array([comparison_dict[fname].count(word) for word in pruned_vocab])
    )

In [None]:
def compare_to_compvecs(s1):
    """Score one document against every comparison vector.

    ``s1`` is a key of the module-level ``doc_vectors``. The result is a dict
    mapping each key of ``comparison_vectors`` to the dot product of that
    comparison vector with ``doc_vectors[s1]``.
    """
    return {
        cname: dot(doc_vectors[s1], comp_vec)
        for cname, comp_vec in comparison_vectors.items()
    }

In [None]:
compare_to_compvecs("angelapre")

In [None]:
def max_from_dict(the_dict):
    """Return the key of ``the_dict`` whose value is largest.

    On ties the first such key in iteration order wins; an empty dict makes
    ``max`` raise ``ValueError``.
    """
    return max(the_dict, key=the_dict.get)

# Label every document with the name of its best-scoring comparison vector.
student_codes = {
    name: max_from_dict(compare_to_compvecs(name)) for name in doc_vectors
}

**How similar are our results to the codes assigned by human coders?**

In [None]:
def compute_accuracy(codes=None, corpus=None):
    """Fraction of human-coded documents whose assigned code matches.

    Parameters
    ----------
    codes : dict, optional
        Maps document name -> assigned code. Defaults to the module-level
        ``student_codes`` (looked up at call time).
    corpus : dict, optional
        Maps document name -> sequence whose item ``[1]`` is the human code.
        Defaults to the module-level ``seasons_corpus``.

    Returns
    -------
    float
        ``number_right / total_possible`` over the documents whose human code
        is not the string ``"none"``; ``0.0`` when there are no such
        documents (previously a ZeroDivisionError).
    """
    if codes is None:
        codes = student_codes
    if corpus is None:
        corpus = seasons_corpus

    number_right = 0
    total_possible = 0
    for name, assigned in codes.items():
        gold = corpus[name][1]
        if gold == "none":
            # Documents without a human code are excluded from the score.
            continue
        total_possible += 1
        if assigned == gold:
            number_right += 1

    if total_possible == 0:
        return 0.0
    return 1.0 * number_right / total_possible

In [None]:
compute_accuracy()

In [None]:
# Parallel lists of human codes and assigned codes, restricted to documents
# whose human code is not "none", fed to an nltk confusion matrix.
gold_list = [
    seasons_corpus[name][1]
    for name in student_codes
    if seasons_corpus[name][1] != "none"
]
test_list = [
    student_codes[name]
    for name in student_codes
    if seasons_corpus[name][1] != "none"
]
cm = nltk.ConfusionMatrix(gold_list, test_list)
cm

In [None]:
print(cm)

## Some slightly different ways of computing document vectors

### First variant: use just a subset of the vocabulary when constructing the vectors

In [None]:
# Frequency of every non-stop word over all corpus entries, followed by a
# look at the 25 most common ones.
word_fdist = nltk.FreqDist()
for entry in seasons_corpus.values():
    word_fdist.update(w for w in entry[0] if w not in stop_list)
word_fdist.most_common(25)

In [None]:
# Reduced vocabulary: the 50 most common words of word_fdist.
# most_common() yields (word, count) pairs, so the stop-list test has to look
# at the word itself. The original compared the whole tuple against
# stop_list, which is always "not in" and therefore filtered nothing.
new_vocab = [word for word, _count in word_fdist.most_common(50) if word not in stop_list]

In [None]:
# compute the document vector for each document
doc_vectors = {}
for fname in seasons_corpus.keys():
    doc_vectors[fname] = norm_vec(np.array([seasons_corpus[fname][0].count(word) for word in new_vocab]))

In [None]:
# Compute vectors for the comparison documents
comparison_vectors = {}
for fname in comparison_dict.keys():
    comparison_vectors[fname] = norm_vec(np.array([comparison_dict[fname].count(word) for word in new_vocab]))

In [None]:
# Re-label every document against the current comparison vectors, then score.
student_codes = {
    name: max_from_dict(compare_to_compvecs(name)) for name in doc_vectors
}
compute_accuracy()

In [None]:
# Rebuild the confusion matrix for the current student_codes, using only
# documents whose human code is not "none".
gold_list = []
test_list = []
for name in student_codes.keys():
    if seasons_corpus[name][1] != "none":
        gold_list += [seasons_corpus[name][1]]
        test_list += [student_codes[name]]
cm = nltk.ConfusionMatrix(gold_list, test_list)
# The bare `cm` expression that used to sit here was not the last statement
# of the cell, so it displayed nothing; print(cm) is what produces the output.
print(cm)

### Other variants: Use different weight factors when constructing the vectors

#### A weight factor function will commonly use these different quantities in some combination

* `tf = term frequency` (number of times the term appears in the present document)
* `df = document frequency` (number of documents in which the term appears)
* `cf = corpus frequency` (total number of times the term appears in the entire corpus)
* `N = number of documents`

In [None]:
def tf(tf, df, cf, N):
    """Raw term-frequency weight: return ``tf`` unchanged.

    ``df``, ``cf`` and ``N`` are accepted so that every weight function has
    the same signature; they are not used here.
    """
    return tf

def logtf(tf, df, cf, N):
    """Log-scaled term-frequency weight.

    Returns ``0`` when ``tf`` is zero, otherwise ``1 + ln(tf)``.
    ``df``, ``cf`` and ``N`` are accepted for signature compatibility and
    are not used.
    """
    return 0 if tf == 0 else (1 + np.log(tf))

def onehot(tf, df, cf, N):
    """Presence/absence weight: ``0`` when ``tf`` is zero, otherwise ``1``.

    ``df``, ``cf`` and ``N`` are accepted for signature compatibility and
    are not used.
    """
    return 0 if tf == 0 else 1

def tfidf(tf, df, cf, N):
    """TF-IDF weight: ``(1 + ln(tf)) * ln(N / df)``.

    Parameters
    ----------
    tf : number
        Term frequency in the present document.
    df : number
        Number of documents in which the term appears.
    cf : number
        Corpus frequency; accepted for signature compatibility, not used.
    N : number
        Number of documents.

    Returns
    -------
    number
        ``0`` when ``tf`` is zero or when ``df`` is zero (the inverse
        document frequency is undefined there and the plain division would
        raise ZeroDivisionError); otherwise the weight above.
    """
    if tf == 0 or df == 0:
        return 0
    # float() forces true division even where `/` on two ints would truncate.
    return (1 + np.log(tf)) * np.log(float(N) / df)

#### We need to construct the document frequency distribution since we don't have that yet.

In [None]:
# Document frequency: each corpus entry contributes at most one count per
# non-stop word, because its words are de-duplicated before the update.
doc_fdist = nltk.FreqDist()
for entry in seasons_corpus.values():
    doc_fdist.update({w for w in entry[0] if w not in stop_list})

#### A little function to simplify the task of constructing vectors with different weight factors

In [None]:
def compute_vector(words, vocab, df, cf, N, weight_function):
    """Build a weighted vector for one document and scale it with norm_vec.

    For every word ``w`` in ``vocab`` the component is
    ``weight_function(words.count(w), df[w], cf[w], N)``; the components
    keep the iteration order of ``vocab``.

    Parameters
    ----------
    words : object with a ``count`` method
        The document's content.
    vocab : iterable
        Words that define the vector's dimensions.
    df, cf : mappings
        Indexed by word; passed through to ``weight_function``.
    N : number
        Passed through to ``weight_function``.
    weight_function : callable
        Called as ``weight_function(tf, df, cf, N)``.
    """
    weights = [
        weight_function(words.count(w), df[w], cf[w], N)
        for w in vocab
    ]
    return norm_vec(np.array(weights))

In [None]:
# compute the document vector for each document
doc_vectors = {}
N = len(seasons_corpus.keys())
wf = tf
for fname in seasons_corpus.keys():
    doc_vectors[fname] = compute_vector(seasons_corpus[fname][0], new_vocab, doc_fdist, word_fdist, N, wf)
# Compute vectors for the comparison documents
comparison_vectors = {}
for fname in comparison_dict.keys():
    comparison_vectors[fname] = compute_vector(comparison_dict[fname], new_vocab, doc_fdist, word_fdist, N, wf)

In [None]:
# Re-label every document using the weighted vectors, then score the result.
student_codes = {
    name: max_from_dict(compare_to_compvecs(name)) for name in doc_vectors
}
compute_accuracy()