In [1]:
from algorithms.statics import *
from algorithms.pkgs import CountVectorizer, MongoClient, re, defaultdict, stopwords, nltk
# Shared MongoDB connection used by the cells below.
client = MongoClient(DB_CONNECTION)
# A single document from the entries collection (not referenced again in the visible cells).
sample_review = client.course_evals.entries.find_one()
# ENGLISH STOPWORDS ONLY: ask the corpus reader for the English list by name
# instead of slicing the all-language list at hard-coded offsets, which
# silently selects the wrong words whenever the installed corpus changes.
negatives = list(stopwords.words('english'))
negatives.extend(CUSTOM_STOPWORDS)

In [2]:
def tokenize(raw_text):
    """Normalize free text into lower-cased, sentence-delimited tokens.

    The text is split into sentences with ``nltk.sent_tokenize``. Within each
    sentence every run of characters that is neither an ASCII letter nor
    whitespace is replaced by a single space, whitespace runs are collapsed,
    and the result is word-tokenized. Each sentence is emitted as
    ``' < word word ... > '`` and the sentences are concatenated.

    Args:
        raw_text: The text to tokenize.

    Returns:
        The concatenated, lower-cased sentence strings; ``''`` when no
        sentences are found.
    """
    # 'A-Z' (not 'A-z'): the latter range also admits '[', '\\', ']', '^', '_'
    # and '`', letting that punctuation leak into the tokens.
    non_letters = re.compile(r'[^a-zA-Z\s]+', re.UNICODE)
    whitespace_run = re.compile(r'\s+')
    pieces = []
    for sentence in nltk.sent_tokenize(raw_text):
        cleaned = non_letters.sub(' ', sentence)
        cleaned = whitespace_run.sub(' ', cleaned)
        pieces.append(' < ' + ' '.join(nltk.word_tokenize(cleaned)) + ' > ')
    return ''.join(pieces).lower()

In [3]:
# INSTRUCTOR NAMES
# Lower-cased pieces of every INSTRUCTOR_NAME field (split on ', '),
# excluding any piece that is itself in the stop-word list.
sub_names = {
    name_part.lower()
    for record in client.course_evals.entries.find()
    for name_part in record['INSTRUCTOR_NAME'].split(', ')
    if name_part.lower() not in negatives
}

In [4]:
def vectorize(entry):
    """Count the n-grams of one evaluation entry, ignoring instructor names.

    The entry's ``RESPONSE_TEXT`` is tokenized with ``tokenize``, every token
    that exactly equals a known instructor name part (``sub_names``) is
    dropped, and the remaining text is counted with a ``CountVectorizer``
    using the configured n-gram limits and the ``negatives`` stop words.

    Args:
        entry: Mapping that provides a ``'RESPONSE_TEXT'`` string.

    Returns:
        A tuple ``(counts, grams)`` of two parallel lists: the frequency of
        each n-gram and the n-gram strings themselves.

    Raises:
        ValueError: Propagated from ``fit_transform`` when no terms are left
            to count; the calling loop reports this as "stopwords only".
    """
    vectorized = CountVectorizer(ngram_range=(N_GRAM_LOWER_LIM, N_GRAM_UPPER_LIM), stop_words=negatives)
    tokens = tokenize(entry['RESPONSE_TEXT']).split()
    # REMOVE INSTRUCTOR NAMES
    # Drop whole tokens only. Substituting each name as a regex removed it
    # from inside longer words too and treated the name as an unescaped pattern.
    example = ' '.join(token for token in tokens if token not in sub_names)
    X = vectorized.fit_transform(raw_documents=[example])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and keep returning a plain list.
    if hasattr(vectorized, 'get_feature_names_out'):
        grams = vectorized.get_feature_names_out().tolist()
    else:
        grams = vectorized.get_feature_names()
    return (X.toarray()[0].tolist(), grams)

In [None]:
# course id -> {n-gram -> total frequency across that course's responses}
class_to_corpus = defaultdict(lambda: defaultdict(int))
counter = 0
for counter, entry in enumerate(client.course_evals.entries.find(), start=1):
    # Progress indicator: running number of the entry being processed.
    print(counter)
    try:
        frequencies, grams = vectorize(entry)
        for gram, freq in zip(grams, frequencies):
            class_to_corpus[entry['COURSE_UNIQUE_ID']][gram] += freq
    except ValueError:
        # Entries that raise ValueError are removed from the database.
        print('stopwords only ... deleting from db')
        client.course_evals.entries.delete_one({'_id': entry['_id']})

In [6]:
# Preview the first ten stored n-gram documents: course id, then its five
# most frequent grams.
# NOTE(review): this reads the derivatives.ngrams collection, which the
# "Save to DB" cell further down writes — confirm the intended run order.
stored_docs = client.course_evals.derivatives.ngrams.find()[:10]
for doc in stored_docs:
    print(doc['course_id'])
    ranked = sorted(
        doc['freq_to_gram_dict'].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print(ranked[:5])
print('num_classes: ', len(class_to_corpus))

W21-SURG-241-01/W21-ARTSTUDI-139-01
[('anatomy', 5), ('class', 5), ('facial', 4), ('classes', 4), ('drawing', 3)]
W21-COMPMED-200-01
[('presentation', 2), ('professor', 2), ('questions', 2), ('scientific', 2), ('skills', 1)]
W21-CHPR-200-01
[('research', 9), ('different', 6), ('guest', 4), ('prevention', 4), ('class', 3)]
W21-NENS-204-01
[('neurologist', 1), ('think', 1), ('strokes', 1), ('want', 1), ('topics', 1)]
W21-ORTHO-270-01
[('engineering', 2), ('research', 2), ('tissue', 2), ('advances', 1), ('cell', 1)]
W21-SURG-101-01
[('online', 25), ('class', 20), ('anatomy', 17), ('lab', 16), ('lectures', 16)]
W21-MED-221-01/W21-MED-121-01
[('research', 10), ('translational', 6), ('class', 4), ('medical', 3), ('recommend', 3)]
W21-PSYC-60N-01
[('class', 27), ('life', 25), ('course', 11), ('definitely', 8), ('happiness', 8)]
W21-BIOMEDIN-224-01/W21-GENE-224-01
[('pharmacogenomics', 4), ('understanding', 2), ('get', 2), ('analysis', 1), ('medicine', 1)]
W21-MED-73N-01
[('class', 10), ('scie

### Save to DB

In [None]:
# Replace the stored n-gram collection with one document per course.
ngrams_collection = client.course_evals.derivatives.ngrams
ngrams_collection.drop()
for course_id, gram_counts in class_to_corpus.items():
    ngrams_collection.insert_one({
        'course_id': course_id,
        'freq_to_gram_dict': gram_counts,
    })