# Setup

In [8]:
# Mount Google Drive at /content/gdrive so files kept in Drive are reachable
# from this Colab runtime (the cell output below confirms the mount point).
# force_remount=True is passed on every run; presumably this lets the cell be
# re-executed when Drive is already mounted -- confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [11]:
# Standard library.
import os
import random
import re

# Third-party.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Seed Python's built-in RNG so anything drawn from `random` is repeatable.
random.seed(1)

# Work from the project folder on the mounted Drive.
os.chdir('/content/gdrive/My Drive/personal/CS224U/project')
# os.listdir() # Uncomment to sanity check that you're in the right directory.

# Word count and score distributions

In [None]:
def add_wordcount_col(df):
  """Add a 'wordcount' column to ``df`` in place and return the same frame.

  Each value is the number of whitespace-separated tokens in the row's 'text'
  entry (``str.split()`` with no separator, then the length of the result).

  Args:
    df: DataFrame with an existing 'text' column. It is modified in place.

  Returns:
    The same ``df`` object, now carrying the 'wordcount' column.

  Raises:
    AssertionError: if ``df`` has no 'text' column.
  """
  assert 'text' in df
  df['wordcount'] = df['text'].str.split().str.len()
  # Hand the frame back as the function's contract promises; callers that
  # ignore the return value and rely on the in-place update keep working.
  return df

In [None]:
def restrict_by_wordcount(df, max_words):
  """Return the rows of ``df`` whose 'wordcount' is at most ``max_words``.

  The bound is inclusive: rows with exactly ``max_words`` words are kept.

  Args:
    df: DataFrame with an existing 'wordcount' column.
    max_words: largest word count a row may have to be kept.

  Returns:
    A DataFrame holding only the qualifying rows, with their original index
    labels.

  Raises:
    AssertionError: if ``df`` has no 'wordcount' column.
  """
  assert 'wordcount' in df
  within_limit = df['wordcount'] <= max_words
  return df.loc[within_limit]

## Calculate Semantic Distance

In [None]:
import string
import scipy.spatial.distance

def get_distance_between_texts(text1, text2, VSM, multiply = True,
                               tokenizer = None,
                               remove_stopwords = True,
                               remove_punct = True):
  """Return the cosine distance between the composite vectors of two texts.

  Each text is turned into one vector by ``get_text_vector`` using the same
  options, and the two vectors are compared with
  ``scipy.spatial.distance.cosine``. That function returns a distance
  (1 - cosine similarity), so smaller values mean more similar texts.

  Args:
    text1: first text (a string).
    text2: second text (a string).
    VSM: the VSM (a dictionary) used to derive word vectors.
    multiply: combine word vectors by element-wise multiplication when True
      (default), by addition when False.
    tokenizer: callable mapping a string to a list of tokens. ``None`` (the
      default) means ``nltk.word_tokenize``.
    remove_stopwords: drop English stop words before combining.
    remove_punct: strip punctuation before tokenizing.

  Returns:
    The cosine distance between the two composite vectors.

  Raises:
    Any error raised by ``get_text_vector`` propagates unchanged.
  """
  if tokenizer is None:
    # Resolve the default lazily: using nltk.word_tokenize directly as the
    # default value would require nltk to be imported before this def runs.
    import nltk
    tokenizer = nltk.word_tokenize

  # No trailing comma after the first call: it would wrap v1 in a 1-tuple.
  # remove_punct is forwarded so the caller's choice is actually honoured.
  v1 = get_text_vector(text1, VSM, multiply, tokenizer, remove_stopwords,
                       remove_punct)
  v2 = get_text_vector(text2, VSM, multiply, tokenizer, remove_stopwords,
                       remove_punct)

  # scipy.spatial.distance is a module; the callable is its `cosine` function.
  return scipy.spatial.distance.cosine(v1, v2)

def get_text_vector(text,
                    VSM, # the VSM (a dictionary) used to derive word vectors
                    multiply = True,
                    tokenizer = None,
                    remove_stopwords = True,
                    remove_punct = True):
  """Return one composite vector representing ``text``.

  The text is optionally stripped of punctuation, tokenized, optionally
  filtered for English stop words, and restricted to tokens that are keys of
  ``VSM``. The vectors of the remaining tokens are then combined element-wise.

  Args:
    text: the text (a string) to embed.
    VSM: dictionary mapping a word to its vector.
    multiply: combine by element-wise multiplication when True (default),
      by element-wise addition when False.
    tokenizer: callable mapping a string to a list of tokens. ``None`` (the
      default) means ``nltk.word_tokenize``.
    remove_stopwords: drop tokens found in nltk's English stop-word list.
    remove_punct: remove every character of ``string.punctuation`` from the
      text before tokenizing.

  Returns:
    A new numpy array; it never aliases an entry of ``VSM``.

  Raises:
    IndexError: if no token of ``text`` is present in ``VSM`` after
      filtering, so there is nothing to combine.
  """
  if remove_punct:
    text = text.translate(str.maketrans('', '', string.punctuation))

  if tokenizer is None:
    # Resolve the default lazily: using nltk.word_tokenize directly as the
    # default value would require nltk to be imported before this def runs.
    import nltk
    tokenizer = nltk.word_tokenize
  words = tokenizer(text)

  if remove_stopwords:
    import nltk
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = [w for w in words if w not in stop_words]

  # Only tokens with a known vector can contribute.
  words = [w for w in words if w in VSM]

  # Uncomment this for sanity check
  #print(len(words))

  if not words:
    # Same exception type as the former bare words[0] lookup, but with a
    # message that says what actually went wrong.
    raise IndexError('no token of the text is in the VSM; '
                     'cannot build a composite vector')

  combine = np.multiply if multiply else np.add
  # Copy the first vector so that a single-token text does not hand the
  # caller the VSM's own array.
  v = np.array(VSM[words[0]])
  for word in words[1:]:
    v = combine(v, VSM[word])

  return v

In [19]:
# The course repo folder ("cs224u-kf") has a dash in its name, so it cannot be
# imported as a package. Temporarily switch the working directory into it so
# that `import utils` resolves, then always switch back to the project folder
# -- even if the import fails -- so later relative paths are never resolved
# against the wrong directory.
os.chdir('/content/gdrive/My Drive/personal/CS224U/cs224u-kf/')
try:
  import utils
finally:
  os.chdir('/content/gdrive/My Drive/personal/CS224U/project')

# Load the GloVe 6B 300d vectors from the course repo's data folder.
# Presumably glove2dict returns a dict mapping each word to its vector --
# confirm in utils.
# NOTE(review): this path spells the Drive root "MyDrive" while the chdir
# calls above use "My Drive"; left as-is, confirm both resolve on Colab.
glove_dict = utils.glove2dict('/content/gdrive/MyDrive/personal/CS224U/cs224u-kf/data/glove.6B/glove.6B.300d.txt')