# Setup

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
random.seed(1)
import re
import nltk
import semdis


# Set working directory.
#os.chdir('/content/gdrive/My Drive/personal/CS224U/project')
# os.listdir() # Uncomment to sanity check that you're in the right directory.

# Word count and score distributions

In [5]:
# Takes a dataframe and adds a 'wordcount' column (modifies original df)
# Requirement: df has existing 'text' column
# Returns modified df with 'wordcount' column added
def add_wordcount_col(df):
  assert 'text' in df
  df['wordcount'] = df['text'].str.split().str.len()

In [6]:
# Takes a dataframe and returns a new dataframe containing only rows where wordcount < max_words
# Requirement: input df has existing 'wordcount' column
def restrict_by_wordcount(df, max_words):
  assert 'wordcount' in df
  df_restricted = df.loc[df['wordcount'] <= max_words]
  return df_restricted

In [9]:
import utils  

glove_dict = utils.glove2dict('glove.6B/glove.6B.300d.txt')

## Calculate Semantic Distance

In [37]:
import string
import scipy.spatial.distance

# Takes two sequence and create a composite vector for each sequence.
# Return the cosine similarity between the two vectors.

def get_distance_between_texts(text1, text2, VSM = glove_dict,
                               multiply = True,
                               tokenizer = nltk.word_tokenize,
                               remove_stopwords = True,
                               remove_punct = True):
  
  v1 = get_text_vector(text1, VSM, multiply, tokenizer, remove_stopwords),
  v2 = get_text_vector(text2, VSM, multiply, tokenizer, remove_stopwords)

  return scipy.spatial.distance.cosine(v1, v2)

# Takes a sequence and a VSM. Return a composite vector that represents the sequence
# Extract word vectors from the VSM and combine them with either multiplication or addition (default is multiplication)
# Set multiply = False to use addition
# Default tokenizer is nltk word tokenizer. 
# Remove stopwords and punctuations by default.

## TODO: Trying weighted sum (e.g., IDF weighting)
def get_text_vector(text, 
                    VSM, # the VSM (a dictionary) used to derive word vectors
                    multiply = True,
                    tokenizer = nltk.word_tokenize,
                    remove_stopwords = True,
                    remove_punct = True):
  
  if remove_punct:
    text = text.translate(str.maketrans('','',string.punctuation))
  
  
  words = tokenizer(text)

  if remove_stopwords:
    stop_words = nltk.corpus.stopwords.words('english')
    words = [w for w in words if not w in stop_words] 

    
  
  words = [w for w in words if w in VSM] 

  # Uncomment this for sanity check
  #print(len(words))
  if len(words) > 0:
    v = VSM[words[0]]
    for word in words[1:]:
      if multiply:
        v= np.multiply(v, VSM[word])
      else:
        v = v+VSM[word]
  else:
    # If no word is found in the dictionary, return a random vector
    v = np.random.rand(300)

  return v

#test the function
get_distance_between_texts("test, text", "test text", glove_dict)


0.0

In [32]:
imp.reload(semdis)

# take a sentence and return
def distances_within_text(text,
                          tokenizer = nltk.word_tokenize,
                         remove_stopwords =True,
                             remove_punct = True):
    if remove_punct:
        text = text.translate(str.maketrans('','',string.punctuation))
    words = tokenizer(text)
    if remove_stopwords:
        stop_words = nltk.corpus.stopwords.words('english')
        words = [w for w in words if not w in stop_words] 
    n = len(words)
    distances = []
    for i in range(n):
        for j in range(i+1, n):
            if words[i] in glove_dict and words[j] in glove_dict:
                distances.append(semdis.get_word_cosine(words[i], words[j], vsm = glove_dict))
        else:
            continue
    return distances

# test the function
test_text= 'test, test. score'
print(distances_within_sentence(test_text))

# Take a sequence and a pooling function (e.g., max, min, average)
# Calculate the semantic distances between all word pairs and pool them using the given function.
def pool_distances_within_text(text, pool = np.average, **kwarg):
    distances = distances_within_text(text, **kwarg)
    return pool(distances)

# Test the function
print(pool_distances_within_text(test_text))
print(pool_distances_within_text(test_text, pool = max))
print(pool_distances_within_text(test_text, pool = max, remove_punct = False))

# Two-step pooling 
# Pool distances within each sentence at first using the function given for the "sentence_pool" argument
# Pool these results using the function provided for the "pool" argument
def pool_distances_split_sentence(text, pool = np.average, sentence_pool = np.average, **kwarg):
    sentences = nltk.sent_tokenize(text)
    sentence_pooled = [pool_distances_within_text(sentence, pool = sentence_pool, **kwarg) for sentence in sentences]
    
    # When there is only one word in a sentence, distances cannot be calculated and will return nan
    # Drop nan using the following line
    sentence_pooled = [x for x in sentence_pooled if np.isnan(x) == False]
    return pool(sentence_pooled)

[0.0, 0.5940629763576282, 0.5940629763576282]
0.3960419842384188
0.5940629763576282
0.7448144358708455
0.0


## Use semantic distance to predict creativity


In [34]:

def get_distances_for_df(responses, prompt, VSM = glove_dict, multiply = True):
    return [get_distance_between_texts(prompt, x, VSM, multiply = multiply) for x in responses]

# This function can be used to examine whether a certain feature is correlated with a construct
# It takes a datafree with a column 'text' and a column 'label'.
# The argument "apply_to_column" regulates whether the function should be apply to the whole ['text'] column 
# or each element in the column, the default is element-wise operation.
def correlate_feature_with_creativity(df, function, apply_to_column = False, spearman = False, **kwarg):
    if apply_to_column:
        feature = function(df['text'], **kwarg)
    else:
        feature = [function(x, **kwarg) for x in df['text']]

    
    if spearman:
        return scipy.stats.spearmanr(feature, df['label'])
    else:
        return scipy.stats.pearsonr(feature, df['label'])

In [40]:
import dataset
fitness_df = dataset.get_data(1, 'Novelty_Combined', shuffle=True)
car_df = dataset.get_data(2, 'Novelty_Combined', shuffle=True)

correlate_feature_with_creativity(df = fitness_df,
                                  function = get_distances_for_df,
                                  apply_to_column = True,
                                  spearman = True,
                                  multiply = True,
                                 prompt = "fitness equipment")


# The following two functions are equivalent
correlate_feature_with_creativity(df = car_df,
                                  function = get_distances_for_df,
                                  apply_to_column = True,
                                  spearman = True,
                                  multiply = True,
                                 prompt = "self-driving car")
correlate_feature_with_creativity(df = car_df,
                                  function = get_distance_between_texts,
                                  apply_to_column = False,
                                  spearman = True,
                                  multiply = True,
                                 text2 = "self-driving car")

SyntaxError: invalid syntax (<ipython-input-40-0d9b7e3f3e40>, line 19)

In [73]:
# Are word counts correlated with word counts
feature = get_distances_for_responses(prompt = "fitness equipment",
                             responses = car_df['text'],
                             VSM = glove_dict, multiply = True)
add_wordcount_col(car_df)
scipy.stats.spearmanr(feature, car_df['wordcount'])

SpearmanrResult(correlation=0.1174380910023567, pvalue=0.004350859526027052)

Consistent with Beaty & Johnson's finding. Additive composition generates semantic distances that are negatively correlated with word counts; multiplicative composition generates semantic distances that are positively correlated with word counts.