# Setup

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
random.seed(1)
import re
import nltk
import semdis


# Set working directory.
#os.chdir('/content/gdrive/My Drive/personal/CS224U/project')
# os.listdir() # Uncomment to sanity check that you're in the right directory.

In [5]:
# Takes a dataframe and adds a 'wordcount' column (modifies original df)
# Requirement: df has existing 'text' column
# Returns modified df with 'wordcount' column added
def add_wordcount_col(df):
  assert 'text' in df
  df['wordcount'] = df['text'].str.split().str.len()

In [6]:
# Takes a dataframe and returns a new dataframe containing only rows where wordcount < max_words
# Requirement: input df has existing 'wordcount' column
def restrict_by_wordcount(df, max_words):
  assert 'wordcount' in df
  df_restricted = df.loc[df['wordcount'] <= max_words]
  return df_restricted

In [9]:
import utils  

glove_dict = utils.glove2dict('glove.6B/glove.6B.300d.txt')

## Functions that Calculate Semantic Distance

In [37]:
import string
import scipy.spatial.distance

# Takes two sequence and create a composite vector for each sequence.
# Return the cosine similarity between the two vectors.

def get_distance_between_texts(text1, text2, VSM = glove_dict,
                               multiply = True,
                               tokenizer = nltk.word_tokenize,
                               remove_stopwords = True,
                               remove_punct = True):
  
  v1 = get_text_vector(text1, VSM, multiply, tokenizer, remove_stopwords),
  v2 = get_text_vector(text2, VSM, multiply, tokenizer, remove_stopwords)

  return scipy.spatial.distance.cosine(v1, v2)

# Takes a sequence and a VSM. Return a composite vector that represents the sequence
# Extract word vectors from the VSM and combine them with either multiplication or addition (default is multiplication)
# Set multiply = False to use addition
# Default tokenizer is nltk word tokenizer. 
# Remove stopwords and punctuations by default.

## TODO: Trying weighted sum (e.g., IDF weighting)
def get_text_vector(text, 
                    VSM, # the VSM (a dictionary) used to derive word vectors
                    multiply = True,
                    tokenizer = nltk.word_tokenize,
                    remove_stopwords = True,
                    remove_punct = True):
  
  if remove_punct:
    text = text.translate(str.maketrans('','',string.punctuation))
  
  
  words = tokenizer(text)

  if remove_stopwords:
    stop_words = nltk.corpus.stopwords.words('english')
    words = [w for w in words if not w in stop_words] 

    
  
  words = [w for w in words if w in VSM] 

  # Uncomment this for sanity check
  #print(len(words))
  if len(words) > 0:
    v = VSM[words[0]]
    for word in words[1:]:
      if multiply:
        v= np.multiply(v, VSM[word])
      else:
        v = v+VSM[word]
  else:
    # If no word is found in the dictionary, return a random vector
    v = np.random.rand(300)

  return v

#test the function
get_distance_between_texts("test, text", "test text", glove_dict)


0.0

In [72]:
imp.reload(semdis)

# take a sentence and return
def distances_within_text(text,
                          tokenizer = nltk.word_tokenize,
                         remove_stopwords =True,
                             remove_punct = True):
    if remove_punct:
        text = text.translate(str.maketrans('','',string.punctuation))
    words = tokenizer(text)
    if remove_stopwords:
        stop_words = nltk.corpus.stopwords.words('english')
        words = [w for w in words if not w in stop_words] 
    n = len(words)
    distances = []
    for i in range(n):
        for j in range(i+1, n):
            if words[i] in glove_dict and words[j] in glove_dict:
                distances.append(semdis.get_word_cosine(words[i], words[j], vsm = glove_dict))
        else:
            continue
    return distances

# test the function
test_text= 'test, test. score'
print(distances_within_sentence(test_text))

# Take a sequence and a pooling function (e.g., max, min, average)
# Calculate the semantic distances between all word pairs and pool them using the given function.
def pool_distances_within_text(text, pool = np.average, **kwarg):
    distances = distances_within_text(text, **kwarg)
    if len(distances) == 0:
        return None
    else:
        return pool(distances)

# Test the function
print(pool_distances_within_text(test_text))
print(pool_distances_within_text(test_text, pool = max))
print(pool_distances_within_text(test_text, pool = max, remove_punct = False))

# Two-step pooling 
# Pool distances within each sentence at first using the function given for the "sentence_pool" argument
# Pool these results using the function provided for the "pool" argument
def pool_distances_split_sentence(text, pool = np.average, sentence_pool = np.average, **kwarg):
    sentences = nltk.sent_tokenize(text)
    sentence_pooled = [pool_distances_within_text(sentence, pool = sentence_pool, **kwarg) for sentence in sentences]
    
    # When there is only one word in a sentence, distances cannot be calculated and will return nan
    # Drop nan using the following line
    sentence_pooled = [x for x in sentence_pooled if x is not None]
    return pool(sentence_pooled)

[0.0, 0.5940629763576282, 0.5940629763576282]
0.3960419842384188
0.5940629763576282
0.7448144358708455


# Evaluations of different features

In [67]:
from sklearn.model_selection import ParameterGrid

# This function can be used to examine whether a certain feature is correlated with a construct
# It takes a datafree with a column 'text' and a column 'label'.
# The argument "apply_to_column" regulates whether the function should be apply to the whole ['text'] column 
# or each element in the column, the default is element-wise operation.
def correlate_feature_with_creativity(df, function, apply_to_column = False, spearman = False, length = False, **kwarg):
    if apply_to_column:
        feature = function(df['text'], **kwarg)
    else:
        feature = [function(x, **kwarg) for x in df['text']]
    
    if length:
        add_wordcount_col(df)
        y = df['wordcount']
    else:
        y = df['label']
        
    if spearman:
        return scipy.stats.spearmanr(feature, y)
    else:
        return scipy.stats.pearsonr(feature, y)

## This is not working
# def correlate_feature_gridsearch(param_grid, **kwarg):
#     grid = ParameterGrid(param_grid)
#     results = {}

#     for params in grid:
#         print(params)
#         # Index of the model, represents the parameters
#         index = '; '.join(x + '_' + str(y) for x, y in params.items())
        
#         result = correlate_feature_with_creativity(params, **kwarg)
#         print(result)
#         results[index] = result
    
#     return results

In [73]:
pool_dict = {'mean': np.average, 'max': max, 'min': min}

for pool_func in pool_dict:
    print("pool_func: " + pool_func)
    result = correlate_feature_with_creativity(df = car_df,
                                      function = pool_distances_within_text,
                                      apply_to_column = False,
                                      spearman = True,
                                      length = False,  
                                     pool = pool_dict[pool_func])
    print(result)

pool_func: <function average at 0x7fed06264b80>


KeyboardInterrupt: 

Maximum word cosine is positively correlated with creativity but also with length

Creativity-maximum: SpearmanrResult(correlation=0.37452160762745074, pvalue=5.111302023283955e-21)

Minimum word cosine is negatively correlated with creativity but also with length

Creativity-minmum: SpearmanrResult(correlation=-0.37735349122002054, pvalue=2.4535790257204983e-21)

In [77]:
pool_dict = {'mean': np.average, 'max': max, 'min': min}
for pool_func in pool_dict:
    for sentence_pool in pool_dict:
        print("pool_func: " + pool_func + "; sentence_pool: " + sentence_pool)
        result = correlate_feature_with_creativity(df = car_df,
                                          function = pool_distances_split_sentence,
                                          apply_to_column = False,
                                          spearman = True,
                                          length = True,  
                                         pool = pool_dict[pool_func],
                                          sentence_pool = pool_dict[sentence_pool])
        print(result)

pool_func: mean; sentence_pool: mean
SpearmanrResult(correlation=-0.2403277759735626, pvalue=3.592244833240831e-09)
pool_func: mean; sentence_pool: max
SpearmanrResult(correlation=-0.0038991285913811246, pvalue=0.9248327146046069)
pool_func: mean; sentence_pool: min
SpearmanrResult(correlation=-0.22093754052073925, pvalue=6.19485844271774e-08)
pool_func: max; sentence_pool: mean
SpearmanrResult(correlation=0.1723037472178696, pvalue=2.660790908762486e-05)
pool_func: max; sentence_pool: max
SpearmanrResult(correlation=0.42317239849931104, pvalue=6.010422081881892e-27)
pool_func: max; sentence_pool: min
SpearmanrResult(correlation=0.22874809208303887, pvalue=2.0280939769626615e-08)
pool_func: min; sentence_pool: mean
SpearmanrResult(correlation=-0.4774720620380414, pvalue=8.158835269475223e-35)
pool_func: min; sentence_pool: max
SpearmanrResult(correlation=-0.3832613528225597, pvalue=5.184492974439368e-22)
pool_func: min; sentence_pool: min
SpearmanrResult(correlation=-0.5166922258004382

pool_func: mean; sentence_pool: mean

SpearmanrResult(correlation=-0.10681453757761777, pvalue=0.009541355224589723)

pool_func: mean; sentence_pool: max

SpearmanrResult(correlation=0.02830077354068258, pvalue=0.4933854586159333)

pool_func: mean; sentence_pool: min

SpearmanrResult(correlation=-0.13007923968097182, pvalue=0.001572383066756145)

pool_func: max; sentence_pool: mean

SpearmanrResult(correlation=0.1748646004971523, pvalue=2.0069593419130286e-05)

pool_func: max; sentence_pool: max

SpearmanrResult(correlation=0.3096731969519826, pvalue=1.555880582819005e-14)

pool_func: max; sentence_pool: min

SpearmanrResult(correlation=0.18085354217737354, pvalue=1.0215051334244021e-05)

pool_func: min; sentence_pool: mean

SpearmanrResult(correlation=-0.28291090999942947, pvalue=2.7703965033980604e-12)

pool_func: min; sentence_pool: max

SpearmanrResult(correlation=-0.22320713937402628, pvalue=4.497115355524009e-08)

pool_func: min; sentence_pool: min

SpearmanrResult(correlation=-0.3248636747910821, pvalue=6.42564834544976e-16)

# document count based features

In [129]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def normalized_tfidf(df):
    vec = CountVectorizer(tokenizer= nltk.word_tokenize,
                          stop_words = {'english'})
    dtf = vec.fit_transform(car_df['text']).toarray()

    tfidf_vec = TfidfVectorizer(tokenizer= nltk.word_tokenize,
                          stop_words = {'english'},
                               use_idf=False)
    tfidf = tfidf_vec.fit_transform(car_df['text']).toarray()
    
    normed_tfidf= tfidf.sum(axis = 1)/dtf.sum(axis = 1)
    return normed_tfidf

In [131]:

scipy.stats.pearsonr(normed_tfidf, car_df['label'])
#scipy.stats.pearsonr(normed_tfidf, car_df['wordcount'])

(-0.6524408182006463, 1.363312275423059e-72)

In [100]:
from sklearn.preprocessing import normalize
mtx = np.array([[1,0,0],
              [1,1,0],
               [1,1,1]])
normalize(mtx, norm = 'l1', axis = 0)

array([[0.33333333, 0.        , 0.        ],
       [0.33333333, 0.5       , 0.        ],
       [0.33333333, 0.5       , 1.        ]])

# Test Dumas et al.

In [40]:
# A wrapper function that applies Dumas et al. method to the whole column of texts.
def get_distances_for_df(responses, prompt, VSM = glove_dict, multiply = True):
    return [get_distance_between_texts(prompt, x, VSM, multiply = multiply) for x in responses]

import dataset
fitness_df = dataset.get_data(1, 'Novelty_Combined', shuffle=True)
car_df = dataset.get_data(2, 'Novelty_Combined', shuffle=True)

# Measures based on Duma et al., 2020.
correlate_feature_with_creativity(df = fitness_df,
                                  function = get_distances_for_df,
                                  apply_to_column = True,
                                  spearman = True,
                                  multiply = True,
                                 prompt = "fitness equipment")


# The following two functions are equivalent
correlate_feature_with_creativity(df = car_df,
                                  function = get_distances_for_df,
                                  apply_to_column = True,
                                  spearman = True,
                                  multiply = True,
                                 prompt = "self-driving car")
correlate_feature_with_creativity(df = car_df,
                                  function = get_distance_between_texts,
                                  apply_to_column = False,
                                  spearman = True,
                                  multiply = True,
                                 text2 = "self-driving car")

SyntaxError: invalid syntax (<ipython-input-40-0d9b7e3f3e40>, line 19)

In [73]:
# Are word counts correlated with word counts?

feature = get_distances_for_responses(prompt = "fitness equipment",
                             responses = car_df['text'],
                             VSM = glove_dict, multiply = True)
add_wordcount_col(car_df)
scipy.stats.spearmanr(feature, car_df['wordcount'])

SpearmanrResult(correlation=0.1174380910023567, pvalue=0.004350859526027052)

Consistent with Beaty & Johnson's finding. Additive composition generates semantic distances that are negatively correlated with word counts; multiplicative composition generates semantic distances that are positively correlated with word counts.