In [None]:
# Colab-only setup: mount Google Drive at /content/gdrive so the notebook can
# read its data files from there.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)


Mounted at /content/gdrive


In [None]:

import os

# Set the working directory to the notebook folder on the mounted Drive so the
# relative 'data/...' paths used further down resolve against it.
os.chdir('/content/gdrive/My Drive/Colab Notebooks')
#os.listdir()


In [None]:
import pandas as pd
import numpy as np
import vsm
import utils
import sst
from collections import Counter, defaultdict

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy.stats import spearmanr, pearsonr
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from torch_rnn_classifier import TorchRNNClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD, SparsePCA
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from sklearn.model_selection import GridSearchCV

import random
random.seed(1)

import re

# Our dataset is an Excel workbook with multiple sheets.
# Each sheet includes the ideas from one sample along with ratings on several metrics:
# 3 constructs (creativity, usefulness, novelty) * 3 types of judges (experts, consumers, combined).
# The following function allows us to extract one specific type of label together with the ideas.
def extract_metric(study, metric, length = 400):
  """Load the ideas and one rating column for a single study.

  Parameters
  ----------
  study : int
      1-based study number; sheet `study - 1` of the workbook is read.
  metric : str
      Name of the rating column to use as the label.
  length : int
      Only ideas with strictly fewer than `length` whitespace-separated
      words are kept.

  Returns
  -------
  pd.DataFrame
      Columns 'sentence' and 'label'. Rows whose idea is not a string are
      dropped. The index of the sheet is kept as-is (not reset).

  """
  sheet = pd.read_excel("data/Idea Ratings_Berg_2019_OBHDP.xlsx", sheet_name=study - 1)
  pairs = sheet[["Final_Idea", metric]].rename(
    columns={'Final_Idea': 'sentence', metric: 'label'})

  # First drop non-text cells, so that .split() below is only applied to strings.
  is_text = [isinstance(idea, str) for idea in pairs['sentence']]
  pairs = pairs.iloc[is_text]

  short_enough = [len(idea.split()) < length for idea in pairs['sentence']]
  return pairs.iloc[short_enough]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# def get_wordnet_edges():
#     edges = defaultdict(set)
#     for ss in wn.all_synsets():
#         lem_names = {lem.name() for lem in ss.lemmas()}
#         for lem in lem_names:
#             edges[lem] |= lem_names
#     return edges
# wn_edges = get_wordnet_edges()

In [None]:
# def bigrams_phi(text):
#     """
#     The basis for a bigrams feature function. Downcases all tokens.

#     Parameters
#     ----------
#     text : str
#         The example to represent.

#     Returns
#     -------
#     defaultdict
#         A map from tuples to their counts in `text`.

#     """
#     toks = text.lower().split()
#     left = [utils.START_SYMBOL] + toks
#     right = toks + [utils.END_SYMBOL]
#     grams = list(zip(left, right))
#     return Counter(grams)
# def unigrams_phi(text):
#   """
#   The basis for a unigrams feature function. Downcases all tokens.

#   Parameters
#   ----------
#   text : str
#       The example to represent.

#   Returns
#   -------
#   defaultdict
#       A map from strings to their counts in `tree`. (Counter maps a
#       list to a dict of counts of the elements in that list.)

#   """
#   return Counter(text.lower().split())

In [None]:
def get_token_counts(df):
  """Count lower-cased, whitespace-separated tokens for every sentence.

  The per-sentence counting is done inline (`Counter(text.lower().split())`)
  because the `unigrams_phi` helper this function used to call only exists
  in a commented-out cell, which made every call fail with NameError.

  Parameters
  ----------
  df : pd.DataFrame
      Frame with a 'sentence' column of strings.

  Returns
  -------
  list of Counter
      One Counter per row, in row order.

  """
  # Iterate over the values rather than df['sentence'][i]: label-based lookup
  # with range(nrow) fails when the index is not exactly 0..n-1.
  return [Counter(text.lower().split()) for text in df['sentence']]
  
def get_length(df):
  """Compute simple length features for every sentence.

  Parameters
  ----------
  df : pd.DataFrame
      Frame with a 'sentence' column of strings.

  Returns
  -------
  np.ndarray
      One row per sentence with three columns, in this order:
      number of whitespace-separated words, number of characters, and
      characters per word. A sentence with no words (empty or
      whitespace-only) gets 0.0 for characters per word instead of raising
      ZeroDivisionError.

  """
  rows = []
  # Iterate over the values rather than df['sentence'][i]: label-based lookup
  # with range(nrow) fails when the index is not exactly 0..n-1.
  for text in df['sentence']:
    n_words = len(text.split())
    n_chars = len(text)
    chars_per_word = n_chars / n_words if n_words else 0.0
    rows.append((n_words, n_chars, chars_per_word))

  df_len = pd.DataFrame(rows, columns=['lengths', 'characters', 'average_len'])
  return df_len.to_numpy()

def get_distance(word1, word2):
  """Return `vsm.cosine` of the GloVe vectors of two words.

  Both words are looked up in the module-level `glove_dict`; a word that is
  not a key there raises KeyError, so callers should check membership first.

  """
  # No trailing comma here: `glove_dict[word1],` would wrap the vector in a
  # 1-tuple and hand `vsm.cosine` a 2-D input instead of a vector.
  v1 = glove_dict[word1]
  v2 = glove_dict[word2]

  return vsm.cosine(v1, v2)

def get_vector_distance(text):
  """Return the pairwise GloVe distances between the content words of `text`.

  The text is tokenized with `nltk.word_tokenize`; English stop words and
  tokens that are not keys of `glove_dict` are dropped. `get_distance` is then
  evaluated for every pair of remaining token positions (i < j), in the same
  order as a nested loop over positions.

  Parameters
  ----------
  text : str
      The example to represent.

  Returns
  -------
  list
      One distance per token pair; empty when fewer than two tokens remain.

  """
  # A set gives constant-time membership tests; the raw stop-word list would
  # be scanned once per token.
  stop_words = set(nltk.corpus.stopwords.words('english'))

  # NOTE(review): tokens are compared as-is (no lower-casing) against both the
  # stop-word list and glove_dict -- confirm that this is intended.
  in_vocab = [w for w in nltk.word_tokenize(text)
              if w not in stop_words and w in glove_dict]

  # Filtering once up front avoids re-checking glove_dict for every pair.
  return [get_distance(first, second)
          for pos, first in enumerate(in_vocab)
          for second in in_vocab[pos + 1:]]

def get_df_distance(df):
  """Summarize the within-text word distances of every sentence.

  Parameters
  ----------
  df : pd.DataFrame
      Frame with a 'sentence' column of strings.

  Returns
  -------
  np.ndarray
      One row per sentence with three columns, in this order: average,
      maximum and minimum of the distances returned by `get_vector_distance`.
      A sentence that yields no distances gets NaN in all three columns.

  """
  def _summarize(distances):
    # np.max / np.min raise ValueError on an empty sequence, so a text with no
    # word pairs is reported as missing (NaN) instead of aborting the whole frame.
    if len(distances) == 0:
      return (np.nan, np.nan, np.nan)
    return (np.average(distances), np.max(distances), np.min(distances))

  rows = [_summarize(get_vector_distance(text)) for text in df['sentence']]
  df_dis = pd.DataFrame(rows, columns=['average', 'max', 'min'])
  return df_dis.to_numpy()


# def get_pooled_insentence_distance(text, pool = np.average):
#   sentences = nltk.sent_tokenize(text)
#   distances = [get_pooled_distance(sentence, pool) for sentence in sentences]
#   return(np.average(distances))

def featurize(df_train, df_test, count = False, tfidf = False, length = True, distance = False):
  """Build aligned train/test feature matrices from the 'sentence' column.

  Parameters
  ----------
  df_train, df_test : pd.DataFrame
      Frames with a 'sentence' column (text) and a 'label' column.
  count : bool
      Add token counts from a CountVectorizer fitted on `df_train` only.
  tfidf : bool
      Add TF-IDF weights from a TfidfVectorizer fitted on `df_train` only.
  length : bool
      Add the columns produced by `get_length`.
  distance : bool
      Add the columns produced by `get_df_distance`.

  Returns
  -------
  tuple
      `(X_train, X_test, y_train, y_test)`. The X arrays hold the enabled
      feature groups stacked column-wise in the order count, distance,
      length, tfidf; the y values are the 'label' columns of the inputs.

  Raises
  ------
  ValueError
      If every feature group is disabled (there would be nothing to stack).

  """
  if not (count or tfidf or length or distance):
    raise ValueError(
      "featurize needs at least one of count, tfidf, length, distance enabled")

  features_train = []
  features_test = []

  # Counts
  if count:
    count_vec = CountVectorizer(tokenizer=nltk.word_tokenize)
    features_train.append(count_vec.fit_transform(df_train['sentence']).toarray())
    # transform only: the test set must reuse the training vocabulary.
    features_test.append(count_vec.transform(df_test['sentence']).toarray())

  # Within-text word distances
  if distance:
    features_train.append(get_df_distance(df_train))
    features_test.append(get_df_distance(df_test))

  # Length
  if length:
    features_train.append(get_length(df_train))
    features_test.append(get_length(df_test))

  # TF-IDF
  if tfidf:
    # TfidfVectorizer takes no `sparse` keyword (passing one raises TypeError);
    # the sparse result is densified with .toarray() instead. A separate local
    # name keeps the boolean `tfidf` flag from being overwritten.
    tfidf_vec = TfidfVectorizer()
    features_train.append(tfidf_vec.fit_transform(df_train['sentence']).toarray())
    features_test.append(tfidf_vec.transform(df_test['sentence']).toarray())

  X_train = np.column_stack(features_train)
  X_test = np.column_stack(features_test)

  y_train = df_train['label']
  y_test = df_test['label']

  return (X_train, X_test, y_train, y_test)



In [None]:
# Load the GloVe file into the module-level `glove_dict`, which the distance
# helpers use as a mapping (`word in glove_dict`, `glove_dict[word]`).
# Presumably token -> 300-d vector, judging by the file name -- confirm
# against utils.glove2dict.
glove_dict = utils.glove2dict(
    os.path.join('data', 'glove.6B', 'glove.6B.300d.txt'))

In [None]:
# crt = pd.concat([extract_metric(1,"Creativity_Combined"),
#           extract_metric(2,"Creativity_Combined"),
#           extract_metric(3,"Creativity_Combined")]).reset_index()
crt = extract_metric(2,"Usefulness_Combined")
n = crt.shape[0]

# Assign every row a random bucket 1..6: buckets 1-4 -> train, 5 -> test,
# 6 -> sealed hold-out.
# * randint's upper bound is exclusive, so the previous randint(1, 6, n) never
#   produced a value above 5 and `crt_sealed` was always empty.
# * A dedicated seeded RandomState makes the split reproducible; the
#   `random.seed(1)` in the import cell only seeds Python's `random` module,
#   not NumPy.
index = np.random.RandomState(1).randint(1, 7, n)

crt_train = crt[index < 5].reset_index()
crt_test = crt[index == 5].reset_index()
crt_sealed = crt[index > 5].reset_index()


In [None]:
# Compute the distance features in this cell: with the assignment commented
# out, the cell only worked when `distances` was left over from an earlier
# kernel state.
distances = get_df_distance(crt_test)
#pearsonr(distances[:200], crt_train['label'][range(200)])
# Column 0 of `distances` is the average within-text distance; column 0 of
# `lengs` is the word count.
print(spearmanr(distances[:,0], crt_test['label']))
lengs = get_length(crt_test)
print(pearsonr(distances[:, 0], lengs[:,0]))
# print(pearsonr(distances, lengs[:,1]))
# print(pearsonr(distances, lengs[:,2]))

SpearmanrResult(correlation=-0.3837380601595334, pvalue=2.7211707494553795e-05)
(-0.1420501927276636, 0.13339522045882654)


In [None]:
# Text -> token counts -> truncated SVD -> SGD regressor, tuned by grid search
# over the vectorizer and regressor settings below.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('svd', TruncatedSVD()),
    ('clf', linear_model.SGDRegressor()),
    # ('MPL', MLPRegressor()),
])

# Keys follow the `<step name>__<parameter>` convention of the pipeline above.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    clf__alpha=(1, 0.1, 0.001, 0.0001, 0),
    clf__penalty=('l1', 'l2', 'elasticnet'),
)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(crt_train['sentence'], crt_train['label'])
#pipe.score(crt_test['sentence'], crt_test['label'])
#pearsonr(pipe.predict(crt_test['sentence']), crt_test['label'])

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  1.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        p

In [None]:
# Same pipeline shape as the grid search, with an elastic-net penalty on the
# SGD regressor; fit on the training rows and score on the test rows.
pipe = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('svd', TruncatedSVD()),
    ('clf', linear_model.SGDRegressor(penalty='elasticnet')),
    # ('MPL', MLPRegressor()),
])
pipe.fit(crt_train['sentence'], crt_train['label'])

# Last expression of the cell, so its value is shown as the cell output.
pipe.score(crt_test['sentence'], crt_test['label'])

0.20330349194100927

In [None]:
# Cross-study evaluation: train on studies 1 and 2, test on study 3.
crt_train = pd.concat(
    [extract_metric(study, "Creativity_Combined") for study in (1, 2)]
).reset_index()
crt_test = extract_metric(3, "Creativity_Combined").reset_index()


In [None]:
# Count features only, built from the current crt_train / crt_test frames.
X_train, X_test, y_train, y_test = featurize(
    crt_train, crt_test, count=True, length=False, distance=False)


def fit_lasso_regressor(X, y):
    """Fit a Lasso regression with default settings on (X, y) and return it."""
    return linear_model.Lasso().fit(X, y)


mod = fit_lasso_regressor(X_train, y_train)
pred = mod.predict(X_test)

# Pearson correlation between the test labels and the predictions.
print(pearsonr(crt_test['label'], pred))

(0.446630807574013, 1.656954503170394e-16)


In [None]:
# Try to combine the output of two models
X_train, X_test, y_train, y_test = featurize(crt_train, crt_test, count=True, length=False)

def fit_lasso_regressor(X, y):
 
    clf = linear_model.Lasso()
    clf.fit(X, y)
    return(clf)
mod = fit_lasso_regressor(X_train, y_train)
train1 =  mod.predict(X_train)
pred1 = mod.predict(X_test)

X_train, X_test, y_train, y_test = featurize(crt_train, crt_test, count=False, length=True)
mod = fit_lasso_regressor(X_train, y_train)
train2 = mod.predict(X_train)
pred2 = mod.predict(X_test)

lm = linear_model.LinearRegression()

lm.fit(np.column_stack([train1, train2]), y_train)
pred = lm.predict(np.column_stack([pred1, pred2]))


print(spearmanr(crt_test['label'], pred))
print(pearsonr(crt_test['label'], pred1))
print(pearsonr(crt_test['label'], pred2))

print(pearsonr(crt_test['label'], pred))

SpearmanrResult(correlation=0.6254302407310216, pvalue=7.74529646063656e-35)
(nan, nan)
(0.5910400095563775, 2.132523068771589e-30)
(0.5910400095563774, 2.1325230687716497e-30)




In [None]:
# NOTE(review): SST_HOME is assigned here but not used in the visible cells.
SST_HOME = os.path.join('data', 'sentiment')
# Build a dataset from the training frame with the project-local `sst` helper.
# NOTE(review): `unigrams_phi` only appears inside a commented-out cell earlier
# in this notebook, so this call presumably raises NameError on a fresh kernel
# -- confirm. The arguments `sst.build_dataset` expects are not visible here;
# verify that a DataFrame is an acceptable first argument.
train_dataset = sst.build_dataset(
    crt_train,
    phi=unigrams_phi,
    vectorizer=None)
# NOTE(review): no module-level assignment of `vec` or `train_feats` is visible
# in this notebook -- confirm where they are meant to come from.
vec.fit_transform(train_feats)

In [None]:
# Display the 'y' entry of the dataset built in the previous cell.
train_dataset['y']