### Humicroedit dataset creation

In [1]:
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import numpy as np
# import torch
# from torchvision import datasets
# from torch import nn, optim, autograd
import matplotlib.pyplot as plt
import pandas as pd

# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, scale
# from sklearn.datasets import make_multilabel_classification

import random
# from sklearn import svm
# from sklearn import linear_model
# from scipy.sparse import csr_matrix
# from sklearn.metrics import classification_report
# from sklearn.cross_decomposition import CCA
from sklearn.utils import shuffle

import gensim, logging
import ast
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.manifold import TSNE
# from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score
# from gensim.parsing.preprocessing import remove_stopwords
from cleantext import clean
# import matplotlib.cm as cm
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing
import re
from pathlib import Path
%matplotlib inline

np.random.seed(42)

### Read in data

*Note*: to run this notebook you would need to download:

1) Humicroedit data (from https://www.cs.rochester.edu/u/nhossain/humicroedit.html, Full Dataset Release)

2) glove.6B (from https://nlp.stanford.edu/projects/glove/) 

and place them in the relevant paths below (in ../../data)

In [2]:
train_df_task1 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-1/train.csv")
train_funline_df_task1 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-1/train_funlines.csv")
val_df_task1 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-1/dev.csv")
test_df_task1 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-1/test.csv")

train_df_task2 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-2/train.csv")
train_funline_df_task2 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-2/train_funlines.csv")
val_df_task2 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-2/dev.csv")
test_df_task2 = pd.read_csv("../../data/semeval-2020-task-7-dataset/subtask-2/test.csv")

### Helper functions

In [3]:
def task2_convert_to_task1(df):
    dataset1 = df[['original1', 'edit1', 'grades1', 'meanGrade1']]
    dataset2 = df[['original2', 'edit2', 'grades2', 'meanGrade2']]
    dataset2.rename(columns={"original2": "original1", "edit2": "edit1", "grades2": "grades1", "meanGrade2": "meanGrade1"}, inplace=True)
    combined = dataset1.append(dataset2)
    combined.rename(columns={"original1": "original", "edit1": "edit", "grades1": "grades", "meanGrade1": "meanGrade"}, inplace=True)
    return combined

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem.porter import PorterStemmer

## https://github.com/utkuozbulak/unsupervised-learning-document-clustering/blob/master/src/read_and_clean_documents.py
def clean_string(st):
    stemmer = PorterStemmer()
    items_to_clean = set(list(stopwords.words('english')) + ['\n','\n\n','\n\n\n','\n\n\n\n','ocroutput','',' '])
    # Items to clean
    regex_non_alphanumeric = re.compile('[^0-9a-zA-Z]')  # REGEX for non alphanumeric chars
    st = " ".join(regex_non_alphanumeric.sub(' ', st).split())  # Filter text, remove non alphanumeric chars
    st = st.lower()  # Lowercase the text
    st = stemmer.stem(st)  # Stem the text
    if len(st) < 3:  # If the length of item is lower than 3, remove item
        item = ''
    st = " ".join([elem for elem in st.split(" ") if elem not in items_to_clean])

    return st

def perform_edit(x):
    first_part = x['original'].split("<")[0]
    second_part = x['original'].split(">")[1]
    edit = x['edit']
    result = first_part + edit + second_part
#     return clean_string(result)
    return result

def replaced_word(x):
    return x[x.find("<")+len("<"):x.rfind("/>")]

def restructure_dataset(df):
    df['Z_raw'] = df['original'].apply(lambda x: x.replace("<","").replace("/>",""))
#     df['Z_raw'] = df['original'].apply(lambda x: clean_string(x.replace("<","").replace("/>","")))
    df['replaced'] = df['original'].apply(lambda x: replaced_word(x))
    df['W_raw'] = df['edit']
    df['X_raw'] = df.apply(lambda x: perform_edit(x), axis=1)
    df['Y'] = df['meanGrade']
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lgultchin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Structure dataset

In [35]:
total_df_task1 = train_df_task1.append(train_funline_df_task1).append(val_df_task1).append(test_df_task1)
assert len(total_df_task1) == \
len(train_df_task1) + len(train_funline_df_task1) + \
len(val_df_task1) + len(test_df_task1)

total_df_task2 = train_df_task2.append(train_funline_df_task2).append(val_df_task2).append(test_df_task2)
assert len(total_df_task2) == \
len(train_df_task2) + len(train_funline_df_task2) + \
len(val_df_task2) + len(test_df_task2)
total_df_task2 = task2_convert_to_task1(total_df_task2)

In [36]:
total_df = total_df_task1.append(total_df_task2)
print(len(total_df))
total_df.drop_duplicates(inplace=True)
print(len(total_df))

56651
41832


In [37]:
total_df

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530.0,France is ‘ hunting down its citizens who join...,twins,10000,0.2
1,13034.0,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6
2,8731.0,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0
3,76.0,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4
4,6164.0,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0
...,...,...,...,...,...
2946,,Zuckerberg sold nearly $ 500 million Facebook ...,emojis,32210,1.6
2949,,“ Fake <news/> ” or free speech : Is Google cr...,discs,0,0.0
2952,,“ Stop this bullshit ” : uncle of Pakistani gi...,startled,0,0.0
2955,,“ The fish rots from the <head/> ” : a histori...,toupee,32110,1.4


In [38]:
total_df = restructure_dataset(total_df)
total_df

Unnamed: 0,id,original,edit,grades,meanGrade,Z_raw,replaced,W_raw,X_raw,Y
0,14530.0,France is ‘ hunting down its citizens who join...,twins,10000,0.2,France is ‘ hunting down its citizens who join...,Isis,twins,France is ‘ hunting down its citizens who join...,0.2
1,13034.0,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,"Pentagon claims 2,000 % increase in Russian tr...",Syria,bowling,"Pentagon claims 2,000 % increase in Russian tr...",1.6
2,8731.0,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,Iceland PM Calls Snap Vote as Pedophile Furor ...,Coalition,party,Iceland PM Calls Snap Vote as Pedophile Furor ...,1.0
3,76.0,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4,"In an apparent first , Iran and Israel engage ...",engage,slap,"In an apparent first , Iran and Israel slap ea...",0.4
4,6164.0,Trump was told weeks ago that Flynn misled <Vi...,school,0,0.0,Trump was told weeks ago that Flynn misled Vic...,Vice,school,Trump was told weeks ago that Flynn misled sch...,0.0
...,...,...,...,...,...,...,...,...,...,...
2946,,Zuckerberg sold nearly $ 500 million Facebook ...,emojis,32210,1.6,Zuckerberg sold nearly $ 500 million Facebook ...,stock,emojis,Zuckerberg sold nearly $ 500 million Facebook ...,1.6
2949,,“ Fake <news/> ” or free speech : Is Google cr...,discs,0,0.0,“ Fake news ” or free speech : Is Google crack...,news,discs,“ Fake discs ” or free speech : Is Google crac...,0.0
2952,,“ Stop this bullshit ” : uncle of Pakistani gi...,startled,0,0.0,“ Stop this bullshit ” : uncle of Pakistani gi...,killed,startled,“ Stop this bullshit ” : uncle of Pakistani gi...,0.0
2955,,“ The fish rots from the <head/> ” : a histori...,toupee,32110,1.4,“ The fish rots from the head ” : a historian ...,head,toupee,“ The fish rots from the toupee ” : a historia...,1.4


### Convert raw text columns to w2v

In [9]:
model = gensim.models.KeyedVectors.load_word2vec_format('../../data/GoogleNews-vectors-negative300.bin.gz', binary=True)

from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="../../data/glove.6B/glove.6B.300d.txt", 
               word2vec_output_file="../../data/glove.6B/gensim_glove_vectors.txt")

from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("../../data/glove.6B/gensim_glove_vectors.txt", binary=False)

In [39]:
def word_vector(word2vec_model, word):
    try:
        return word2vec_model[word]
    except KeyError:
        return np.nan
    
def edits_diff(x,model=model):
    try:
        edit = model[x['edit']]
        init = model[x['replaced']]
        return edit-init
    except KeyError:
        return np.nan
    
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    if len(doc)>0:
        return np.mean(word2vec_model[doc], axis=0)
    else:
        return np.nan

In [40]:
len(glove_model.vocab.keys()), len(model.vocab.keys())

(400000, 3000000)

In [41]:
%%time
total_df['W_vec'] = total_df['W_raw'].apply(lambda x: word_vector(model, x))
total_df['Z_vec'] = total_df['Z_raw'].apply(lambda x: document_vector(model, x))
total_df['X_vec'] = total_df['X_raw'].apply(lambda x: document_vector(model, x))
total_df['W_init_sub_edit_vec'] = total_df.apply(edits_diff, axis=1)
# glove vectors
total_df['W_glv_vec'] = total_df['W_raw'].apply(lambda x: word_vector(glove_model, clean_string(x)))
total_df['Z_glv_vec'] = total_df['Z_raw'].apply(lambda x: document_vector(glove_model, clean_string(x)))
total_df['X_glv_vec'] = total_df['X_raw'].apply(lambda x: document_vector(glove_model, clean_string(x)))

total_df['W_init_sub_edit_glv_vec'] = total_df.apply(lambda x: edits_diff(x, model=glove_model), axis=1)

total_df = total_df.dropna()
total_df

CPU times: user 1min 19s, sys: 19.1 s, total: 1min 38s
Wall time: 3min 21s


Unnamed: 0,id,original,edit,grades,meanGrade,Z_raw,replaced,W_raw,X_raw,Y,W_vec,Z_vec,X_vec,W_init_sub_edit_vec,W_glv_vec,Z_glv_vec,X_glv_vec,W_init_sub_edit_glv_vec
3,76.0,"In an apparent first , Iran and Israel <engage...",slap,20000,0.4,"In an apparent first , Iran and Israel engage ...",engage,slap,"In an apparent first , Iran and Israel slap ea...",0.4,"[0.17871094, -0.13671875, 0.091308594, 0.27148...","[-0.16085683, 0.11282813, -0.017742654, 0.1522...","[-0.15556335, 0.12433208, -0.025668057, 0.1493...","[0.115234375, -0.24853516, 0.05419922, 0.14550...","[0.20881, 0.13581, -0.34811, 0.10243, -0.44111...","[-0.29926562, 0.042028673, -0.24126922, -0.067...","[-0.2568383, 0.004592164, -0.21411426, -0.0841...","[0.138641, -0.26598, -0.23705998, 0.71699, -0...."
5,8832.0,All 22 <promises/> Trump made in his speech to...,sounds,22200,1.2,All 22 promises Trump made in his speech to Co...,promises,sounds,All 22 sounds Trump made in his speech to Cong...,1.2,"[0.15722656, -0.015136719, 0.024169922, 0.1142...","[-0.16917509, 0.11653765, -0.004016353, 0.1354...","[-0.16694859, 0.11431636, 0.0022720026, 0.1258...","[0.029296875, -0.31591797, -0.26293945, 0.2060...","[0.072236, -0.064091, -0.72274, 0.029478, -0.1...","[-0.25548378, 0.0055763684, -0.21668643, -0.01...","[-0.24318196, 0.0027593398, -0.2431335, -0.041...","[-0.274745, 0.17397, -0.23278001, 0.66924, -0...."
6,12174.0,New DOJ alert system will flag <crimes/> again...,laughter,32100,1.2,New DOJ alert system will flag crimes against ...,crimes,laughter,New DOJ alert system will flag laughter agains...,1.2,"[0.31835938, -0.25976562, 0.19628906, 0.484375...","[-0.179599, 0.12229004, -0.008529663, 0.129858...","[-0.1853861, 0.12889583, -0.013356279, 0.11729...","[0.33666992, -0.56640625, -0.040039062, 0.5917...","[-0.23031, -0.030609, 0.082003, 0.13315, -0.00...","[-0.24717303, 0.081990376, -0.31995907, -0.066...","[-0.26802576, 0.06752585, -0.35055396, -0.0671...","[-0.251844, -0.38553903, 0.109877996, 0.428259..."
9,14191.0,Dutch minister resigns in drug baron <row/>,blow,0,0.0,Dutch minister resigns in drug baron row,row,blow,Dutch minister resigns in drug baron blow,0.0,"[0.26953125, 0.07128906, 0.14746094, -0.083984...","[-0.21529505, 0.09837018, 0.0059074634, 0.1422...","[-0.20689122, 0.09824865, 0.00816076, 0.138844...","[0.1821289, -0.021484375, 0.21435547, -0.19335...","[0.7799, -0.012848, -0.63467, -0.20527, -0.251...","[-0.24108344, -0.05642715, -0.26649308, -0.049...","[-0.22866751, -0.009843903, -0.27912727, -0.07...","[0.19519001, -0.198688, -0.76445, 0.18273, -0...."
10,14268.0,Dozens dead in possible gas <attack/> in Syria...,bloating,22100,1.0,Dozens dead in possible gas attack in Syria ; ...,attack,bloating,Dozens dead in possible gas bloating in Syria ...,1.0,"[0.012634277, 0.25585938, -0.1796875, 0.161132...","[-0.16217759, 0.09522083, 0.01645795, 0.153305...","[-0.16275138, 0.09460449, 0.021258319, 0.14807...","[-0.16802979, 0.12402344, -0.25683594, 0.22216...","[-0.0010849, 0.12138, 0.22883, 0.13895, -0.202...","[-0.28534657, 0.12610303, -0.32908022, -0.0660...","[-0.27154708, 0.1449134, -0.33190408, -0.07578...","[1.09489, 0.19946003, 0.109608, -0.25891, -0.4..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3013,17.0,"Eyeing an election , Austria 's far-right Free...",flirts,21100,0.8,"Eyeing an election , Austria 's far-right Free...",commits,flirts,"Eyeing an election , Austria 's far-right Free...",0.8,"[0.15625, -0.12011719, -0.035888672, 0.0625, -...","[-0.19297282, 0.119241156, 0.0052346624, 0.128...","[-0.19415405, 0.12365723, 0.007941894, 0.12312...","[0.09643555, -0.4970703, -0.5515137, 0.0007324...","[-0.35989, 0.39939, 0.304, 0.22194, -0.21962, ...","[-0.26238957, 0.002128861, -0.291182, -0.06419...","[-0.27710137, -0.010364528, -0.32532543, -0.05...","[-0.363321, 0.166347, 0.457223, 0.43596, -0.38..."
3014,6829.0,The Latest : McCabe lawyer says criminal <refe...,mind,10000,0.2,The Latest : McCabe lawyer says criminal refer...,referral,mind,The Latest : McCabe lawyer says criminal mind ...,0.2,"[0.23535156, 0.1640625, 0.03930664, 0.15722656...","[-0.19099759, 0.12131135, 0.0032176143, 0.1335...","[-0.1856874, 0.10970147, 0.0059516374, 0.14110...","[0.45703125, 0.14160156, -0.1550293, 0.0146484...","[-0.19272, -0.3462, -0.16195, 0.12966, 0.48245...","[-0.2296312, 0.037876945, -0.30486488, -0.1363...","[-0.1922786, 0.08629804, -0.27266732, -0.16186...","[0.2517, -0.68961, -0.7339, -0.3505, -0.169030..."
3015,12145.0,Steve Bannon ’s own words show sharp break on ...,guards,21000,0.6,Steve Bannon ’s own words show sharp break on ...,issues,guards,Steve Bannon ’s own words show sharp break on ...,0.6,"[-0.040039062, 0.36914062, 0.15722656, -0.0066...","[-0.17600851, 0.10289748, 1.0388962e-05, 0.140...","[-0.17773305, 0.10385662, 0.0008969514, 0.1395...","[0.0007324219, 0.1171875, 0.37304688, -0.31521...","[0.33292, 0.1669, 0.11604, -0.046371, -0.36552...","[-0.18546198, -0.0034434667, -0.23186754, -0.0...","[-0.2327396, -0.01566634, -0.24392949, -0.0567...","[0.235443, 0.3515206, -0.308492, -0.200829, -0..."
3021,6845.0,Spicer : We do n't regret repeating claim that...,licked,11000,0.4,Spicer : We do n't regret repeating claim that...,spied,licked,Spicer : We do n't regret repeating claim that...,0.4,"[-0.1796875, 0.1171875, -0.018676758, 0.135742...","[-0.18470407, 0.11734853, -0.020107837, 0.1634...","[-0.17765427, 0.11036682, -0.014645894, 0.1591...","[-0.28027344, -0.15234375, -0.20227051, 0.1661...","[-0.50318, 0.64528, -0.69668, -0.088312, -0.27...","[-0.32987472, -0.00243384, -0.21117866, 0.0087...","[-0.34803858, 0.015613832, -0.23332469, 0.0098...","[0.199165, 0.36876002, -0.065579, 0.29648098, ..."


### Cluster W

In [112]:
W_glv_vec = total_df['W_glv_vec'].values
W_glv_vec = np.concatenate(W_glv_vec).ravel().reshape(len(W_glv_vec), -1)
clusterer = KMeans(n_clusters=20, random_state=10)
total_df['cluster_labels'] = clusterer.fit_predict(W_glv_vec)
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.array(total_df['cluster_labels']).reshape(-1,1))
def one_hot(x):
    return enc.transform(np.array([x]).reshape(-1,1)).toarray()[0]
total_df['cluster_labels_one_hot'] = total_df['cluster_labels'].apply(lambda x: one_hot(x))

In [113]:
clusters = {}
for i in range(20):
    clusters[i] = total_df[total_df['cluster_labels']==i]['W_raw']

### Prepare Phis: hypothsis testing based on Hossain et al.
--------

Hossain et al, who compiled the Homicroedit dataset, analyzed and hypothesized what "humor features" exist in the humorous edits. Some of the candidate explainers were:
#### Manual inspection
1. Edit word forms a meaningful n-gram with adjacent words
2. Connection b/w edit word and original word (e.g. semnatically distant, or similar pronounciation)
3. Edit word makes a strong connection with an entity in the headline (Trump-hair, Obama-ears)
4. Sarcasm
5. Tension Supression/Relief Theory of Humor
6. Edit word creates incongruity
7. Setup and punchline: surprising edit at the end of the sentence
-----------------
### Translation to our setup
1. Length of resulting edited sentence (should not vary with w)
2. Cosine distance b/w Glove of edit word and the rest of words in sentence
3. Location index of replaced word (should not vary with w)
4. Sentiment polarity of edit word
5. Sentiment polarity of resulting sentence
6. Cosine distance b/w Glove of edited word and Glove of original word
7. Cosine distance from neigbouring words
8. Distance of final sentence from cluster centroids

1. Length of resulting edited sentence (should not vary with w)

In [102]:
total_df['phis0'] = total_df['X_raw'].apply(lambda x: len(x))

2. Cosine distance b/w Glove of edit word and the rest of words in sentence

In [103]:
def avg_dist_edit_rest(x):
    edit = x['edit']
    edit_vec = x['W_glv_vec']
    sent = x['X_raw']
    dists = []
    for word in sent.split(" "):
        if word!=edit:
            word_vec = word_vector(glove_model, clean_string(word))
            if not np.isnan(word_vec).any():
                dist = cosine_similarity(
                    word_vec.reshape(1,-1), 
                    edit_vec.reshape(1,-1))
                dists.append(dist)
    return np.mean(dists)

total_df['phis1'] = total_df.apply(avg_dist_edit_rest, axis=1)

3. location index of replaced word (should not vary with w)

In [104]:
def ind_of_edit(x):
    return x['X_raw'].split(" ").index(x['edit'])

total_df['phis2'] = total_df.apply(ind_of_edit, axis=1)

4. Sentiment polarity of edit word

In [105]:
import stanza
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

def sentiment_of_word(x):
    sents = []
    doc = nlp(x)
    for sentence in doc.sentences:
        sent = sentence.sentiment
        sents.append(sent)
    return sents[0]

total_df['phis3'] = total_df['edit'].apply(sentiment_of_word)

2021-06-04 13:08:59 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| sentiment | sstplus |

2021-06-04 13:08:59 INFO: Use device: cpu
2021-06-04 13:08:59 INFO: Loading: tokenize
2021-06-04 13:08:59 INFO: Loading: sentiment
2021-06-04 13:09:01 INFO: Done loading processors!


5. Sentiment polarity of resulting sentence

In [106]:
def sentiment_of_word(x):
    sents = []
    doc = nlp(x)
    for i, sentence in enumerate(doc.sentences):
        sent = sentence.sentiment
        sents.append(sent)
    return sents[0]

total_df['phis4'] = total_df['X_raw'].apply(sentiment_of_word)

6. Cosine distance b/w Glove of edited word and Glove of original word

In [107]:
total_df['replaced_glv_vec'] = total_df['replaced'].apply(lambda x: word_vector(glove_model, clean_string(x)))
total_df.dropna(inplace=True)
def cosine_distance_original_and_edit_word(x):
    edit_vec = x['W_glv_vec']
    original_vec = x['replaced_glv_vec']
    return cosine_similarity(
                    edit_vec.reshape(1,-1), 
                    original_vec.reshape(1,-1))

total_df['phis5'] = total_df.apply(cosine_distance_original_and_edit_word, axis=1)

7. Cosine distance from neigbouring words

In [108]:
def dist_from_neighb(x, neighb=1):
    edit_vec = x['W_glv_vec']
    sentence = x['X_raw']
    edit_index = x['phis2']
    neighb_word_vec = word_vector(glove_model, sentence[edit_index+neighb])
    if not np.isnan(neighb_word_vec).any():
        similar = cosine_similarity(
                        neighb_word_vec.reshape(1,-1), 
                        edit_vec.reshape(1,-1)).squeeze(axis=0)[0]
        return similar
    else:
        return -1

total_df['phis6'] = total_df.apply(dist_from_neighb, neighb=-2, axis=1)
total_df['phis7'] = total_df.apply(dist_from_neighb, neighb=-1, axis=1)
total_df['phis8'] = total_df.apply(dist_from_neighb, neighb=1, axis=1)
total_df['phis9'] = total_df.apply(dist_from_neighb, neighb=2, axis=1)

8. Distance of final sentence from cluster centroids

In [121]:
%%time 
cluster_centers = clusterer.cluster_centers_
avs_resulting_sent_vec = total_df['X_raw'].apply(lambda x: document_vector(glove_model, clean_string(x)))
distance_avg_sent_to_centroids = {10+i: [] for i in range(20)}
for sent in avs_resulting_sent_vec:
    for centroid_id, centroid in enumerate(cluster_centers):
        distance_avg_sent_to_centroids[centroid_id+10].append(
            cosine_similarity(sent.reshape(1,-1), 
                              centroid.reshape(1,-1)).item())
for key in distance_avg_sent_to_centroids.keys():
    total_df['phis'+str(key)] = distance_avg_sent_to_centroids[key]


CPU times: user 49 s, sys: 1.33 s, total: 50.3 s
Wall time: 56.2 s


In [122]:
len(list(total_df.columns[total_df.columns.str.contains('phis')]))

30

In [123]:
# Normalize phis and drop nans
for phi_col in list(total_df.columns[total_df.columns.str.contains('phis')]):
    total_df[phi_col] = scale(total_df[phi_col])
    
len(total_df)

7575

In [124]:
unseen_clusters = 11
total_df_seen = total_df[total_df['cluster_labels'] != unseen_clusters]
total_df_unseen = total_df[total_df['cluster_labels'] == unseen_clusters]
print(total_df_seen.shape, total_df_unseen.shape)

(6857, 949) (718, 949)


### Groundtruth Y

In [125]:
np.random.seed(42)
weights_phis_groundtruth = np.random.uniform(-1, 1, 30)
zero_out_inds = np.random.choice(30, 9, replace=False)
weights_phis_groundtruth[zero_out_inds] = 0
# make sure at least one not partic. is varying w/ W
weights_phis_groundtruth[3] = 0
neg_inds = np.random.choice(30, 15, replace=False)
weights_phis_groundtruth[neg_inds] *= -1

def generate_groudtruth(x, weight_to_apply):
    phis = x[list(x.index[x.index.str.contains('phis')])].values
    try:
        result = (weight_to_apply@phis).squeeze(axis=0)
    except:
        result = (weight_to_apply@phis)
    # add noise to linear combin.
    result += np.random.normal(0, .5)
    return result
    
total_df['Y_groundtruth'] = total_df.apply(lambda x: generate_groudtruth(x, weights_phis_groundtruth), axis=1)

In [126]:
weights_phis_groundtruth

array([ 0.25091976,  0.90142861, -0.46398788,  0.        , -0.        ,
       -0.        ,  0.88383278,  0.73235229, -0.20223002,  0.        ,
       -0.        ,  0.9398197 ,  0.66488528, -0.57532178, -0.63635007,
        0.        , -0.        ,  0.04951286,  0.13610996,  0.41754172,
        0.22370579, -0.        ,  0.        , -0.26727631,  0.08786003,
        0.57035192, -0.60065244, -0.        , -0.18482914,  0.90709917])

### Export dataset

In [128]:
# restructure dataframe: from single column multi-dimensional 
# to multi-column one-dimensional
total_df[['X_glv_vec'+str(i) for i in range(len(total_df['X_glv_vec'].iloc[0]))]] =\
    pd.DataFrame(total_df.X_glv_vec.tolist(), index = total_df.index)
total_df.drop('X_glv_vec', axis=1, inplace=True)

total_df[['Z_glv_vec'+str(i) for i in range(len(total_df['Z_glv_vec'].iloc[0]))]] =\
    pd.DataFrame(total_df.Z_glv_vec.tolist(), index = total_df.index)
total_df.drop('Z_glv_vec', axis=1, inplace=True)

total_df[['W_glv_vec'+str(i) for i in range(len(total_df['W_glv_vec'].iloc[0]))]] =\
    pd.DataFrame(total_df.W_glv_vec.tolist(), index = total_df.index)
total_df.drop('W_glv_vec', axis=1, inplace=True)

# total_df[['cluster_labels_one_hot'+str(i) for i in range(len(total_df['cluster_labels_one_hot'].iloc[0]))]] =\
#     pd.DataFrame(total_df.cluster_labels_one_hot.tolist(), index = total_df.index)
# total_df.drop('cluster_labels_one_hot', axis=1, inplace=True)

In [129]:
np.random.seed(42)
# # choice = int(np.random.choice(5, 1, replace=False))
# # dict_cluster_sizes = dict(total_df['cluster_labels'].value_counts())
# # keys=list(dict_cluster_sizes.keys())
# # values=list(dict_cluster_sizes.values())
# # value = sorted(values)[15+choice]
# # print("size of chosen split: ", value)
# # unseen_clusters = [keys[values.index(value)]]

# this is the cluster used for unseen W in paper
# chosen via process above
unseen_clusters = 11 

# total_df_seen = total_df[~total_df['cluster_labels'].isin([unseen_clusters])]
# total_df_unseen = total_df[total_df['cluster_labels'].isin([unseen_clusters])]

total_df_seen = total_df[total_df['cluster_labels'] != unseen_clusters]
total_df_unseen = total_df[total_df['cluster_labels'] == unseen_clusters]
print(total_df_seen.shape, total_df_unseen.shape)

total_df_seen = total_df_seen[
                    list(total_df_seen.columns[
                        total_df_seen.columns.str.contains('Z_glv_vec')])+
                    list(total_df_seen.columns[
                        total_df_seen.columns.str.contains('W_glv_vec')])+
                    list(total_df_seen.columns[
                        total_df_seen.columns.str.contains('X_glv_vec')])+['Y_groundtruth']
                    + list(total_df_seen.columns[
                        total_df_seen.columns.str.contains('phis')])
]


total_df_unseen = total_df_unseen[
                    list(total_df_unseen.columns[
                    total_df_unseen.columns.str.contains('Z_glv_vec')])+
                    list(total_df_unseen.columns[
                        total_df_unseen.columns.str.contains('W_glv_vec')])+
                    list(total_df_unseen.columns[
                        total_df_unseen.columns.str.contains('X_glv_vec')])+['Y_groundtruth']
                    + list(total_df_unseen.columns[
                        total_df_unseen.columns.str.contains('phis')])]

total_df_seen = shuffle(total_df_seen)
total_df_seen_train = total_df_seen.iloc[:int(len(total_df_seen)*0.9)]
total_df_seen_test = total_df_seen.iloc[int(len(total_df_seen)*0.9):]

total_df_unseen = shuffle(total_df_unseen)
df_test_unseen_train = total_df_unseen.iloc[:int(len(total_df_unseen)*0.8)]
df_test_unseen_test = total_df_unseen.iloc[int(len(total_df_unseen)*0.8):]

total_df_seen = total_df_seen.to_numpy()
total_df_seen_train = total_df_seen_train.to_numpy()
total_df_seen_test = total_df_seen_test.to_numpy()

testset_unseen = total_df_unseen.to_numpy()
test_unseen_train = df_test_unseen_train.to_numpy()
test_unseen_test = df_test_unseen_test.to_numpy()

print(total_df_seen_train.shape, total_df_seen_test.shape, 
      test_unseen_train.shape, test_unseen_test.shape)

Path('../../data/Humicroedit/').mkdir(parents=True, exist_ok=True)
np.savez_compressed('../../data/Humicroedit/Humicroedit_trainset_seen_train.npz', total_df_seen_train)
np.savez_compressed('../../data/Humicroedit/Humicroedit_trainset_seen_test.npz', total_df_seen_test)
np.savez_compressed('../../data/Humicroedit/Humicroedit_testset_unseen_train.npz', test_unseen_train)
np.savez_compressed('../../data/Humicroedit/Humicroedit_testset_unseen_test.npz', test_unseen_test)
np.savez_compressed('../../data/Humicroedit/Humicroedit_params.npz', weights_phis_groundtruth)

(6857, 949) (718, 949)
(6171, 931) (686, 931) (574, 931) (144, 931)


## Additional experiments: create 100 versions of datasets with different coefficients for Y

In [130]:
np.random.seed(42)
num_runs_diff_Y = 100
for run in range(num_runs_diff_Y):
    weights_phis_groundtruth = np.random.uniform(-1, 1, 30)
    zero_out_inds = np.random.choice(30, 9, replace=False)
    weights_phis_groundtruth[zero_out_inds] = 0
    # make sure at least one not partic. is varying w/ W
    weights_phis_groundtruth[3] = 0
    neg_inds = np.random.choice(30, 15, replace=False)
    weights_phis_groundtruth[neg_inds] *= -1

    def generate_groudtruth(x, weight_to_apply):
        phis = x[list(x.index[x.index.str.contains('phis')])].values
        try:
            result = (weight_to_apply@phis).squeeze(axis=0)
        except:
            result = (weight_to_apply@phis)
        # add noise to linear combin.
        result += np.random.normal(0, .5)
        return result

    total_df['Y_groundtruth'] = total_df.apply(lambda x: generate_groudtruth(x, weights_phis_groundtruth), axis=1)
    
    unseen_clusters = 11 

    total_df_seen = total_df[~total_df['cluster_labels'].isin([unseen_clusters])]
    total_df_unseen = total_df[total_df['cluster_labels'].isin([unseen_clusters])]
#     print(total_df_seen.shape, total_df_unseen.shape)

    total_df_seen = total_df_seen[
                        list(total_df_seen.columns[
                            total_df_seen.columns.str.contains('Z_glv_vec')])+
                        list(total_df_seen.columns[
                            total_df_seen.columns.str.contains('W_glv_vec')])+
                        list(total_df_seen.columns[
                            total_df_seen.columns.str.contains('X_glv_vec')])+['Y_groundtruth']
                        + list(total_df_seen.columns[
                            total_df_seen.columns.str.contains('phis')])
    ]


    total_df_unseen = total_df_unseen[
                        list(total_df_unseen.columns[
                        total_df_unseen.columns.str.contains('Z_glv_vec')])+
                        list(total_df_unseen.columns[
                            total_df_unseen.columns.str.contains('W_glv_vec')])+
                        list(total_df_unseen.columns[
                            total_df_unseen.columns.str.contains('X_glv_vec')])+['Y_groundtruth']
                        + list(total_df_unseen.columns[
                            total_df_unseen.columns.str.contains('phis')])]

    total_df_seen = shuffle(total_df_seen)
    total_df_seen_train = total_df_seen.iloc[:int(len(total_df_seen)*0.9)]
    total_df_seen_test = total_df_seen.iloc[int(len(total_df_seen)*0.9):]

    total_df_unseen = shuffle(total_df_unseen)
    df_test_unseen_train = total_df_unseen.iloc[:int(len(total_df_unseen)*0.8)]
    df_test_unseen_test = total_df_unseen.iloc[int(len(total_df_unseen)*0.8):]

    total_df_seen = total_df_seen.to_numpy()
    total_df_seen_train = total_df_seen_train.to_numpy()
    total_df_seen_test = total_df_seen_test.to_numpy()

    testset_unseen = total_df_unseen.to_numpy()
    test_unseen_train = df_test_unseen_train.to_numpy()
    test_unseen_test = df_test_unseen_test.to_numpy()

#     print(total_df_seen_train.shape, total_df_seen_test.shape, 
#           test_unseen_train.shape, test_unseen_test.shape)
    
    Path('../../data/Humicroedit/diffY/').mkdir(parents=True, exist_ok=True)    
    np.savez_compressed('../../data/Humicroedit/diffY/Humicroedit_unif{}_trainset_seen_train.npz'.format(run), total_df_seen_train)
    np.savez_compressed('../../data/Humicroedit/diffY/Humicroedit_unif{}_trainset_seen_test.npz'.format(run), total_df_seen_test)
    np.savez_compressed('../../data/Humicroedit/diffY/Humicroedit_unif{}_testset_unseen_train.npz'.format(run), test_unseen_train)
    np.savez_compressed('../../data/Humicroedit/diffY/Humicroedit_unif{}_testset_unseen_test.npz'.format(run), test_unseen_test)
    np.savez_compressed('../../data/Humicroedit/diffY/Humicroedit_unif{}_params.npz'.format(run), weights_phis_groundtruth)