# Imports

In [1]:
# Put the parent directory on the import path (presumably so the local
# `utils` package can be imported from this notebook — confirm layout).
import sys
sys.path.append('../')
# Enable IPython's autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
# Raise the pandas display limit to 500 rows.
pd.set_option('display.max_rows', 500)

import spacy
from spacy.lang.en import English
from spacy.attrs import POS
# Shared spaCy pipeline object; later cells call `nlp(...)` to parse text
# and read `.vector` from the results.
nlp = spacy.load('en_core_web_lg')

In [2]:
from utils.nlp_utils import get_feats, lemmatize_doc, scramble_words
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

# pre-process text by lemmatizing

In [3]:
# Base name (no extension) of the CSV under ../data to analyse.
version_to_use = 'tangramsSequential_collapsed'
d_raw = pd.read_csv('../data/' + version_to_use + '.csv')

# Parse each raw utterance with spaCy, then reduce every parsed utterance
# to its lemmas with the project helper.
d_raw['text'] = list(map(nlp, d_raw['contents']))
d_raw['lemmas'] = list(map(lemmatize_doc, d_raw['text']))

# gensim vocabulary built over the lemmatized utterances.
docs_dict = Dictionary(d_raw['lemmas'])

Extract the 'content' words (nouns, verbs, adjectives, and adverbs that have a word vector) that we'll use for extracting vectors.

In [51]:
# For every parsed utterance keep only the tokens that are nouns, verbs,
# adjectives or adverbs AND have a word vector available.
contentful = [
    [
        token
        for token in parsed
        if token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV'] and token.has_vector
    ]
    for parsed in d_raw['text']
]
d_raw['contentful'] = contentful

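Some (gameid, intendedName, repetitionNum) combinations are missing from the data, so we reindex over the full product of these keys and 'fill in' the missing rows so that their content is NaN.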

In [54]:
# Key columns that jointly identify one utterance.
index_cols = ['gameid', 'intendedName', 'repetitionNum']

# Re-index over the full cartesian product of the observed key values so
# that every (game, tangram, repetition) combination has a row. `set_index`
# returns a new frame, so `d_raw` itself is left untouched.
d = d_raw.set_index(index_cols)
mux = pd.MultiIndex.from_product(d.index.levels, names=index_cols)
# `reindex` fills the newly created rows with NaN by default; its
# `fill_value` is a scalar, so no list-wrapped NaN is passed here.
d = d.reindex(mux).reset_index()

# Row positions (in the filled-in frame) of the combinations that had no data.
nan_rows = np.flatnonzero(d['text'].isna()).tolist()
# The same positions expressed relative to an array from which the missing
# rows have been removed: each index is shifted back by the number of missing
# rows preceding it. This is the form `np.insert` expects.
nan_insert_rows = [row - offset for (offset, row) in enumerate(nan_rows)]

# Unique game ids in order of first appearance.
gameidList = d['gameid'].unique().tolist()
tangramList = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']

### create tf-idf weightings

In [55]:
# Bag-of-words for every row whose lemmas are present; filled-in (NaN)
# rows are skipped here.
docs_corpus = [
    docs_dict.doc2bow(lemmas)
    for lemmas in d['lemmas']
    if not np.any(pd.isna(lemmas))
]

# Fit tf-idf on the corpus and re-express each document in tf-idf weights.
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf = model_tfidf[docs_corpus]

# Dense (n_documents x vocabulary_size) matrix of tf-idf weights.
docs_vecs = np.vstack([
    sparse2full(weights, len(docs_dict))
    for weights in docs_tfidf
])

# One spaCy vector per vocabulary entry, stacked in dictionary-id order.
tfidf_emb_vecs = np.vstack([
    nlp(docs_dict[term_id]).vector
    for term_id in range(len(docs_dict))
])

In [56]:
# tf-idf-weighted sum of word vectors for every document that has text.
docs_emb_raw = docs_vecs.dot(tfidf_emb_vecs)
# Put NaN rows back at the positions of the missing utterances so the
# embedding matrix lines up row-for-row with `d`.
docs_emb = np.insert(docs_emb_raw, nan_insert_rows, values=np.nan, axis=0)

# Examine semantic embeddings
We'd like to pull out bag-of-words embeddings from NPs in each utterance in the cued dataset and cluster them for each tangram; we expect to see different pairs in different parts of the space (i.e. to compute a d' for an 'idiosyncrasy' or 'multiple equilibria' result) and also different utterances from single games closer together.

In [82]:
# Generate 100 scrambled baselines and save the raw-average features of each.
# `get_feats` is already imported near the top of the notebook, so it is not
# re-imported here.
for i in range(100) :
    # Only the second element of the returned triple (the raw-average
    # features) is saved. It is bound to its own name so that the `meta`,
    # `raw_avg_feats` and `weighted_feats` variables used by other cells are
    # never overwritten with scrambled data, whatever order cells are run in.
    scrambled_raw_avg_feats = get_feats(d, docs_emb, nlp, scramble=True)[1]
    np.save('outputs/feats_tangrams_embeddings_rawavg_scrambled{}.npy'.format(i), scrambled_raw_avg_feats)

In [79]:
# Features for the real (unscrambled) utterances.
meta, raw_avg_feats, weighted_feats = get_feats(d, docs_emb, nlp, scramble=False)

# Persist the metadata table and both feature matrices.
meta.to_csv(path_or_buf='outputs/meta_tangrams_embeddings.csv')
np.save(file='outputs/feats_tangrams_embeddings_tfidf.npy', arr=weighted_feats)
np.save(file='outputs/feats_tangrams_embeddings_rawavg.npy', arr=raw_avg_feats)

re-derive these embeddings on completely scrambled, re-sampled utterances

TODO: initial distribution w/in vs. across
TODO: 2D PCA... (traces of beginnings and ends)
-- Connect individuals in a game with a line!
-- Word clouds for initial and final

# Look at tsne visualization
TODO: there are a bunch of problems with this: a lot of the creative utterances don't exist in current embedding (e.g. "ghostman"), sometimes they don't converge to a noun (e.g. "flying"), etc.

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
# Fixed seed so the stochastic reducers give the same layout on every run
# (Restart & Run All reproduces the saved coordinates).
VIZ_RANDOM_STATE = 0

# 2-D t-SNE used for the final visualization coordinates.
tsne = TSNE(n_components = 2, random_state = VIZ_RANDOM_STATE)
# 50-component PCA applied before t-SNE to reduce dimensionality.
big_pca = PCA(n_components = 50, random_state = VIZ_RANDOM_STATE)
# 2-component PCA and MDS reducers.
viz_pca = PCA(n_components = 2, random_state = VIZ_RANDOM_STATE)
mds = MDS(n_components=2, random_state = VIZ_RANDOM_STATE)

In [None]:
# Leading columns of the output table. 'x_mds' / 'y_mds' are declared but not
# computed in this cell, so they stay NaN.
viz_columns = ['gameid', 'intendedName', 'repetitionNum', 'x_tsne', 'y_tsne', 'x_mds', 'y_mds', 'feats_type']

# One frame per (tangram, feature type); concatenated once at the end instead
# of growing a DataFrame inside the loop (DataFrame.append was removed in
# pandas 2.0 and is quadratic).
viz_frames = []
for name, group in meta.groupby('intendedName') :
    # Index labels of this tangram's rows; used positionally into the feature
    # matrices (assumes `meta` has a default 0..n-1 index — confirm).
    tangram_inds = np.array(group.index)
    for feats_type, feats in [('raw_avg', raw_avg_feats), ('weighted', weighted_feats)] :
        relevant_feats = feats[tangram_inds]
        # A row is usable only if every entry is finite. The same mask selects
        # the rows fed to PCA/t-SNE and the rows the result is written back to,
        # so the two can never disagree.
        valid_rows = np.isfinite(relevant_feats).all(axis=1)
        tsne_valid = tsne.fit_transform(big_pca.fit_transform(relevant_feats[valid_rows]))
        # Rows without usable features keep NaN coordinates.
        tsne_out = np.full((relevant_feats.shape[0], 2), np.nan, dtype=tsne_valid.dtype)
        tsne_out[valid_rows] = tsne_valid
        X_tsne = pd.DataFrame(tsne_out,
                              columns = ['x_tsne', 'y_tsne'],
                              index=tangram_inds)
        X_tsne['feats_type'] = feats_type
        viz_frames.append(pd.concat([group, X_tsne], axis = 1))

if viz_frames :
    embedding_viz = pd.concat(viz_frames, ignore_index=True, sort=False)
    # Declared columns first, then any additional columns carried over from `meta`.
    extra_columns = [col for col in embedding_viz.columns if col not in viz_columns]
    embedding_viz = embedding_viz.reindex(columns=viz_columns + extra_columns)
else :
    # No groups at all: keep an empty table with the declared columns.
    embedding_viz = pd.DataFrame(columns=viz_columns)


In [None]:
embedding_viz.to_csv('outputs/embeddings.csv')