# Imports

In [1]:
import sys
sys.path.append('../')
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import spacy
from spacy.lang.en import English
from spacy.attrs import POS
nlp = spacy.load('en_core_web_lg')

In [2]:
# from nlp_utils import get_feats, lemmatize_doc
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

In [3]:
def stop(t):
    """Return a truthy value when token ``t`` should be treated as a stop word.

    A token qualifies when its ``is_stop`` attribute is truthy, or when its
    ``lemma_`` equals one of the task-specific filler words listed below.

    NOTE(review): no other cell visible in this notebook calls this helper;
    the filtering cells test ``token.is_stop`` directly.
    """
    flagged = t.is_stop
    if flagged:
        return flagged
    task_fillers = ('person', 'tangrams', 'tangram', 'look', 'like', 'tap',
                    'choose', 'zzz', 'xxx', 'yyy', 'pick', 'guy', 'blue', 'box')
    return t.lemma_ in task_fillers

# Pre-process text by lemmatizing

In [4]:
# Load the post-test descriptions and the in-game chat messages.
# Both files are read as latin-1.
postd_raw = pd.read_csv(
    filepath_or_buffer='../data/cleaned_post_test.csv',
    encoding='latin-1',
)
md_raw = pd.read_csv(
    filepath_or_buffer='../data/cleaned_messages.csv',
    encoding='latin-1',
)

In [5]:
def _add_parsed_columns(frame):
    """Add the spaCy parse of ``frame['text']`` to ``frame`` in place.

    Writes two columns:
      * ``nlptext``       - the spaCy ``Doc`` for each text
      * ``non_stop_text`` - that Doc's tokens whose ``is_stop`` flag is False

    The texts are streamed through ``nlp.pipe`` so spaCy can batch them,
    rather than calling ``nlp(text)`` once per row.
    """
    parsed_docs = list(nlp.pipe(frame['text']))
    frame['nlptext'] = parsed_docs
    frame['non_stop_text'] = [
        [token for token in doc if not token.is_stop]
        for doc in parsed_docs
    ]


# Same preprocessing for the post-test descriptions and the chat messages.
_add_parsed_columns(postd_raw)
_add_parsed_columns(md_raw)

In [6]:
def keep_token(t):
    """Decide whether token ``t`` should be kept for lemmatization.

    A token is kept only when it is alphabetic and none of its ``is_space``,
    ``is_punct``, ``is_stop`` or ``like_num`` flags is set. Returns a falsy
    value otherwise.
    """
    alphabetic = t.is_alpha
    if not alphabetic:
        return alphabetic
    rejected = t.is_space or t.is_punct or t.is_stop or t.like_num
    return not rejected

def lemmatize_doc(doc):
    """Return the lemmas of the tokens in ``doc`` that pass ``keep_token``.

    ``doc`` may be any iterable of tokens; the lemmas come back as a list in
    the order the tokens were iterated.
    """
    lemmas = []
    for token in doc:
        if keep_token(token):
            lemmas.append(token.lemma_)
    return lemmas

In [7]:
# Reduce each stop-word-filtered token list to its list of kept lemmas.
postd_raw['lemmatized_nonstop'] = postd_raw['non_stop_text'].map(lemmatize_doc)
md_raw['lemmatized_nonstop'] = md_raw['non_stop_text'].map(lemmatize_doc)

In [8]:
# Preview the first rows of the post-test frame with the new text columns.
postd_raw.head(n=5)

Unnamed: 0,gameId,playerId,tangram,text,ownGroup,blockOrder,utt_length_words,nlptext,non_stop_text,lemmatized_nonstop
0,hXJrmsG9Q2uWDXN7T,9sgexPGxSg8dbfjky,H,buddha,ownGroup,first,1,(buddha),[buddha],[buddha]
1,hXJrmsG9Q2uWDXN7T,9sgexPGxSg8dbfjky,E,rabbit,ownGroup,first,1,(rabbit),[rabbit],[rabbit]
2,hXJrmsG9Q2uWDXN7T,9sgexPGxSg8dbfjky,F,depressed,ownGroup,first,1,(depressed),[depressed],[depressed]
3,hXJrmsG9Q2uWDXN7T,9sgexPGxSg8dbfjky,G,dancer,ownGroup,first,1,(dancer),[dancer],[dancer]
4,hXJrmsG9Q2uWDXN7T,mGLLgXimwZwoxxH5h,G,A dancer Some people see a long dressed woman ...,otherGroup,first,10,"(A, dancer, Some, people, see, a, long, dresse...","[dancer, people, long, dressed, woman, dancing]","[dancer, people, long, dressed, woman, dance]"


In [9]:
# Preview the first rows of the message frame with the new text columns.
md_raw.head(n=5)

Unnamed: 0,gameId,trialNum,partnerNum,repNum,roomId,tangram,text,total_num_words,total_num_chars,listener,speaker,nlptext,non_stop_text,lemmatized_nonstop
0,76iJQ4QXdyAcYKseZ,0,0,0,room0,B,"It looks like someone is praying, no",7,34,up6Jjy6x3SYuzkRrC,NiKAGZngti7bqNjn2,"(It, looks, like, someone, is, praying, ,, no)","[looks, like, praying, ,]","[look, like, pray]"
1,76iJQ4QXdyAcYKseZ,0,1,0,room0,A,"dancer with hump, looking on the ground, lifti...",10,52,sK8r54bPtKtwfCyE7,NiKAGZngti7bqNjn2,"(dancer, with, hump, ,, looking, on, the, grou...","[dancer, hump, ,, looking, ground, ,, lifting,...","[dancer, hump, look, ground, lift, leg]"
2,76iJQ4QXdyAcYKseZ,0,1,0,room1,C,"the one who looks like goalkeeper, he looks li...",16,82,gR2G6xLFF4jLsksxJ,up6Jjy6x3SYuzkRrC,"(the, one, who, looks, like, goalkeeper, ,, he...","[looks, like, goalkeeper, ,, looks, like, goal...","[look, like, goalkeeper, look, like, goalkeepe..."
3,76iJQ4QXdyAcYKseZ,0,2,0,room0,B,"like a dog on the ground, with bow on the head...",12,52,gR2G6xLFF4jLsksxJ,NiKAGZngti7bqNjn2,"(like, a, dog, on, the, ground, ,, with, bow, ...","[like, dog, ground, ,, bow, head, ,, kneeling]","[like, dog, ground, bow, head, kneel]"
4,76iJQ4QXdyAcYKseZ,0,2,0,room1,D,"praying maybe, arms in front, looks like a sta...",11,57,up6Jjy6x3SYuzkRrC,sK8r54bPtKtwfCyE7,"(praying, maybe, ,, arms, in, front, ,, looks,...","[praying, maybe, ,, arms, ,, looks, like, stan...","[pray, maybe, arm, look, like, stand, prayer, ..."


# Format Data

In [10]:
# Spot-check: all lemmatized post-test descriptions from one example player.
postd_raw.loc[postd_raw['playerId'].eq('NiKAGZngti7bqNjn2'), 'lemmatized_nonstop']

937    [stand, person, extended, arm, like, hold, tray]
953                     [dancer, leg, lift, have, hump]
958                             [kneel, dog, bow, head]
960                        [goalkeeper, fly, arm, open]
966                 [dog, sit, ground, wear, bow, head]
969               [dancer, look, lift, leg, have, hump]
973                        [fly, goalkeeper, open, arm]
975        [stand, person, wear, cloak, care, backpack]
Name: lemmatized_nonstop, dtype: object

In [11]:
# Narrow the spot-check to that player's 'ownGroup' description of tangram B.
postd_raw.loc[
    postd_raw['playerId'].eq('NiKAGZngti7bqNjn2')
    & postd_raw['tangram'].eq('B')
    & postd_raw['ownGroup'].eq('ownGroup'),
    'lemmatized_nonstop',
]

966    [dog, sit, ground, wear, bow, head]
Name: lemmatized_nonstop, dtype: object

In [12]:
# Split the post-test descriptions by their 'ownGroup' label, keeping only the
# join keys and the lemma lists.
df_own = postd_raw.loc[
    postd_raw['ownGroup'].eq('ownGroup'),
    ['playerId', 'tangram', 'lemmatized_nonstop'],
]
df_other = postd_raw.loc[
    postd_raw['ownGroup'].eq('otherGroup'),
    ['playerId', 'tangram', 'lemmatized_nonstop'],
]

# Pair the two descriptions per (player, tangram). The merge suffixes the
# clashing lemma column with _x (own) and _y (other); rename those to readable
# names and call the player 'speaker'.
df_posts = (
    df_own
    .merge(df_other, on=["playerId", "tangram"])
    .rename(columns={
        'playerId': 'speaker',
        'lemmatized_nonstop_x': 'post_own',
        'lemmatized_nonstop_y': 'post_other',
    })
)

df_posts

Unnamed: 0,speaker,tangram,post_own,post_other
0,9sgexPGxSg8dbfjky,H,[buddha],[leg]
1,9sgexPGxSg8dbfjky,E,[rabbit],[rabbit]
2,9sgexPGxSg8dbfjky,F,[depressed],[sad]
3,9sgexPGxSg8dbfjky,G,[dancer],"[open, chest]"
4,ek7RGAsMfmiuKv9ZW,G,[dancer],[dancer]
...,...,...,...,...
547,pXny4jckQajNRAXwg,G,"[look, like, shoe, heel]","[heel, shoe]"
548,83TWsQKpEn93idsMW,H,"[look, like, letter, diamond, shape]","[look, like, letter, diamond, shape]"
549,SvorbMj2XjMKB6Zfy,H,"[look, like, letter, diamond, shape, dot, letter]","[look, like, letter, diamond, shape, dot]"
550,pXny4jckQajNRAXwg,E,"[degree, triangle, rabbit, ear, like, triangle]","[triangle, rabbit, ear, triangle]"


In [13]:
# Attach each speaker's post-test lemma lists (post_own / post_other) to every
# message they sent about the same tangram.
#
# This cell reassigns md_raw from itself, so first drop any post_* columns left
# by an earlier run. Otherwise a re-run would yield post_own_x / post_own_y and
# break the later row.post_own access.
#
# The merge is pandas' default inner join: messages with no matching
# (speaker, tangram) row in df_posts are dropped.
# NOTE(review): assumes df_posts has one row per (speaker, tangram); duplicate
# keys would multiply message rows - TODO confirm.
md_raw = (
    md_raw
    .drop(columns=['post_own', 'post_other'], errors='ignore')
    .merge(df_posts, on=['speaker', 'tangram'])
)
md_raw.head()

Unnamed: 0,gameId,trialNum,partnerNum,repNum,roomId,tangram,text,total_num_words,total_num_chars,listener,speaker,nlptext,non_stop_text,lemmatized_nonstop,post_own,post_other
0,76iJQ4QXdyAcYKseZ,0,0,0,room0,B,"It looks like someone is praying, no",7,34,up6Jjy6x3SYuzkRrC,NiKAGZngti7bqNjn2,"(It, looks, like, someone, is, praying, ,, no)","[looks, like, praying, ,]","[look, like, pray]","[dog, sit, ground, wear, bow, head]","[kneel, dog, bow, head]"
1,76iJQ4QXdyAcYKseZ,0,2,0,room0,B,"like a dog on the ground, with bow on the head...",12,52,gR2G6xLFF4jLsksxJ,NiKAGZngti7bqNjn2,"(like, a, dog, on, the, ground, ,, with, bow, ...","[like, dog, ground, ,, bow, head, ,, kneeling]","[like, dog, ground, bow, head, kneel]","[dog, sit, ground, wear, bow, head]","[kneel, dog, bow, head]"
2,76iJQ4QXdyAcYKseZ,1,1,0,room0,B,"like a dog with a bow, on the head, dog sittin...",13,51,sK8r54bPtKtwfCyE7,NiKAGZngti7bqNjn2,"(like, a, dog, with, a, bow, ,, on, the, head,...","[like, dog, bow, ,, head, ,, dog, sitting, wth...","[like, dog, bow, head, dog, sit, wth, bow]","[dog, sit, ground, wear, bow, head]","[kneel, dog, bow, head]"
3,76iJQ4QXdyAcYKseZ,9,0,2,room0,B,kneeling prayer,2,15,up6Jjy6x3SYuzkRrC,NiKAGZngti7bqNjn2,"(kneeling, prayer)","[kneeling, prayer]","[kneel, prayer]","[dog, sit, ground, wear, bow, head]","[kneel, dog, bow, head]"
4,76iJQ4QXdyAcYKseZ,9,1,2,room0,B,"dog on the ground, with bow",6,25,sK8r54bPtKtwfCyE7,NiKAGZngti7bqNjn2,"(dog, on, the, ground, ,, with, bow)","[dog, ground, ,, bow]","[dog, ground, bow]","[dog, sit, ground, wear, bow, head]","[kneel, dog, bow, head]"


# Intersection Analysis

In [14]:
# Work on a deep copy so the overlap columns added next do not touch md_raw.
md = md_raw.copy(deep=True)

In [15]:
def _overlap_features(message_lemmas, post_lemmas):
    """Compare one message's lemmas with one post-test description's lemmas.

    Returns a 5-tuple:
      * the distinct lemmas present in both lists, sorted so the result does
        not depend on set iteration order
      * whether that overlap is non-empty
      * the number of shared lemmas
      * shared count / len(post_lemmas), or 0.0 when post_lemmas is empty
      * shared count / len(message_lemmas), or 0.0 when message_lemmas is empty
    """
    shared = sorted(set(message_lemmas) & set(post_lemmas))
    n_shared = len(shared)
    # Guard both ratios: either lemma list can be empty after filtering.
    post_share = n_shared / len(post_lemmas) if len(post_lemmas) > 0 else 0.0
    message_share = n_shared / len(message_lemmas) if len(message_lemmas) > 0 else 0.0
    return shared, n_shared > 0, n_shared, post_share, message_share


def _add_overlap_columns(frame, group):
    """Add the five overlap columns for ``post_<group>`` to ``frame`` in place.

    Columns are added in the order intersection_, non_empty_, int_length_,
    post_match_, message_match_, each suffixed with ``group``.
    """
    features = [
        _overlap_features(message, post)
        for message, post in zip(frame['lemmatized_nonstop'], frame['post_' + group])
    ]
    names = ('intersection', 'non_empty', 'int_length', 'post_match', 'message_match')
    for position, name in enumerate(names):
        frame[name + '_' + group] = [row[position] for row in features]


# Overlap of each message with the speaker's own-group description, then with
# their other-group description.
_add_overlap_columns(md, 'own')
_add_overlap_columns(md, 'other')

md.head()

Unnamed: 0,gameId,trialNum,partnerNum,repNum,roomId,tangram,text,total_num_words,total_num_chars,listener,...,intersection_own,non_empty_own,int_length_own,post_match_own,message_match_own,intersection_other,non_empty_other,int_length_other,post_match_other,message_match_other
0,76iJQ4QXdyAcYKseZ,0,0,0,room0,B,"It looks like someone is praying, no",7,34,up6Jjy6x3SYuzkRrC,...,[],False,0,0.0,0.0,[],False,0,0.0,0.0
1,76iJQ4QXdyAcYKseZ,0,2,0,room0,B,"like a dog on the ground, with bow on the head...",12,52,gR2G6xLFF4jLsksxJ,...,"[bow, dog, ground, head]",True,4,0.666667,0.666667,"[bow, dog, kneel, head]",True,4,1.0,0.666667
2,76iJQ4QXdyAcYKseZ,1,1,0,room0,B,"like a dog with a bow, on the head, dog sittin...",13,51,sK8r54bPtKtwfCyE7,...,"[bow, dog, head, sit]",True,4,0.666667,0.5,"[bow, dog, head]",True,3,0.75,0.375
3,76iJQ4QXdyAcYKseZ,9,0,2,room0,B,kneeling prayer,2,15,up6Jjy6x3SYuzkRrC,...,[],False,0,0.0,0.0,[kneel],True,1,0.25,0.5
4,76iJQ4QXdyAcYKseZ,9,1,2,room0,B,"dog on the ground, with bow",6,25,sK8r54bPtKtwfCyE7,...,"[ground, dog, bow]",True,3,0.5,1.0,"[bow, dog]",True,2,0.5,0.666667


In [20]:
# Persist the per-message overlap table without the index column.
# The spaCy object columns inherited from md_raw (nlptext, non_stop_text) are
# written as their text representation.
# NOTE(review): the output path has no file extension - confirm downstream
# readers expect exactly this name before changing it.
md.to_csv(
    path_or_buf='../analysis/post_test_similarity',
    index=False,
)