In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from scipy.spatial import distance

In [2]:
ladec = pd.read_csv('../data/external/ladec.csv')

In [3]:
#dictionary approach to GloVe vectors
embeddings_dict = {}

with open("../data/external/glove.6B.50d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [4]:
#all words in row of LaDEC have embeddings in GloVe
is_embedded = ladec.apply( lambda row: ((row['c1'] in embeddings_dict) and (row['c2'] in  embeddings_dict) and (row['stim'] in embeddings_dict) ), axis='columns' )

In [5]:
ladec['is_embedded'] = is_embedded

In [6]:
# Only keep the row if row['is_embedded'] AND row['correctParse']
# Otherwise drop it!

ladec['keep_row'] = ladec.apply( lambda row: ( (row['correctParse']=='yes') and row['is_embedded'] ), axis='columns' )

In [7]:
#new dataframe that only includes rows of LaDEC with valid parse and an embedding in the current GloVe set
embedded_ladec = ladec[ ladec['keep_row'] ]

Suggested alternate format of columns = 'cmp', 'c1_00', 'c1_01' ... 'c1_49'

Flatten the embedding arrays so that each element has a column assigned to it

This would give the DataFrame 152 columns

In [8]:
c1_labels = [ 'c1_%02d' % x for x in range(50) ]
c2_labels = [ 'c2_%02d' % x for x in range(50) ]
cmp_labels = [ 'cmp_%02d' % x for x in range(50) ]

In [9]:
all_embeddings = pd.DataFrame( columns = ['cmp','c1','c2', *c1_labels, *c2_labels, *cmp_labels])

In [10]:
all_embeddings[ ['cmp','c1','c2'] ] = embedded_ladec[ ['stim','c1','c2'] ]

In [11]:
all_embeddings

Unnamed: 0,cmp,c1,c2,c1_00,c1_01,c1_02,c1_03,c1_04,c1_05,c1_06,...,cmp_40,cmp_41,cmp_42,cmp_43,cmp_44,cmp_45,cmp_46,cmp_47,cmp_48,cmp_49
0,gadabout,gad,about,,,,,,,,...,,,,,,,,,,
1,knockabout,knock,about,,,,,,,,...,,,,,,,,,,
2,turnabout,turn,about,,,,,,,,...,,,,,,,,,,
4,walkabout,walk,about,,,,,,,,...,,,,,,,,,,
5,runabout,run,about,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8951,junkyards,junk,yards,,,,,,,,...,,,,,,,,,,
8952,shipyards,ship,yards,,,,,,,,...,,,,,,,,,,
8953,farmyards,farm,yards,,,,,,,,...,,,,,,,,,,
8954,brickyards,brick,yards,,,,,,,,...,,,,,,,,,,


In [12]:
for i in all_embeddings.index:
    all_embeddings.loc[i, c1_labels] = embeddings_dict[all_embeddings.loc[i,'c1']]
    all_embeddings.loc[i, c2_labels] = embeddings_dict[all_embeddings.loc[i,'c2']]
    all_embeddings.loc[i, cmp_labels] = embeddings_dict[all_embeddings.loc[i,'cmp']]

In [13]:
all_embeddings.head()

Unnamed: 0,cmp,c1,c2,c1_00,c1_01,c1_02,c1_03,c1_04,c1_05,c1_06,...,cmp_40,cmp_41,cmp_42,cmp_43,cmp_44,cmp_45,cmp_46,cmp_47,cmp_48,cmp_49
0,gadabout,gad,about,-0.28186,0.76122,0.31124,-0.056947,-0.16125,-0.84541,0.21271,...,0.072761,0.46474,-0.26208,0.24451,0.26817,-0.62835,0.007282,-0.25889,0.4026,0.076465
1,knockabout,knock,about,-0.12375,-0.75023,0.60649,-0.20309,0.15405,-0.023966,0.37052,...,0.26866,0.20105,-0.60197,0.62156,-0.58291,-0.29071,0.41405,0.33043,0.296,0.098863
2,turnabout,turn,about,0.45343,-0.22031,0.53704,-0.48735,0.34246,0.046052,-0.23291,...,-0.87763,0.23897,-0.81087,-0.3565,-0.49125,-0.059674,0.67104,1.456,0.005515,0.23835
4,walkabout,walk,about,0.41281,0.9111,-0.078414,-0.46515,0.61895,-0.44519,-0.87504,...,-0.35818,-0.05923,-0.90118,0.14399,-0.1174,-0.40766,0.74211,-0.49879,0.10832,0.19781
5,runabout,run,about,-0.39488,-0.16448,0.5962,0.65815,-0.16846,-0.15133,-1.4758,...,0.72483,-0.29675,0.15276,0.24591,-0.69316,-0.45773,0.25187,-0.72384,0.54023,-0.16045


In [14]:
all_embeddings.describe()

Unnamed: 0,cmp,c1,c2,c1_00,c1_01,c1_02,c1_03,c1_04,c1_05,c1_06,...,cmp_40,cmp_41,cmp_42,cmp_43,cmp_44,cmp_45,cmp_46,cmp_47,cmp_48,cmp_49
count,5902,5902,5902,5902.0,5902.0,5902.0,5902.0,5902.0,5902.0,5902.0,...,5902.0,5902.0,5902.0,5902.0,5902.0,5902.0,5902.0,5902.0,5902.0,5902.0
unique,5900,1392,1782,1387.0,1385.0,1391.0,1387.0,1387.0,1389.0,1383.0,...,5816.0,5820.0,5824.0,5766.0,5815.0,5813.0,5810.0,5826.0,5810.0,5797.0
top,silversword,over,man,0.12972,0.088073,0.24375,0.078102,-0.12783,0.27831,-0.48693,...,-0.17344,0.40601,0.39916,1.1583,-0.26778,0.25847,0.62909,-1.2632,-0.32151,-1.0659
freq,2,96,175,96.0,96.0,96.0,96.0,96.0,96.0,96.0,...,3.0,2.0,2.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0


In [15]:
all_embeddings.to_csv('../data/processed/all_embeddings_forML.csv')

Loading the dataset and extracting arrays from it

In [16]:
test = pd.read_csv('../data/processed/all_embeddings_forML.csv')

In [17]:
test.head()

Unnamed: 0.1,Unnamed: 0,cmp,c1,c2,c1_00,c1_01,c1_02,c1_03,c1_04,c1_05,...,cmp_40,cmp_41,cmp_42,cmp_43,cmp_44,cmp_45,cmp_46,cmp_47,cmp_48,cmp_49
0,0,gadabout,gad,about,-0.28186,0.76122,0.31124,-0.056947,-0.16125,-0.84541,...,0.072761,0.46474,-0.26208,0.24451,0.26817,-0.62835,0.007282,-0.25889,0.4026,0.076465
1,1,knockabout,knock,about,-0.12375,-0.75023,0.60649,-0.20309,0.15405,-0.023966,...,0.26866,0.20105,-0.60197,0.62156,-0.58291,-0.29071,0.41405,0.33043,0.296,0.098863
2,2,turnabout,turn,about,0.45343,-0.22031,0.53704,-0.48735,0.34246,0.046052,...,-0.87763,0.23897,-0.81087,-0.3565,-0.49125,-0.059674,0.67104,1.456,0.005515,0.23835
3,4,walkabout,walk,about,0.41281,0.9111,-0.078414,-0.46515,0.61895,-0.44519,...,-0.35818,-0.05923,-0.90118,0.14399,-0.1174,-0.40766,0.74211,-0.49879,0.10832,0.19781
4,5,runabout,run,about,-0.39488,-0.16448,0.5962,0.65815,-0.16846,-0.15133,...,0.72483,-0.29675,0.15276,0.24591,-0.69316,-0.45773,0.25187,-0.72384,0.54023,-0.16045


In [18]:
x_c1 = test.iloc[ :len(test['c1']), 4:54]
x_c2 = test.iloc[ :len(test['c1']), 54:104]
y_cmp = test.iloc[ :len(test['c1']), 104:154]

In [19]:
testy = np.array( test.iloc[ :len(test['c1']), 4:54] )

In [20]:
test.iloc[ :len(test['c1']), 54:104]

Unnamed: 0,c2_00,c2_01,c2_02,c2_03,c2_04,c2_05,c2_06,c2_07,c2_08,c2_09,...,c2_40,c2_41,c2_42,c2_43,c2_44,c2_45,c2_46,c2_47,c2_48,c2_49
0,0.894660,0.36604,0.37588,-0.41818,0.58462,0.18594,-0.41907,-0.46621,-0.54903,0.02477,...,-0.23460,0.44512,0.53397,0.66654,-0.093662,-0.035203,-0.064194,0.559980,-0.665930,0.12177
1,0.894660,0.36604,0.37588,-0.41818,0.58462,0.18594,-0.41907,-0.46621,-0.54903,0.02477,...,-0.23460,0.44512,0.53397,0.66654,-0.093662,-0.035203,-0.064194,0.559980,-0.665930,0.12177
2,0.894660,0.36604,0.37588,-0.41818,0.58462,0.18594,-0.41907,-0.46621,-0.54903,0.02477,...,-0.23460,0.44512,0.53397,0.66654,-0.093662,-0.035203,-0.064194,0.559980,-0.665930,0.12177
3,0.894660,0.36604,0.37588,-0.41818,0.58462,0.18594,-0.41907,-0.46621,-0.54903,0.02477,...,-0.23460,0.44512,0.53397,0.66654,-0.093662,-0.035203,-0.064194,0.559980,-0.665930,0.12177
4,0.894660,0.36604,0.37588,-0.41818,0.58462,0.18594,-0.41907,-0.46621,-0.54903,0.02477,...,-0.23460,0.44512,0.53397,0.66654,-0.093662,-0.035203,-0.064194,0.559980,-0.665930,0.12177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5897,-1.147500,-0.01226,2.00370,-0.51959,0.39160,-0.44946,-1.55860,-0.38894,0.14249,-0.72019,...,-0.66000,0.17273,0.26157,0.81009,-1.530700,0.609110,0.090890,0.204970,-0.097267,-0.66934
5898,-1.147500,-0.01226,2.00370,-0.51959,0.39160,-0.44946,-1.55860,-0.38894,0.14249,-0.72019,...,-0.66000,0.17273,0.26157,0.81009,-1.530700,0.609110,0.090890,0.204970,-0.097267,-0.66934
5899,-1.147500,-0.01226,2.00370,-0.51959,0.39160,-0.44946,-1.55860,-0.38894,0.14249,-0.72019,...,-0.66000,0.17273,0.26157,0.81009,-1.530700,0.609110,0.090890,0.204970,-0.097267,-0.66934
5900,-1.147500,-0.01226,2.00370,-0.51959,0.39160,-0.44946,-1.55860,-0.38894,0.14249,-0.72019,...,-0.66000,0.17273,0.26157,0.81009,-1.530700,0.609110,0.090890,0.204970,-0.097267,-0.66934


In [21]:
test.iloc[ :len(test['c1']), 104:154]

Unnamed: 0,cmp_00,cmp_01,cmp_02,cmp_03,cmp_04,cmp_05,cmp_06,cmp_07,cmp_08,cmp_09,...,cmp_40,cmp_41,cmp_42,cmp_43,cmp_44,cmp_45,cmp_46,cmp_47,cmp_48,cmp_49
0,-0.95223,0.22216,-0.31866,-0.803280,-0.349400,0.153730,-0.028646,0.60601,0.319070,0.861780,...,0.072761,0.46474,-0.262080,0.244510,0.26817,-0.628350,0.007282,-0.25889,0.402600,0.076465
1,-0.53877,-0.37070,-1.33180,-0.324460,-0.459840,0.984080,0.857930,-0.25580,0.057934,0.310290,...,0.268660,0.20105,-0.601970,0.621560,-0.58291,-0.290710,0.414050,0.33043,0.296000,0.098863
2,0.24069,-0.30180,-1.16750,0.055764,0.006461,0.375450,0.386180,0.65111,-0.173850,1.144900,...,-0.877630,0.23897,-0.810870,-0.356500,-0.49125,-0.059674,0.671040,1.45600,0.005515,0.238350
3,-0.18218,-0.13491,-1.28260,-0.778250,-0.849530,-0.780300,0.106260,0.59223,0.339390,-0.159170,...,-0.358180,-0.05923,-0.901180,0.143990,-0.11740,-0.407660,0.742110,-0.49879,0.108320,0.197810
4,0.33506,-0.64988,0.44627,-0.392030,-0.961660,0.266480,0.245760,-0.46295,-0.388120,0.128180,...,0.724830,-0.29675,0.152760,0.245910,-0.69316,-0.457730,0.251870,-0.72384,0.540230,-0.160450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5897,-0.20939,-0.61757,0.28687,-0.140080,-0.710250,-0.155260,-0.497110,-0.15810,-0.212150,0.165680,...,-0.638450,-0.50486,0.467690,0.061731,0.46063,-0.215850,0.083754,-0.13010,0.068344,-0.868550
5898,0.67891,-0.16578,-0.12632,-0.102680,-0.962670,0.049075,-1.258300,0.90347,-0.681290,-0.921170,...,-0.467160,-1.34320,1.460800,-0.533550,0.19131,0.904780,-1.494800,0.75939,-1.035700,-0.332330
5899,0.23381,-0.35211,0.22571,-0.598700,-0.254510,-0.346220,0.467430,0.25412,0.410810,0.621830,...,-0.380910,-0.79653,0.012824,0.696000,0.21392,-0.674850,0.273400,-0.55502,-0.462630,-0.458570
5900,-0.13562,-1.26780,0.10599,0.328730,-0.748700,0.129260,0.159310,0.92586,0.116330,0.063934,...,-0.695630,-0.57096,0.734690,0.528490,0.09799,-0.370890,-0.534470,-0.39309,-0.273490,-1.353600
