In [1]:
import pandas as pd
import numpy as np

# Importing the raw text to a pandas dataframe

In [3]:
# Read the raw sentences into a single-column frame named `text`.
# The tab separator presumably keeps commas inside sentences from splitting
# them into several fields -- TODO confirm against data/raw.csv.
# header=0 together with `names` replaces the first line of the file with our own column name.
df = pd.read_csv('data/raw.csv', sep='\t', header=0, names=['text'])
print(df.head())
print(df['text'][27])
print(len(df['text']))

                                                text
0            The old car broke down in the car park.
1         At least two men broke in and stole my TV.
2  The horses were broken in and ridden in two we...
3   Kim and Sandy both broke up with their partners.
4  The horse which Kim sometimes rides is more ba...
He tried to ignore what his own common sense told him, but it wasn’t possible; her motives were too blatant.
43


## Some Preprocessing

In [4]:
# Build a lowercased copy of the sentences; `df` itself is left unchanged.
cleaned = df.assign(text=df['text'].str.lower())
cleaned.head()

Unnamed: 0,text
0,the old car broke down in the car park.
1,at least two men broke in and stole my tv.
2,the horses were broken in and ridden in two we...
3,kim and sandy both broke up with their partners.
4,the horse which kim sometimes rides is more ba...


### grabbing the sentences we need

In [5]:
# 1-based numbers of the sentences kept for the parser comparison
# (every sentence from 26 through 43 is included).
sents = [5, 6, 7, 9, 10, 16, 17, 18, 20, 23, *range(26, 44)]
# iloc is positional and 0-based, so shift each sentence number down by one
zero_based_rows = [number - 1 for number in sents]
test_set = cleaned.iloc[zero_based_rows]

#### saving to disk for the parsers which use a command line tool

In [6]:
# Write one sentence per line as plain text for the command-line parsers.
# DataFrame.to_csv applies CSV quoting, so a sentence containing a comma (or a
# double quote) would be wrapped in double quotes and the parsers would read
# those quote characters as extra tokens. Writing the lines directly avoids
# that and keeps the file identical to the text passed to spaCy.
with open('data/test_set.txt', 'w', encoding='utf-8') as f:
    f.writelines(sent + '\n' for sent in test_set.text)

# Stanford PCFG Parser

#### Passing data to the command line tool and piping to a .txt file

In [None]:
# Shell out to the Stanford parser's lexparser.sh with the test sentences as its
# input file, redirecting stdout to output/stanford_out.txt.
# NOTE(review): the redirect needs the output/ directory to exist already.
!stanford-parser-full-2020-11-17/lexparser.sh data/test_set.txt > output/stanford_out.txt

# SpaCy

In [None]:
#!python -m spacy download en_core_web_md

In [8]:
import spacy
# Load the `en_core_web_md` pipeline by name; the model package must already be
# installed in the kernel's environment.
nlp = spacy.load('en_core_web_md')

#### writing to .txt file in a format comparable with Stanford output

In [9]:
# Write spaCy's dependency parse of every test sentence as
# `relation(head, dependent)` lines, with a blank line after each sentence.
# NOTE: token positions are not written, so a word that occurs twice in a
# sentence cannot be told apart in the output, and the root token is printed
# with itself as its head.
# The explicit UTF-8 encoding keeps non-ASCII characters (e.g. ’) intact on
# every platform instead of depending on the locale's default encoding.
with open('output/spacy.txt', 'w', encoding='utf-8') as f:
    for sent in test_set.text:
        doc = nlp(sent)
        for token in doc:
            # token.head is a Token; use its .text explicitly rather than relying on str()
            f.write('{}({}, {})\n'.format(token.dep_, token.head.text, token.text.lower()))
        f.write('\n')

In [None]:
# this is for the format needed by DependAble
# with open('spacy_output_TEST.tsv', 'w') as f:
#     for i, sent in enumerate(content[:3]):
#         doc = nlp(sent.strip())
#         for j, word in enumerate(doc):
#             # make a dict for lookup
#             lookup = {word:j+1 for j, word in enumerate(doc)}
#             f.write(str(j+1)) # num
#             f.write('\t')
#             f.write(word.text) # word
#             f.write('\t')
#             f.write(word.lemma_) # lemma
#             f.write('\t')
#             f.write('_') # _
#             f.write('\t')
#             f.write(word.tag_) # POS
#             f.write('\t')
#             f.write('_') # _
#             f.write('\t')
#             f.write(str(lookup[word.head])) # head but I need num
#             f.write('\t')
#             f.write(word.dep_)
#             f.write('\t')
#             f.write('_')
#             f.write('\n')
#         f.write('\n')

# RASP

#### Passing data to the command line tool and piping to a .txt file

In [10]:
# Feed the test sentences to RASP on stdin and redirect its stdout to output/rasp.txt.
# NOTE(review): the meaning of the `-m` flag is not visible here -- check rasp.sh usage.
!./rasp3os/scripts/rasp.sh -m < data/test_set.txt > output/rasp.txt

# Unbounded Dependencies

#### loading in the unbounded dependency dataset

In [171]:
# Load the dependency evaluation table. header=0 combined with `names` swaps
# the file's own header row for the column names below.
column_names = ['text', 'gr', 'head', 'dep', 'distance', 'dep_type', 'spacy', 'stanford', 'rasp']
dep_df = pd.read_csv('data/dep_data.csv', header=0, names=column_names)

# Readable label for each integer dep_type code.
dep_lookup = {
    0: 'obj from rel clause',
    1: 'obj from reduced rel clause',
    2: 'subj from rel clause',
    3: 'free relative',
    4: 'obj wh-questions',
    5: 'RNR',
    6: 'subj from embed',
}

# Copy of the table with the label attached; an unknown code raises KeyError.
data = dep_df.assign(dep_name=[dep_lookup[code] for code in dep_df['dep_type']])
data.head()

Unnamed: 0,text,gr,head,dep,distance,dep_type,spacy,stanford,rasp,dep_name
0,The horse which Kim sometimes rides is more ba...,dobj,6,2,4,0,1.0,1.0,,obj from rel clause
1,the horse as well as the rabbits which we want...,dobj,10,2,8,0,0.0,0.0,,obj from rel clause
2,the horse as well as the rabbits which we want...,dobj,10,7,3,0,1.0,1.0,,obj from rel clause
3,It was my aunt’s car which we sold at auction ...,dobj,9,6,3,0,1.0,1.0,,obj from rel clause
4,The veterans who I thought that we would meet ...,dobj,5,2,3,0,1.0,1.0,,obj from rel clause


In [172]:
# largest head-to-dependent distance within each dependency type
(
    data
    .groupby("dep_name")["distance"]
    .max()
)

dep_name
RNR                             5
free relative                   5
obj from reduced rel clause     4
obj from rel clause             8
obj wh-questions                7
subj from embed                 7
subj from rel clause           13
Name: distance, dtype: int64

In [175]:
# mean head-to-dependent distance within each dependency type
(
    data
    .groupby("dep_name")["distance"]
    .mean()
)

dep_name
RNR                            2.500000
free relative                  5.000000
obj from reduced rel clause    4.000000
obj from rel clause            4.428571
obj wh-questions               7.000000
subj from embed                6.500000
subj from rel clause           5.666667
Name: distance, dtype: float64

In [174]:
# spaCy accuracy per dependency type: the mean of the `spacy` score column.
# mean() skips NaN, so a type whose rows are all unscored shows up as NaN.
(
    data
    .groupby("dep_name")["spacy"]
    .mean()
)

dep_name
RNR                                 NaN
free relative                  0.000000
obj from reduced rel clause         NaN
obj from rel clause            0.857143
obj wh-questions                    NaN
subj from embed                0.000000
subj from rel clause           0.600000
Name: spacy, dtype: float64

In [176]:
# overall spaCy accuracy: mean of the `spacy` score column (NaN rows are skipped)
spacy_scores = data["spacy"]
spacy_scores.mean()

0.6428571428571429

In [155]:
# Stanford parser accuracy per dependency type: the mean of the `stanford`
# score column, with NaN rows skipped.
(
    data
    .groupby("dep_name")["stanford"]
    .mean()
)

dep_name
free relative           1.000000
obj from rel clause     0.571429
subj from embed         0.000000
subj from rel clause    0.400000
Name: stanford, dtype: float64

In [158]:
# overall Stanford parser accuracy: mean of the `stanford` score column (NaN rows are skipped)
stanford_scores = data["stanford"]
stanford_scores.mean()

0.5

In [156]:
# RASP accuracy per dependency type: the mean of the `rasp` score column.
# A type with no scored rows yields NaN.
(
    data
    .groupby("dep_name")["rasp"]
    .mean()
)

dep_name
free relative          NaN
obj from rel clause    NaN
subj from embed        NaN
subj from rel clause   NaN
Name: rasp, dtype: float64

In [159]:
# overall RASP accuracy: mean of the `rasp` score column (NaN when no row is scored)
rasp_scores = data["rasp"]
rasp_scores.mean()

nan