In [22]:
import pandas as pd
import string
import re
from stemming.porter2 import stem
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

corpus1 = pd.read_csv('../1_1_EPIE/idioms.csv')
corpus2 = pd.read_csv('../1_2_dictionary/idioms.csv')
corpus3 = pd.read_csv('../1_3_bookcorpus/idioms.csv')
print(f"{len(corpus1) + len(corpus2) + len(corpus3)} before deduplication")

2416 before deduplication


[nltk_data] Downloading package stopwords to /Users/chany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
corpus1 = corpus1.rename(columns={
    'idiom': 'idiom_EPIE',
    'meaning': 'meaning_EPIE',
    'example': 'example_EPIE',
    'tag': 'tag_EPIE'
})

corpus2 = corpus2.rename(columns={
    'idiom': 'idiom_dictionary',
    'meaning': 'meaning_dictionary',
    'example': 'example_dictionary',
})

corpus3 = corpus3.rename(columns={
    'idiom': 'idiom_bookcorpus',
    'definition': 'meaning_bookcorpus'
})
corpora = [corpus1, corpus2, corpus3]

In [24]:
pronouns = {
    'I', 'you', 'You', 'he', 'He', 'she', 'She', 'it', 'It', 'we', 'We', 'they', 'They', # subjective
    'me', 'Me', 'him', 'Him', 'her', 'Her', 'us', 'Us', 'them', 'Them', # objective
    'my', 'My', 'your', 'Your', 'his', 'His', 'our', 'Our', 'their', 'Their', # possessive, adj
    'mine', 'Mine', 'yours', 'Yours' 'hers', 'Hers', 'ours', 'Ours', 'theirs', 'Theirs', # possessive, n
    'pron', 'own'
}
determiners = {
    'the', 'The', 'a', 'A', 'an', 'An', 'this', 'This', 'that', 'That', 'these', 'These', 'those', 'Those'
}
auxes = {'is', 'was', 'are', 'were', 'be', 'being', 'been'}
props = {'in', 'at', 'on', 'of', 'by', 'for', 'by', 'to', 'as'}
extras = {'ll', 'if', 'and', 'or'}
stop_ignore = {'between', 'all', 'same', 'but', 'off', 'once', 'have', 'until', 'below', 'through', 'while', 
        'above', 'both', 'up', 'out', 'under', 'against', 'further', 'few', 'most', 'down', 'over', 'just',
}
stop_words = pronouns | determiners | auxes | props | extras | set(stopwords.words('english')) - stop_ignore
def trim_idiom(idiom):
    idiom = idiom.lower()
    idiom = idiom.replace('one\'s ', '')
    idiom = idiom.replace('-', ' ')
    idiom = idiom.replace('\'', ' ')
    idiom = idiom.replace('/', ' ')
    idiom = idiom.translate(str.maketrans('', '', string.punctuation))
    idiom = idiom.split()
    idiom = set([stem(word) for word in idiom])
    idiom -=  stop_words
    return frozenset(idiom)
    

In [25]:
corpus1['idiom_set'] = corpus1['idiom_EPIE'].apply(trim_idiom)
corpus2['idiom_set'] = corpus2['idiom_dictionary'].apply(trim_idiom)
corpus3['idiom_set'] = corpus3['idiom_bookcorpus'].apply(trim_idiom)

In [26]:
corpus = pd.concat([corpus1, corpus2, corpus3])
corpus = corpus.groupby('idiom_set').agg({
    'idiom_EPIE': 'first',
    'meaning_EPIE': 'first',
    'example_EPIE': 'first',
    'tag_EPIE': 'first',
    'idiom_dictionary': 'first',
    'meaning_dictionary': 'first',
    'example_dictionary': 'first',
    'idiom_bookcorpus': 'first',
    'meaning_bookcorpus': 'first'
}).reset_index()

In [27]:
def postprocess(row):
    row['dupl'] = 0
    if row['idiom_EPIE'] and not row['idiom_dictionary'] and not row['idiom_bookcorpus']: 
        row['idiom'] = row['idiom_EPIE']
        row['meaning'] = row['meaning_EPIE']
        row['example'] = row['example_EPIE']
        row['source'] = 'EPIE'
    elif not row['idiom_EPIE'] and row['idiom_dictionary'] and not row['idiom_bookcorpus']: 
        row['idiom'] = row['idiom_dictionary']
        row['meaning'] = row['meaning_dictionary']
        row['example'] = row['example_dictionary']
        row['source'] = 'dictionary'
    elif not row['idiom_EPIE'] and not row['idiom_dictionary'] and row['idiom_bookcorpus']: 
        row['idiom'] = row['idiom_bookcorpus']
        row['meaning'] = row['meaning_bookcorpus']
        row['source'] = 'bookcorpus'
    else:
        row['idiom'] = row['idiom_dictionary']
        row['meaning'] = row['meaning_dictionary']
        row['example'] = row['example_dictionary']
        row['source'] = 'dictionary'
        row['dupl'] = 1
    return row

In [29]:
corpus = corpus.apply(postprocess, axis=1)[['idiom', 'meaning', 'example', 'source', 'dupl',
    'idiom_set', 
    'idiom_EPIE', 'meaning_EPIE', 'example_EPIE', 'tag_EPIE',
    'idiom_dictionary', 'meaning_dictionary', 'example_dictionary',
    'idiom_bookcorpus', 'meaning_bookcorpus',
]]
corpus = corpus[corpus['idiom'].notnull()].reset_index(drop=True)
corpus = corpus[corpus['idiom_set'] != frozenset()].reset_index(drop=True)

In [30]:
corpus[corpus['dupl']==1].to_csv('dupl.csv', index=False)
corpus.to_csv('idioms.csv', index=False)
corpus[corpus['source']=='EPIE'].to_csv('EPIE.csv', index=False)

In [31]:
corpus

Unnamed: 0,idiom,meaning,example,source,dupl,idiom_set,idiom_EPIE,meaning_EPIE,example_EPIE,tag_EPIE,idiom_dictionary,meaning_dictionary,example_dictionary,idiom_bookcorpus,meaning_bookcorpus
0,the bottom line,"ing: in the final analysis, the most important...",The bottom line is that pregnant women's healt...,dictionary,1,"(bottom, line)",[pron] bottom line is,the most important point or conclusion.,"It 's something to do with relating to it , bu...",O O O O O O O O O O O O O O O O O O O O O O B-...,the bottom line,"ing: in the final analysis, the most important...",The bottom line is that pregnant women's healt...,,
1,in the buff,naked,The model created a sensation when she posed f...,dictionary,0,(buff),,,,,in the buff,naked,The model created a sensation when she posed f...,,
2,it's anyone's call,a competition where the outcome is difficult t...,I think this year's election would be anyone's...,dictionary,0,(anycal),,,,,it's anyone's call,a competition where the outcome is difficult t...,I think this year's election would be anyone's...,,
3,knee jerk reaction,an automatic response to something,It was a typical knee jerk reaction. He said n...,dictionary,0,"(knee, reaction, jerk)",,,,,knee jerk reaction,an automatic response to something,It was a typical knee jerk reaction. He said n...,,
4,knock on wood (touch wood),tap knuckle on wood in order to avoid bad luck,I am expecting a promotion and a big pay hike ...,dictionary,0,"(touch, wood, knock)",,,,,knock on wood (touch wood),tap knuckle on wood in order to avoid bad luck,I am expecting a promotion and a big pay hike ...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1962,turn back [pron] clock,go back in time to a previous point in one's l...,"Wished she could turn back the clock to when ,...",EPIE,0,"(clock, back, turn)",turn back [pron] clock,go back in time to a previous point in one's l...,"Wished she could turn back the clock to when ,...",O O O B-IDIOM I-IDIOM I-IDIOM I-IDIOM O O O O ...,,,,,
1963,turn the tables,change your position with respect to someone e...,"After flying back to town, I went to Jenna's p...",dictionary,1,"(tabl, turn)",turn [pron] tables,"reverse a situation, especially to gain an adv...",MELISSA KNIGHT REPORTS FROM THE U.S. ON A FEMA...,O O O O O O O O O O O O O O O O O O O O B-IDIO...,turn the tables,change your position with respect to someone e...,"After flying back to town, I went to Jenna's p...",tables are turned,when the situation has changed giving the adva...
1964,turn a blind eye,deliberately overlook,"As a police officer, you cannot turn a blind e...",dictionary,1,"(turn, blind, eye)",turn [pron] blind eye,ignore something that one should not ignore.,"I 'm , I 'm , quite amazed that , that , the q...",O O O O O O O O O O O O O O O O O O O O O O O ...,turn a blind eye,deliberately overlook,"As a police officer, you cannot turn a blind e...",,
1965,trip [pron] light fantastic,dance joyfully and energetically.,"IT PROBABLY never existed , the England where ...",EPIE,0,"(light, fantast, trip)",trip [pron] light fantastic,dance joyfully and energetically.,"IT PROBABLY never existed , the England where ...",O O O O O O O O O O O O O O O O O O O O B-IDIO...,,,,,
