In [14]:
import pandas as pd
import string
import re
from stemming.porter2 import stem
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word list is available locally before it is used.
nltk.download('stopwords')

# Read the two idiom sources: corpus2 from the dictionary folder, corpus3 from
# the pfbt folder. Both paths are relative to the notebook's working directory.
corpus2, corpus3 = (
    pd.read_csv(csv_path)
    for csv_path in ('../1_dict/idioms.csv', '../2_pfbt/idioms.csv')
)

# Report the raw sizes so the effect of de-duplication can be judged later.
print(f"{len(corpus2)} + {len(corpus3)} before deduplication")

1408 + 291 before deduplication


[nltk_data] Downloading package stopwords to /Users/chany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Give every source its own column suffix so the two frames can be stacked and
# merged later without their columns colliding.
dict_column_names = {
    'idiom': 'idiom_dict',
    'meaning': 'meaning_dict',
    'example': 'example_dict',
}
pfbt_column_names = {
    'idiom': 'idiom_pfbt',
    'definition': 'meaning_pfbt',
}

corpus2 = corpus2.rename(columns=dict_column_names)
corpus3 = corpus3.rename(columns=pfbt_column_names)
corpora = [corpus2, corpus3]

In [16]:
# Hand-curated function words that should not count when two idioms are
# compared. trim_idiom lower-cases its input before filtering, so only the
# lower-case spellings can match there; the capitalised spellings are kept for
# any other consumer of these sets.
pronouns = {
    'I', 'you', 'You', 'he', 'He', 'she', 'She', 'it', 'It', 'we', 'We', 'they', 'They', # subjective
    'me', 'Me', 'him', 'Him', 'her', 'Her', 'us', 'Us', 'them', 'Them', # objective
    'my', 'My', 'your', 'Your', 'his', 'His', 'our', 'Our', 'their', 'Their', # possessive, adj
    # Every entry needs its own comma: adjacent string literals are silently
    # concatenated by Python ('Yours' 'hers' would become 'Yourshers').
    'mine', 'Mine', 'yours', 'Yours', 'hers', 'Hers', 'ours', 'Ours', 'theirs', 'Theirs', # possessive, n
    'pron', 'own'
}
determiners = {
    'the', 'The', 'a', 'A', 'an', 'An', 'this', 'This', 'that', 'That', 'these', 'These', 'those', 'Those'
}
auxes = {'is', 'was', 'are', 'were', 'be', 'being', 'been'}
props = {'in', 'at', 'on', 'of', 'by', 'for', 'to', 'as'}
extras = {'ll', 'if', 'and', 'or'}

# NLTK stop words that are deliberately kept because they carry meaning inside
# idioms (particles such as 'up', 'out', 'off', 'down', ...).
stop_ignore = {'between', 'all', 'same', 'but', 'off', 'once', 'have', 'until', 'below', 'through', 'while',
        'above', 'both', 'up', 'out', 'under', 'against', 'further', 'few', 'most', 'down', 'over', 'just',
}

# Union of every stop-word source minus the exceptions. The parentheses matter:
# '-' binds tighter than '|', so without them stop_ignore would only be removed
# from the NLTK list rather than from the whole union.
stop_words = (
    pronouns | determiners | auxes | props | extras | set(stopwords.words('english'))
) - stop_ignore
def trim_idiom(idiom):
    """Reduce an idiom to an order-independent key used for de-duplication.

    The idiom is lower-cased, the placeholder possessive "one's" is dropped,
    hyphens, apostrophes and slashes are turned into word breaks, remaining
    punctuation is stripped, stop words are removed and the surviving words
    are stemmed.

    Args:
        idiom: The idiom text as a string.

    Returns:
        A frozenset of stemmed content words. It is empty when the idiom
        consists only of stop words.
    """
    text = idiom.lower()

    # Remove "one's" only when it is a word of its own. A plain substring
    # replace would also hit "someone's " / "anyone's " and glue the remainder
    # onto the next word ("pull someone's leg" -> "pull someleg").
    text = re.sub(r"\bone's ", '', text)

    # Treat these characters as word separators rather than deleting them, so
    # that "well-known" or "banker's" split into separate tokens.
    for separator in ('-', '\'', '/'):
        text = text.replace(separator, ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Filter stop words on the surface form first: the stemmer rewrites some
    # of them (e.g. "very" -> "veri"), after which they would no longer match
    # the stop list.
    content_words = [word for word in text.split() if word not in stop_words]
    stems = {stem(word) for word in content_words}

    # Filter again after stemming, because an inflected form can stem to a
    # stop word (e.g. "being" -> "be").
    return frozenset(stems - stop_words)

In [17]:
# Attach the normalised de-duplication key to each source frame.
corpus2['idiom_set'] = corpus2['idiom_dict'].map(trim_idiom)
corpus3['idiom_set'] = corpus3['idiom_pfbt'].map(trim_idiom)

# Strip the literal text "ing: " from the dictionary meanings.
# NOTE(review): this removes every occurrence, not only a leading one, so a
# meaning containing e.g. "following: " is altered too - confirm that only a
# prefix was intended.
corpus2['meaning_dict'] = corpus2['meaning_dict'].str.replace("ing: ", "")

In [18]:
# Stack both sources and collapse rows that share the same idiom_set key.
# Each source-specific column takes the 'first' aggregate of its group, so a
# merged row carries the dictionary fields and the pfbt fields side by side.
source_columns = ['idiom_dict', 'meaning_dict', 'example_dict', 'idiom_pfbt', 'meaning_pfbt']
corpus = (
    pd.concat([corpus2, corpus3])
    .groupby('idiom_set')
    .agg(dict.fromkeys(source_columns, 'first'))
    .reset_index()
)

In [19]:
def postprocess(row):
    """Pick the canonical idiom/meaning/example for one merged row.

    Dictionary data is preferred; pfbt data is used only when the row has no
    dictionary idiom.

    Args:
        row: A row of the merged corpus (a pandas Series) holding at least
            'idiom_dict', 'meaning_dict', 'example_dict', 'idiom_pfbt' and
            'meaning_pfbt'. Missing values may be None or NaN.

    Returns:
        A copy of the row with five extra fields:
            'dupl'    - 1 when the idiom is present in both sources (or in
                        neither), otherwise 0
            'idiom', 'meaning', 'example' - the chosen canonical values
            'source'  - 'dict' or 'pfbt'
    """
    # Work on a copy so the frame being iterated by DataFrame.apply is not
    # mutated as a side effect.
    row = row.copy()

    # Presence must be tested with pd.notna: NaN is truthy in Python, so a
    # plain truthiness check treats a missing value stored as NaN as present.
    # The bool() part keeps empty strings counting as absent.
    has_dict = bool(pd.notna(row['idiom_dict'])) and bool(row['idiom_dict'])
    has_pfbt = bool(pd.notna(row['idiom_pfbt'])) and bool(row['idiom_pfbt'])

    row['dupl'] = 0
    if has_pfbt and not has_dict:
        row['idiom'] = row['idiom_pfbt']
        row['meaning'] = row['meaning_pfbt']
        # The pfbt source has no example column; set the field explicitly so
        # every returned row exposes the same set of labels.
        row['example'] = None
        row['source'] = 'pfbt'
    else:
        row['idiom'] = row['idiom_dict']
        row['meaning'] = row['meaning_dict']
        row['example'] = row['example_dict']
        row['source'] = 'dict'
        if has_dict == has_pfbt:
            # Present in both sources (or in neither): flag the row.
            row['dupl'] = 1
    return row

In [20]:
# Annotate every merged row, then fix the column order: the canonical fields
# first, followed by the key and the per-source originals.
output_columns = [
    'idiom', 'meaning', 'example', 'source', 'dupl',
    'idiom_set',
    'idiom_dict', 'meaning_dict', 'example_dict',
    'idiom_pfbt', 'meaning_pfbt',
]
corpus = corpus.apply(postprocess, axis=1)[output_columns]

# Keep only rows that have a canonical idiom and a non-empty key (an empty
# key means the idiom consisted solely of stop words).
has_idiom = corpus['idiom'].notnull()
has_key = corpus['idiom_set'] != frozenset()
corpus = corpus[has_idiom & has_key].reset_index(drop=True)

In [21]:
# Full merged corpus, including the per-source columns.
corpus.to_csv('idioms.csv', index=False)

# Reduced export: only idioms that come with an example sentence.
has_example = corpus['example'].notnull()
with_examples = corpus.loc[has_example, ['idiom', 'meaning', 'example']].reset_index(drop=True)
with_examples.to_csv('idioms_with_instances.csv', index=False)

In [22]:
# Show the final merged corpus as the cell output for a visual sanity check.
corpus

Unnamed: 0,idiom,meaning,example,source,dupl,idiom_set,idiom_dict,meaning_dict,example_dict,idiom_pfbt,meaning_pfbt
0,by all means,in every way possible.,"""Do you mind if I go to the party?"" ""By all me...",dict,0,"(all, mean)",by all means,in every way possible.,"""Do you mind if I go to the party?"" ""By all me...",,
1,banker's hours,short working hours,"With our boss on leave, most of us worked bank...",dict,0,"(hour, banker)",banker's hours,short working hours,"With our boss on leave, most of us worked bank...",,
2,hot potato,any subject which several folks are talking ab...,The issue of gun control is a political hot po...,dict,0,"(hot, potato)",hot potato,any subject which several folks are talking ab...,The issue of gun control is a political hot po...,,
3,penny for your thoughts,a way of asking what someone else is thinking,"""You have been quiet for a while, a penny for ...",dict,0,"(penni, thought)",penny for your thoughts,a way of asking what someone else is thinking,"""You have been quiet for a while, a penny for ...",,
4,add insult to injury,to make a bad situation worse,The company rejected his application for a job...,dict,0,"(injuri, insult, add)",add insult to injury,to make a bad situation worse,The company rejected his application for a job...,,
...,...,...,...,...,...,...,...,...,...,...,...
1579,buck up,to summon the courage to do something,After losing the first place position to Marth...,dict,0,"(up, buck)",buck up,to summon the courage to do something,After losing the first place position to Marth...,,
1580,butter up,to flatter someone so as to get something in r...,Mary's nice compliments about her not so nice ...,dict,0,"(up, butter)",butter up,to flatter someone so as to get something in r...,Mary's nice compliments about her not so nice ...,,
1581,laughter is the best medicine,means that laughter can help cure negative fee...,"After spending several hours in good company, ...",dict,0,"(best, laughter, medicin)",laughter is the best medicine,means that laughter can help cure negative fee...,"After spending several hours in good company, ...",,
1582,from pillar to post,to move from one place to another with no purp...,Failing to get a satisfactory answer from her ...,dict,0,"(post, pillar)",from pillar to post,to move from one place to another with no purp...,Failing to get a satisfactory answer from her ...,,
