**the algorithm:**
#take a big corpus with trite similes (The Daily Mail, http://cs.nyu.edu/~kcho/DMQA/, or a pulp fiction book), prepare clean sentences, traverce throug the corpus sen by sen, POS-tagging each and looking for sentences containing prepositions "like" and "as" (tag = "IN") and exclude "as soon as, as well as, as usual, such as, as of yet, as much, like that, like this.." ((alternatively, use dependency parser to accurately cut out a phrase. but it's a pain and may be an overkill)) Add these sentences to a target corpus. Cut out a simile candidate out of each sentence; optionally: replace "likes" and "ases" with a "comparator". 
#approach 1:
#Use fuzzywazzy to do fuzzy matching of the simile candidates across the corpus. Find those that at least 98% similar and appear multiple times (over 10) across the corpus - those are thrite similes (or common grammatical constuctions containing 'like' or 'as' that we missed during cleaning). Build a corpus of trite similes. With a testing set, repeat all steps up to fuzzywazzy. Then, instead of fuzzy matching candidates across the testinf set, fuzzy match them with the trite similes corpus. Highlight (tag) if a match is found.  
#approach 2: 
#Sort words in each set alphabetically. Then build an n-gram counter (may be plot a histogram) - a dictionary with an n-gram as a key and how many times is appears in the corpus as a value. In a new text, repeat all steps up to the last one and then find new n-grams in the dictionary. If the new n-grams are among the most frequently met n-grams in the corpus, these n-grams constitute trite similes. Then use them as a trite similes corpus and compare to the testing set as described in the approach 1. 


In [33]:
import nltk

min_simile_freq = 5
train_dir_name = '../raw_data/similes_train_tmp/' 
test_dir_name = '../raw_data/similes_test/' 


# from nltk.parse.stanford import StanfordDependencyParser
# path_to_jar = '/Development/Projects/Magnifis/3rd_party/NLU/stanford-corenlp-full-2013/stanford-corenlp-3.2.0.jar'
# path_to_models_jar ='/Development/Projects/Magnifis/3rd_party/NLU/stanford-corenlp-full-2013/stanford-corenlp-3.2.0-models.jar'
# dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

# result = dependency_parser.raw_parse('I shot an elephant in my sleep')
# dep = result.next()
# list(dep.triples())

In [63]:
import os
import io
import codecs
import re


regex_filter = r"(as soon)|(as well)|(as if)|(as \w+ as possible)|(as before)|\
(as long)|(as usual)|(as ever)|(as a result)|\
(such as)|(as of yet)|(as much)|(as many)|\
(like that)|(like this)|(like you)|(like me)|(like him)|(like us)|(like her)|\
(look like)|(looks like)|\
(like everything else)|(like everyone else)|(anybody like)|(anyone like)"



In [64]:
from tqdm import tqdm

def get_raw_text_data(input_dir):  
    fList=os.listdir(input_dir)
    # Create a list of file names with full path, eliminating subdirectory entries
    fList1 = [os.path.join(input_dir, f) for f in fList if os.path.isfile(os.path.join(input_dir, f))] 
    
    #max_files = 1000 #remove to get the entire corpus
    raw_corpus = ''
    for file in tqdm(fList1): #[0:max_files] 
        with codecs.open(file, 'r', 'latin_1') as f: 
                                        # 'utf-8') as f:
        #with open(file, encoding="utf8") as f:
            raw_corpus += ''.join(f.read())  
    corpus = re.sub(r"(\n|\r)+""|(@\w+)+", ' ', raw_corpus) #remove backslashes and words starting with @
    #corpus = re.sub(r"(as soon)+" "|(as well)+" "|(as if)+" "|(as quickly as possible)+" "|(as long)" "|(as usual)+" "|(such as)+" "|(as of yet)+" "|(as much)+" "|(as many)+" "|(like that)+" "|(like this)+" "|(like you)+" "|(like me)+" "|(like him)+" "|(like us)+" "|(like her)+" "|(anybody like)+" "|(anyone like)+", "", corpus)
    return corpus


In [65]:
def tokenize_text(corpus, regex_filter, do_tokenize_words=True):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences 
    if regex_filter:
        raw_sents = [sent for sent in raw_sents if not re.search(regex_filter, sent)]
    if do_tokenize_words:
        result = [nltk.word_tokenize(sent) for sent in raw_sents]
    else: 
        result = raw_sents
    return result

In [66]:
def extract_simile_candidates(sentences):
    comparisons = []
    for i_sent, sent in enumerate(sentences):
        if not 'like' in sent and not 'as' in sent: 
            continue 
        # exlude a single 'as', leaving in only '...as ... as...'
        if not 'like' in sent and len([word for word in sent if word=='as']) == 1: 
            continue
        pos_tagged = nltk.pos_tag(sent)
        for pair in pos_tagged:
            if pair[1] == 'IN' and (pair[0] == 'like' or pair[0] == 'as'):
                comparisons.append((i_sent, pos_tagged))
    return comparisons


In [67]:
def filter_candidates(all_candidates):
    similes_candidates = []
    punkt = set(['.',',','-',':',';','!','?', '"', '\'', ')', '(', '%', '#', '[', ']', '@'])
    key_pos_tags = set(['NN', 'NNS', 'NNP']) #, 'VB', 'VBN', 'VBD', 'VBG']) # noun or verb
    for i_sent, tagged_sent in all_candidates:
        start_index = -1
        words_after = -1
        sent = [pair[0] for pair in tagged_sent]
        pos_tags = [pair[1] for pair in tagged_sent]
        if 'like' in sent:
            start_index = sent.index('like')
            #two_words_before_like = max(0, index_of_like - 4)
            words_after = min(len(sent), start_index + 6)
        elif 'as' in sent:
            start_index = sent.index('as')
            words_after = min(len(sent), start_index + 8)

        if start_index >= 0 and words_after > 0:
            index_of_punkt = 0
            for i in range(start_index, words_after): 
                if sent[i] in punkt: 
                    index_of_punkt = i
                    break 

            if index_of_punkt > start_index: 
                words_after = min(words_after, index_of_punkt)
            if not(not key_pos_tags.intersection(set(pos_tags[start_index:words_after]))): # make sure at least one key pos tag is present
                similes_candidates.append((i_sent, sent[start_index:words_after]))
    return similes_candidates

In [68]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(['a', 'an', 'and', 'or', 'the', \
                  'his', 'her', 'my', 'your', 'their', 'our', \
                  'i', 'you', 'he', 'she', 'it', 'they', 'who', 'that', 'whose', \
                  'is', 'are', 'was', 'will', 'would', \
                  '.',',','-',':',';','!','?', '"', '\'', ')', '(', '%', '#', '[', ']', '@'])

def preprocess_words(wordlist): 
    wordset = set([])
    for word in wordlist: 
        word = word.lower()
        if word not in stop_words and len(word) > 1: 
            if word != 'as':
                word = lemmatizer.lemmatize(word)
            if word == 'like' or word == 'as': 
                word = '$cmpr'
            wordset.add(word)
    return wordset 
        
''' Precomputes a corpus (phrase search index) for a given list of phrases
    Optimization: create a data structure to speed up fuzzy matching as follows: 
    {'word' : [i, j, k, ...]}, where i, j, k are the row indices of all phrases containing 'word'. 
    For each new search query, we prefetch the relevant rows based in the words in that query, 
    prior to fuzzy matching. 
'''
def init_corpus_2match(wordlists): 
    lookup = {}
    all_wordsets = []
    for i_sent, words in wordlists: # for each phrase (word list)
        if not words:
            continue
        wordset = preprocess_words(words)
        if not(not wordset):
            i_row = len(all_wordsets)   
            all_wordsets.append(wordset)
            
            # update loookup index (dictionary of word to corpus row id)
            for word in wordset: 
                if word not in lookup: 
                    lookup[word] = [i_row]
                else: 
                    lookup[word].append(i_row)
    return (all_wordsets, lookup) 


''' Returns a list of matches for 'phrase' in 'wordsets' with 'min_similarity' 
'''
def fuzzy_match(words_in, search_index, min_similarity): 
    # init 
    phraset = preprocess_words(words_in)
    relevant_corpus_rows = search_index
    
    # prepare relevant subset of search index
    # the data could be in 2 different representations
    if isinstance(search_index, tuple): 
        corpus = search_index[0]
        lookup = search_index[1]

        # prefetch relevant corpus rows 
        relevant_corpus_row_ids = set([])
        for word in phraset: 
            if word not in lookup or word == '$cmpr':
                continue
            row_ids = lookup[word]
            for i in row_ids:
                relevant_corpus_row_ids.add(i)    
        relevant_corpus_rows = [corpus[i] for i in relevant_corpus_row_ids]  
    
    # todo: remove
    # print("input phrase: {}".format(phraset))
    # print("relevant phrase ids: {}".format(relevant_corpus_row_ids))
    
    
    # actually search
    nb_input = len(phraset)
    matches = []
    for wordset in relevant_corpus_rows: 
        intersect = phraset.intersection(wordset)
        n = len(intersect)
        if n/min(nb_input, len(wordset)) >= min_similarity and not(n < 2 and next(iter(intersect))=='$cmpr'): 
            #print(wordset)
            matches.append(wordset)
    return matches

In [69]:
import operator

def train_similes_corpus(candidates):
    corpus_2match = init_corpus_2match(candidates)
    covered = set([])
    count_dict = {}
    for cand in candidates:
        if not cand: 
            continue
        phrase = ' '.join(cand)
        if phrase in covered:
            continue
        covered.add(phrase)
        result = fuzzy_match(cand, corpus_2match, 0.75)
        #print("result is {}".format(result))
        if result:
            count_dict[phrase] = len(result)
    
    sorted_counts = sorted(count_dict.items(), key=operator.itemgetter(1))
    sorted_counts.reverse()
    return count_dict, sorted_counts

In [70]:
from tqdm import tqdm

def aggregate_similes_candidates(input_dir):  
    fList=os.listdir(input_dir)
    # Create a list of file names with full path, eliminating subdirectory entries
    fList1 = [os.path.join(input_dir, f) for f in fList if os.path.isfile(os.path.join(input_dir, f))] 
    
    #max_files = 1000 #remove to get the entire corpus
    all_candidates = []
    for i in tqdm(range(len(fList1))): #[0:max_files] 
        file = fList1[i]
        with codecs.open(file, 'r', 'latin_1') as f: 
                                        # 'utf-8') as f:
        #with open(file, encoding="utf8") as f:
            raw_text = ''.join(f.read()) 
            text = re.sub(r"(\n|\r)+""|(@\w+)+", ' ', raw_text) #remove backslashes and words starting with @
            sentences = tokenize_text(text, regex_filter)
            similes_candidates = extract_simile_candidates(sentences)
            similes_candidates = filter_candidates(similes_candidates)
            all_candidates.extend(similes_candidates)
    return all_candidates

## Extract simile candidates from raw text  

In [71]:
from sklearn.externals import joblib

def train(input_dir, min_simile_freq): 
    similes_candidates = aggregate_similes_candidates(input_dir)
    count_dict, sorted_counts = train_similes_corpus(similes_candidates)

    # create actual corpus and save 
    top_similes_corpus = init_corpus_2match([(-1, item[0].split(' ')) for item in count_dict.items() if item[1] >= min_simile_freq])
    # save 
    joblib.dump(top_similes_corpus, "top_similes_corpus.v2.pkl")
    return similes_candidates, sorted_counts


## Train 

In [11]:

similes_candidates, sorted_counts = train(train_dir_name, min_simile_freq)
sorted_counts

100%|██████████| 6/6 [00:21<00:00,  5.36s/it]


input phrase: {'beneath', 'star', '$cmpr', 'mantle', 'blue'}
relevant phrase ids: {0, 325, 326, 136, 233, 138, 1000, 115, 116, 988}
input phrase: {'ghost', 'murdered', 'men', 'silently', '$cmpr', 'of'}
relevant phrase ids: {512, 1, 7, 1037, 1038, 534, 24, 28, 1054, 1055, 32, 43, 44, 49, 561, 571, 1083, 1085, 574, 1086, 576, 65, 578, 579, 580, 581, 582, 583, 584, 73, 74, 1091, 76, 1093, 1099, 1100, 1101, 1103, 82, 83, 1108, 84, 595, 596, 1106, 1110, 90, 91, 602, 603, 1111, 607, 1125, 102, 103, 1126, 105, 619, 620, 621, 110, 622, 1131, 1132, 115, 116, 1156, 1157, 135, 1163, 140, 141, 142, 143, 144, 145, 1167, 1172, 1173, 1174, 664, 153, 1178, 155, 156, 1179, 1180, 1182, 1183, 1185, 168, 680, 170, 681, 682, 1195, 174, 1196, 1199, 1200, 1201, 1202, 1203, 693, 1204, 1205, 696, 1211, 1212, 1213, 190, 191, 1214, 1215, 706, 1216, 1219, 1220, 710, 199, 200, 201, 202, 1221, 204, 205, 1222, 1223, 1224, 1227, 1228, 211, 212, 1229, 726, 1231, 216, 1234, 218, 1235, 220, 1236, 1237, 1238, 1239, 1240,

[('like a man', 47),
 ('like a man who', 47),
 ('as a child', 28),
 ('like a child', 28),
 ('like a child who', 28),
 ('like the others', 16),
 ('as NatÃ¡sha and I used to as children', 15),
 ('like a child in a toyshop', 15),
 ('like children in an embarrassing situation', 15),
 ('like a suffering child and to', 14),
 ('like a child when the doctor', 14),
 ('like children of my own', 13),
 ('like a child taken out for', 13),
 ('like a child and quickly shuffling', 12),
 ('like a shadow', 12),
 ('like a dog', 12),
 ('like an invalid or a child', 12),
 ('like a child he made sport', 12),
 ('like a child at a dancing', 12),
 ('like dogs', 12),
 ('like a child till tears came', 12),
 ('like a stone', 10),
 ('like a son', 10),
 ('like a cat', 10),
 ('like the others she had seen', 10),
 ('as necessary for the movement of the peoples', 10),
 ('like wolves', 10),
 ('like a man and think of', 10),
 ('like the prince', 10),
 ('like the replies of a man', 10),
 ('like a wolf', 10),
 ('as king o

In [12]:
similes_candidates[0:5]

[['like', 'blue', 'mantles', 'beneath', 'the', 'stars'],
 ['as', 'silently', 'as', 'the', 'ghosts', 'of', 'murdered', 'men'],
 ['like', 'a', 'blind', 'dog'],
 ['like', 'a', 'great', 'lazy', 'cat', 'on'],
 ['like', 'rats', 'from', 'their', 'burrows']]

In [13]:
sorted_counts

[('like a man', 47),
 ('like a man who', 47),
 ('as a child', 28),
 ('like a child', 28),
 ('like a child who', 28),
 ('like the others', 16),
 ('as NatÃ¡sha and I used to as children', 15),
 ('like a child in a toyshop', 15),
 ('like children in an embarrassing situation', 15),
 ('like a suffering child and to', 14),
 ('like a child when the doctor', 14),
 ('like children of my own', 13),
 ('like a child taken out for', 13),
 ('like a child and quickly shuffling', 12),
 ('like a shadow', 12),
 ('like a dog', 12),
 ('like an invalid or a child', 12),
 ('like a child he made sport', 12),
 ('like a child at a dancing', 12),
 ('like dogs', 12),
 ('like a child till tears came', 12),
 ('like a stone', 10),
 ('like a son', 10),
 ('like a cat', 10),
 ('like the others she had seen', 10),
 ('as necessary for the movement of the peoples', 10),
 ('like wolves', 10),
 ('like a man and think of', 10),
 ('like the prince', 10),
 ('like the replies of a man', 10),
 ('like a wolf', 10),
 ('as king o

## Test 

In [102]:
def extract_tagged_simile_sents(sentences):
    simile_sents = []
    for sent in sentences: 
        if not re.search("<rule1s>", sent):
            continue
        sent = re.sub(r"(<rule1s>)|(</rule1s>)", "", sent)
        simile_sents.append(nltk.pos_tag(nltk.word_tokenize(sent))) 
    return simile_sents


def eval(sentence_text, similes_corpus, min_simile_freq): 
    sentences = tokenize_text(sentence_text, None, do_tokenize_words=True)
    sentences_orig = tokenize_text(sentence_text, None, do_tokenize_words=False)
        

    similes_candidates = extract_simile_candidates(sentences)
    similes_candidates = filter_candidates(similes_candidates)
    results = []
    for i_sent, cand in similes_candidates:
        is_pred_simile = False
        matches = fuzzy_match(cand, similes_corpus, 0.75)
        nb_matches = len(matches)
        if nb_matches >= min_simile_freq:
            is_pred_simile = True
            
        # LIKE vs. AS
        sub_index = 0
        sub_length = 0
        if 'like' in cand:
            sub_index = sentences_orig[i_sent].find('like')
            sub_length = 4
        elif 'as' in cand:
            sub_index = sentences_orig[i_sent].find('as')
            sub_length = 2
        global_index = sentence_text.find(sentences_orig[i_sent]) 
        results.append((is_pred_simile, # is simile? 
                        i_sent, # sentence index
                        global_index+sub_index, # index of first char of the sentence in the full text 
                        sub_length, # comparison string length
                        sentences_orig[i_sent], cand)) # simile
    return results



# Test last step: (pseudo-)"classification" of simile_candidates
def test(data_dir, similes_corpus, min_simile_freq): 
    raw_corpus = get_raw_text_data(data_dir)
    sentences = tokenize_text(raw_corpus, None, do_tokenize_words=False)
    true_simile_sents = extract_tagged_simile_sents(sentences)
    
    nb_true_pos = 0
    false_pos = []
    false_neg = []
    for true_simile_sent in true_simile_sents:
        # sent_words = [pair[0] for pair in tagged_sent]
        is_pred_simile = False
        sent = [pair[0] for pair in true_simile_sent] # remove POS tags 
        nb_matches = 0
        nb_true_pos += 1
        simile_candidates = filter_candidates([true_simile_sent])
        if not (not simile_candidates): 
            simile_candidate = simile_candidates[0]
            matches = fuzzy_match(simile_candidate, similes_corpus, 0.75)
            nb_matches = len(matches)
        
        if nb_matches >= min_simile_freq:
            is_pred_simile = True

        if not is_pred_simile:
            false_neg.append(' '.join(sent))
#         else 
#             print("'{}' is NOT a trite simile".format(cand))
    precision = nb_true_pos / (nb_true_pos + len(false_pos))
    recall = nb_true_pos / (nb_true_pos + len(false_neg))
    print("=== Claddification Report ===")
    print("Precision = {}".format(precision))
    print("Recall = {}".format(recall))
    print("=============================")
    print ("-- False Negatives --")
    for neg in false_neg:
        print(neg)
        


In [94]:
similes_corpus = joblib.load("top_similes_corpus.v2.pkl")
test(test_dir_name, similes_corpus, 1)

KeyboardInterrupt: 

In [103]:
# Misc unit tests 
test_data = "Mysterious Mr. Fogg lives his life like a machine, really. In fact, he looks like a frog. That's the honest truth. He is as green as a frog."
eval(test_data, similes_corpus, 20)

[(True,
  0,
  35,
  4,
  'Mysterious Mr. Fogg lives his life like a machine, really.',
  ['like', 'a', 'machine']),
 (True, 1, 77, 4, 'In fact, he looks like a frog.', ['like', 'a', 'frog']),
 (False,
  3,
  121,
  2,
  'He is as green as a frog.',
  ['as', 'green', 'as', 'a', 'frog'])]

In [85]:
list('like').index( 'i')

1

## Backup code 

In [18]:
# import fuzzywuzzy
# from fuzzywuzzy import fuzz
# from fuzzywuzzy import process


In [19]:
# choices = []
# for each in similes_candidates:
#     choices.append(" ".join(each))


In [20]:
# count_dict = {}

# for string in set(choices):
#     result = process.extract(string, choices, limit=1000) #default limit = 5
#     num_matches = 0
#     for each in result:
#         if each[1] > 98:
#             num_matches +=1
#     count_dict[string] = num_matches


In [21]:
# write 
# from sklearn.externals import joblib
# joblib.dump(count_dict, "count_dict_output.pkl")

In [22]:
# count_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
# count_dict.reverse()


In [23]:
#read 
#count_dict_fromfile = joblib.load("count_dict_output.pkl")