**the algorithm:**
#take a big corpus with trite similes (The Daily Mail, http://cs.nyu.edu/~kcho/DMQA/, or a pulp fiction book), prepare clean sentences, traverce throug the corpus sen by sen, POS-tagging each and looking for sentences containing prepositions "like" and "as" (tag = "IN") and exclude "as soon as, as well as, as usual, such as, as of yet, as much, like that, like this.." ((alternatively, use dependency parser to accurately cut out a phrase. but it's a pain and may be an overkill)) Add these sentences to a target corpus. Cut out a simile candidate out of each sentence; optionally: replace "likes" and "ases" with a "comparator". 
#approach 1:
#Use fuzzywazzy to do fuzzy matching of the simile candidates across the corpus. Find those that at least 98% similar and appear multiple times (over 10) across the corpus - those are thrite similes (or common grammatical constuctions containing 'like' or 'as' that we missed during cleaning). Build a corpus of trite similes. With a testing set, repeat all steps up to fuzzywazzy. Then, instead of fuzzy matching candidates across the testinf set, fuzzy match them with the trite similes corpus. Highlight (tag) if a match is found.  
#approach 2: 
#Sort words in each set alphabetically. Then build an n-gram counter (may be plot a histogram) - a dictionary with an n-gram as a key and how many times is appears in the corpus as a value. In a new text, repeat all steps up to the last one and then find new n-grams in the dictionary. If the new n-grams are among the most frequently met n-grams in the corpus, these n-grams constitute trite similes. Then use them as a trite similes corpus and compare to the testing set as described in the approach 1. 


In [25]:
import nltk

# from nltk.parse.stanford import StanfordDependencyParser
# path_to_jar = '/Development/Projects/Magnifis/3rd_party/NLU/stanford-corenlp-full-2013/stanford-corenlp-3.2.0.jar'
# path_to_models_jar ='/Development/Projects/Magnifis/3rd_party/NLU/stanford-corenlp-full-2013/stanford-corenlp-3.2.0-models.jar'
# dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

# result = dependency_parser.raw_parse('I shot an elephant in my sleep')
# dep = result.next()
# list(dep.triples())

In [55]:
import os
import io
import codecs
import re


regex_filter = r"(as soon)|(as well)|(as if)|(as \w+ as possible)|(as before)|\
(as long)|(as usual)|(as ever)|(as a result)|\
(such as)|(as of yet)|(as much)|(as many)|\
(like that)|(like this)|(like you)|(like me)|(like him)|(like us)|(like her)|\
(like everything else)|(like everyone else)|(anybody like)|(anyone like)"



In [28]:
def get_raw_text_data(input_dir):  
    fList=os.listdir(input_dir)
    # Create a list of file names with full path, eliminating subdirectory entries
    fList1 = [os.path.join(input_dir, f) for f in fList if os.path.isfile(os.path.join(input_dir, f))] 
    
    #max_files = 1000 #remove to get the entire corpus
    raw_corpus = ''
    for file in fList1: #[0:max_files] 
        with codecs.open(file, 'r', 'latin_1') as f: 
                                        # 'utf-8') as f:
        #with open(file, encoding="utf8") as f:
            raw_corpus += ''.join(f.read())  
    corpus = re.sub(r"(\n|\r)+""|(@\w+)+", ' ', raw_corpus) #remove backslashes and words starting with @
    #corpus = re.sub(r"(as soon)+" "|(as well)+" "|(as if)+" "|(as quickly as possible)+" "|(as long)" "|(as usual)+" "|(such as)+" "|(as of yet)+" "|(as much)+" "|(as many)+" "|(like that)+" "|(like this)+" "|(like you)+" "|(like me)+" "|(like him)+" "|(like us)+" "|(like her)+" "|(anybody like)+" "|(anyone like)+", "", corpus)
    return corpus


In [29]:
def tokenize_text(corpus, regex_filter):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences 
    if regex_filter:
        raw_sents = [sent for sent in raw_sents if not re.search(regex_filter, sent)]
    return [nltk.word_tokenize(word) for word in raw_sents]

In [30]:
def extract_simile_candidates(sentences):
    comparisons = []
    for sent in sentences:
        if not 'like' in sent and not 'as' in sent: 
            continue 
        # exlude a single 'as', leaving in only '...as ... as...'
        if not 'like' in sent and len([word for word in sent if word=='as']) == 1: 
            continue
        pos_tagged = nltk.pos_tag(sent)
        for pair in pos_tagged:
            if pair[1] == 'IN' and (pair[0] == 'like' or pair[0] == 'as'):
                comparisons.append(pos_tagged)
    return comparisons


In [31]:
def filter_candidates(all_candidates):
    similes_candidates = []
    punkt = set(['.',',','-',':',';','!','?', '"', '\'', ')', '(', '%', '#', '[', ']', '@'])
    key_pos_tags = set(['NN', 'NNS', 'NNP']) #, 'VB', 'VBN', 'VBD', 'VBG']) # noun or verb
    for tagged_sent in all_candidates:
        start_index = -1
        words_after = -1
        sent = [pair[0] for pair in tagged_sent]
        pos_tags = [pair[1] for pair in tagged_sent]
        if 'like' in sent:
            start_index = sent.index('like')
            #two_words_before_like = max(0, index_of_like - 4)
            words_after = min(len(sent), start_index + 6)
        elif 'as' in sent:
            start_index = sent.index('as')
            words_after = min(len(sent), start_index + 8)

        if start_index >= 0 and words_after > 0:
            index_of_punkt = 0
            for i in range(start_index, words_after): 
                if sent[i] in punkt: 
                    index_of_punkt = i
                    break 

            if index_of_punkt > start_index: 
                words_after = min(words_after, index_of_punkt)
            if not(not key_pos_tags.intersection(set(pos_tags[start_index:words_after]))): # make sure at least one key pos tag is present
                similes_candidates.append(sent[start_index:words_after])
    return similes_candidates

In [42]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(['a', 'an', 'and', 'or', 'the', \
                  'his', 'her', 'my', 'your', 'their', 'our', \
                  'i', 'you', 'he', 'she', 'it', 'they', 'who', 'that', 'whose', \
                  'is', 'are', 'was', 'will', 'would', \
                  '.',',','-',':',';','!','?', '"', '\'', ')', '(', '%', '#', '[', ']', '@'])

def preprocess_words(wordlist): 
    wordset = set([])
    for word in wordlist: 
        word = word.lower()
        if word not in stop_words and len(word) > 1: 
            if word != 'as':
                word = lemmatizer.lemmatize(word)
            if word == 'like' or word == 'as': 
                word = '$cmpr'
            wordset.add(word)
    return wordset 
        

def init_corpus_2match(wordlists): 
    wordsets = []
    for words in wordlists: 
        if not words:
            continue
        wordset = preprocess_words(words)
        if not(not wordset):
            wordsets.append(wordset)
    return wordsets


''' Returns a list of matches for 'phrase' in 'wordsets' with 'min_similarity' 
'''
def fuzzy_match(words_in, corpus, min_similarity): 
    # TODO: must be optimized!!
    phraset = preprocess_words(words_in)
    #print ("Input phraseset is {}".format(phraset))
    nb_input = len(phraset)
    matches = []
    for wordset in corpus: 
        intersect = phraset.intersection(wordset)
        n = len(intersect)
        if n/min(nb_input, len(wordset)) >= min_similarity and not(n < 2 and next(iter(intersect))=='$cmpr'): 
            #print(wordset)
            matches.append(wordset)
    return matches

In [44]:
import operator

def train_similes_corpus(candidates):
    corpus_2match = init_corpus_2match(candidates)
    covered = set([])
    count_dict = {}
    for cand in candidates:
        if not cand: 
            continue
        phrase = ' '.join(cand)
        if phrase in covered:
            continue
        covered.add(phrase)
        result = fuzzy_match(cand, corpus_2match, 0.75)
        #print("result is {}".format(result))
        if result:
            count_dict[phrase] = len(result)
    
    sorted_counts = sorted(count_dict.items(), key=operator.itemgetter(1))
    sorted_counts.reverse()
    return sorted_counts

## Extract simile candidates from raw text  

In [52]:
def raw_text_to_simile_candidates(input_dir):
    raw_corpus = get_raw_text_data(input_dir)
    sentences = tokenize_text(raw_corpus, regex_filter)
    similes_candidates = extract_simile_candidates(sentences)
    return similes_candidates


In [57]:
from sklearn.externals import joblib

def train(input_dir, min_simile_freq): 
    similes_candidates = raw_text_to_simile_candidates(input_dir)
    similes_candidates = filter_candidates(similes_candidates)
    sorted_counts = train_similes_corpus(similes_candidates)

    # create actual corpus and save 
    top_similes_corpus = init_corpus_2match([item[0].split(' ') for item in count_dict.items() if item[1] >= min_simile_freq])
    # save 
    joblib.dump(top_similes_corpus, "top_similes_corpus.v0.pkl")
    return similes_candidates, sorted_counts

## Train 

In [58]:
min_simile_freq = 5
train_dir_name = './data/similes_train/' 

similes_candidates, sorted_counts = train(train_dir_name, min_simile_freq)
sorted_counts

[('like a man who', 67),
 ('like a man', 67),
 ('like the man', 67),
 ('like a child who', 36),
 ('as a child', 36),
 ('like a child', 36),
 ('like wolves', 30),
 ('like a wolf', 30),
 ('like those of a wolf', 29),
 ('like a cat', 27),
 ('like a woman', 23),
 ('like a child in a toyshop', 21),
 ('like those of a python a', 20),
 ('like the others', 20),
 ('like those of a conjuror who', 19),
 ('like the eyes of a wolf', 19),
 ('as NatÃ¡sha and I used to as children', 19),
 ('like children in an embarrassing situation', 19),
 ('like those of hyenas', 19),
 ('like a shadow', 18),
 ('like a suffering child and to', 18),
 ('as a child in many ways', 18),
 ('like those of a trapped wolf', 18),
 ('like a child when the doctor', 18),
 ('like a great lazy cat on', 17),
 ('like children of my own', 17),
 ('like a child taken out for', 17),
 ('like the wind', 17),
 ('like a frightened child', 16),
 ('like a child and quickly shuffling', 16),
 ('like an invalid or a child', 16),
 ('like a child a

In [60]:
similes_candidates[0:5]

[['like', 'blue', 'mantles', 'beneath', 'the', 'stars'],
 ['as', 'silently', 'as', 'the', 'ghosts', 'of', 'murdered', 'men'],
 ['like', 'a', 'blind', 'dog'],
 ['like', 'a', 'great', 'lazy', 'cat', 'on'],
 ['like', 'rats', 'from', 'their', 'burrows']]

## Test 

In [None]:
# Test last step: (pseudo-)"classification" of simile_candidates

def test(data_dir, similes_corpus, min_simile_freq): 
    similes_candidates = raw_text_to_simile_candidates(data_dir)
    similes_candidates = filter_candidates(similes_candidates)
    
    for cand in similes_candidates:
        matches = fuzzy_match(cand, similes_corpus, 0.75)
        if len(matches) >= min_simile_freq
            print("'{}' is a trite simile".format(cand))
        else 
            print("'{}' is NOT a trite simile".format(cand))
    
        


In [None]:

similes_corpus = joblib.load("top_similes_corpus.v0.pkl")
test(similes)

## Backup code 

In [None]:
# import fuzzywuzzy
# from fuzzywuzzy import fuzz
# from fuzzywuzzy import process


In [None]:
# choices = []
# for each in similes_candidates:
#     choices.append(" ".join(each))


In [None]:
# count_dict = {}

# for string in set(choices):
#     result = process.extract(string, choices, limit=1000) #default limit = 5
#     num_matches = 0
#     for each in result:
#         if each[1] > 98:
#             num_matches +=1
#     count_dict[string] = num_matches


In [None]:
# write 
# from sklearn.externals import joblib
# joblib.dump(count_dict, "count_dict_output.pkl")

In [None]:
# count_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
# count_dict.reverse()


In [None]:
#read 
#count_dict_fromfile = joblib.load("count_dict_output.pkl")