# First Challenge

In [16]:
#Tools
import re
import pandas as pd 
import numpy as np
import sys
from tqdm import tqdm_notebook as tqdmn
from collections import Counter
import glob
from scipy import spatial
import time
import os

#Word processors.
from nltk.corpus import wordnet as wn
import nltk

# Import smtplib for the actual sending function
import smtplib

# Import the email modules we'll need
from email.mime.text import MIMEText

## Wikipedia
My first thought was that a good way to get word associations would be to use Python's Wikipedia package, whose 'search' function returns all the links to other subjects on the page which you input (see search for 'labyrinth' below).

When the first iteration didn't work, I thought that I would try using only links which consist of 1 word. This was better, but as you can see there are still a lot of false positives. I think we can find a better way. We'll leave this method as a possibility, but look at some other methods. I think NLTK will be a good package to give us word associations.

In [2]:
l = wikipedia.search('clay pigeon')
print (l)
newl = []
for i in l:
    if len(i.split(' ')) == 1:              #Only accept one-word responses.
        if i.lower() != searchword.lower(): #Don't accept copies of the search word.
            newl.append(i)
newl
        

['Clay pigeon shooting', 'Clay pigeon', 'Clay Pigeons', 'The Clay Pigeon', 'Clay Pigeon (film)', 'Sporting clays', 'Beretta Silver Pigeon', 'Clay Pigeon Shooting Association', 'Clay pigeon floor procedure', 'Passenger pigeon']


[]

## WordNet
**Defn**: a _hypernym_ is a word with a broad meaning that more specific words fall under; a superordinate. For example, _color_ is a hypernym of _red_ (source: Google dictionary).

I think this one is really good, especially since Python's NLTK package already has a large base of related words. This implementation gets synonyms and hypernyms. If you think of words as having a family tree of specificity (where the leaves are the most specific word), then synonymns are like cousins, nd hypernyms are like parents or grandparents. 

This code works very well for words that have only one well-defined meaning, like *mitten* or *coffee*, but not very good for words that either have many meanings or are abstract, such as *mint* or *blue*.

The worst-case scenario for this code is when a word is not in WordNet's corpus, in which case it can't do anything. One big problem is that WordNet's corpus only includes single words. That means we can't use it for phrases like *electrical device* or *clay pigeon*. In these cases, we will have to do something else (maybe use Wikipedia for this?).

In [3]:
class WordnetAssociations(object):
    """
    This class is designed to tell us words which are similar to a given word.
    Its main function takes a random word as input, and returns up to 5 words 
    which are similar.
    """
    
    def __init__(self, searchword):
        self.sw = searchword
        self.sw_all_syns = wn.synsets(searchword)
        
        if len(self.sw_all_syns) == 0:
            raise ValueError('That word is not in WordNet\'s corpus')
        
        self.sw_best_syn = self.sw_all_syns[0]    #I'm going to assume the best one is first.
        
    def get_synsets(self):
        return wn.synsets(self.sw)
    
    def associated_words(self, SYNS):
        """
        This function returns all synonyms and their hypernyms of the search word
        which was input to the class.
        """
        ASSOC_WORDS = self.sw_all_syns
        for syn in ASSOC_WORDS:
            ASSOC_WORDS.extend(syn.hypernyms())
            
        return list(set(ASSOC_WORDS))
    
    def metric(self, synset2):
        """
        This function tells us how similar (in Wu-Palmer similarity) a given word
        is to the original search word.
        Sometimes, wup_similarity returns None. I don't understand completely why 
        this happens, but basically if the too words are too dissimilar, then they
        won't have any connection between them. In this case, instead of returning 
        'None', this function returns 0.
        
        It looks like Wu-Palmer similarity will work the best, since
        1) path_similarity doesn't differentiate enough between different words,
             e.g., there are a lot of values of 0.66666, 0.5, 0.333333.
        2) lch_similarity doesn't work with words that don't have the same part of speech
        """
        val = wn.wup_similarity(self.sw_best_syn, synset2)
        if not val:
            return 0
        return val
    
    def sort_by_similarity(self, words):
        """
        Input: All of the words which have been indicated to be similar to 
        the search word.
        
        Output:
        Sorts the list, according to HOW similar each of the words are to
        the search word. Then, it returns only the top 5 words.
        
        Note: One or more of those words might be the word itself, so it will
        be taken out in the syns_to_strs function, but that's ok because 4 related words
        is better than having too many, in the case where there are no repeated words.
        """
        
        return sorted(words, key=self.metric)[::-1][:5] 
    
    def syns_to_strs(self, SYN_final, thresh=0.4):
        final = []
        for syn in SYN_final:
            word = syn.name().split('.')[0] #Get rid of the POS tag.
            word = word.split('_')[0] #TODO this might be a bad idea. We're adding the adjective.
            if word != self.sw:
                final.append(word)
        return final
    
    def main(self):
        SYNS = self.get_synsets()
        all_words_extracted = self.associated_words(SYNS)
        final_as_synsets = self.sort_by_similarity(all_words_extracted)
        return self.syns_to_strs(final_as_synsets, -1)

# WordNet should work well for our task. Now let's implement it.

First, I copied the data into a tab-separated .txt file because that allowed Pandas to read it.

In [5]:
df = pd.read_csv('data_copy.txt', delimiter='\t')
vals = df.values
m,n = vals.shape
output = np.empty([m, 3]).astype(str)

In [14]:
for ind in tqdmn(range(len(vals))): #TQDM can only handle simple loops. 
    row = vals[ind]
    classification, tags = row
    class_id = classification.split(' ')[0]
    orig_word = ' '.join(classification.split(' ')[1:])
    searchwords = re.split(r' |, ', classification)[1:]
    first = searchwords[0]
    
    #Now, come to think of it, it'd be better not to initialize a new class every time...
    try:
        W = WordnetAssociations(first) 
        associated_words = W.main()
    except ValueError: #"That word is not in the corpus"
        """
        We're going to use wikipedia.
        This WILL be pretty bad, there must be a better way.
        """
        l = wikipedia.search(first)
        associated_words = []
        for i in l:
            if len(i.split(' ')) == 1: #Only accept one-word responses.
                if i.lower() != first.lower(): #Don't accept copies of the search word.
                    associated_words.append(i)
    output[ ind ] = [class_id, orig_word, ', '.join(associated_words)]




In [3]:
out_df = pd.DataFrame(output)
out_df.to_csv('word_associations2.csv', header=['ID', 'Word', 'Tags'], index=False)

## Word2Vec
This would probably be great, but my laptop overflows with memory trying to read in the embeddings...

In [15]:
#model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# GloVe
https://nlp.stanford.edu/pubs/glove.pdf  
This is similar to word2vec. They have a lot of embeddings, the one I'm using is from Wikipedia and it's their smallest one.

In [6]:
#DO NOT RUN THIS CELL AGAIN
datadict = {}
FILES = glob.glob('glove*.txt')

for file in tqdmn(FILES):
    with open(file) as f:
        for ind, l in tqdmn(enumerate(f)):
            line = l.split(' ')
            word, vec = line[0], np.array(line[1:]).astype(np.float32)
            datadict[word] = vec




In [17]:
#Now we'll put all the vectors in a KDTree
glove_lk = list(datadict.keys())
glove_vals = list(datadict.values())
mykey = glove_lk[1000]
myvec = glove_vals[1000]
print ('Dictionary maintains order:', np.all(myvec == datadict[mykey]))

Dictionary maintains order: True


In [19]:
#NumPy arrays will be an easier datatype generally.
WORDS = np.array(glove_lk)
A = np.array(glove_vals)

In [20]:
#Make the KDTree for vector comparison
start = time.time()
tree = spatial.KDTree(A)
elapsed = time.time() - start
print (elapsed)

5.187044620513916


In [21]:
#Now iterate through all the words, make a CSV
dists = []
start = time.time()

for ind in tqdmn(range(len(vals))): #TQDM can only handle simple loops. 
    row = vals[ind]
    classification, tags = row
    class_id = classification.split(' ')[0]
    orig_word = ' '.join(classification.split(' ')[1:])
    searchwords = re.split(r' |, ', classification)[1:]
    first = searchwords[0]
    
    #Now, come to think of it, it'd be better not to initialize a new class every time...
    try:
        #Glove takes a little longer to run, but the words look better.
        myvec = datadict[first]
        ds, inds = tree.query(myvec, 6)
        dists.append(ds)
        associated_words = list(WORDS[inds][1:])
    except KeyError: #"That word is not in the corpus"
        #If it isn't in our list, we're just going to ignore it for now.
        associated_words = []
        
    output[ ind ] = [class_id, orig_word, ', '.join(associated_words)]
    
out_df = pd.DataFrame(output)
out_df.to_csv('glove_association.csv', header=['ID', 'Word', 'Tags'], index=False)

print (time.time() - start)


27867.049483299255


In [23]:
os.system('git add glove_association.csv')
os.system("git commit -m 'we have finished'")
os.system('git push origin master')

32768