In [1]:
import numpy as np
import pandas as pd
import os
import re, nltk, gensim, string
import itertools

In [2]:
from nltk.metrics import spearman_correlation
from nltk.corpus import wordnet

In [3]:
data = pd.read_csv('data/ws353.tsv', sep='\t', header=None, names=['word1','word2','similarity'])
data.head(1)

Unnamed: 0,word1,word2,similarity
0,tiger,cat,7.35


# Word Similarity

In [5]:
# compute highest possible similarity from each combination of word senses for
def get_highest_similarity(word1, word2, method='path', brown_ic = nltk.corpus.wordnet_ic.ic("ic-brown.dat")):
    similarities = []
    for x, y in itertools.product(word1, word2):
        try:
            if method == 'path':
                similarity = x.path_similarity(y)
            elif method == 'lcs':
                similarity = x.lch_similarity(y)
            elif method == 'wup':
                similarity = x.wup_similarity(y)
            elif method == 'resnik':
                similarity = x.res_similarity(y, ic=brown_ic)
            elif method == 'jcn':
                if x!=y:
                    similarity = x.jcn_similarity(y, ic=brown_ic)
                else:
                    similarity = None
            elif method == 'lin':
                similarity = x.lin_similarity(y, ic=brown_ic)
        except:
            similarity = None
        
        if similarity:
            similarities += [similarity]
                    
    if len(similarities) > 0:
        return max(similarities)
    else:
        return np.nan

In [6]:
# compute correalation - first remove all instances where computed similarity is null, and then compute spearman correlation
def compute_corr(labels, similarities):
    labels = [x for x,y in zip(labels, similarities) if y==y]
    similarities = [x for x in similarities if x==x]
    corr = [round(spearman_correlation(list(enumerate(labels)), list(enumerate(similarities))),4)]
    return corr

def compute_coverage(similarities):
    return [len([x for x in similarities if x==x])]

In [33]:
correlation = pd.DataFrame(index=['primary','max'])
coverage = pd.DataFrame(index=['primary','max'])

for method in ['path','lcs','wup','resnik','jcn','lin']:
    
    similarities, max_similarities = [], []
    for _, row in data.iterrows():
        word1 = wordnet.synsets(row.word1)
        word2 = wordnet.synsets(row.word2)
        similarities += [get_highest_similarity([word1[0]], [word2[0]], method)] # primary similarity
        max_similarities += [get_highest_similarity(word1, word2, method)] # maximum possible similarity
    
    # correlation with primary word senses similarity
    corr = compute_corr(data['similarity'].values, similarities)
    cov = compute_coverage(similarities)
    
    # correlation with maximium possible similarity
    corr += compute_corr(data['similarity'].values, max_similarities)
    cov += compute_coverage(max_similarities)
    
    correlation[method] = corr
    coverage[method] = cov

In [34]:
correlation

Unnamed: 0,path,lcs,wup,resnik,jcn,lin
primary,0.9955,0.9974,0.9959,0.9974,0.9953,0.9924
max,0.9957,0.9979,0.9963,0.9991,0.9955,0.9954


In [35]:
coverage

Unnamed: 0,path,lcs,wup,resnik,jcn,lin
primary,201,201,201,160,196,160
max,203,203,203,192,201,192


# Data Preprocess

In [4]:
corpus = open('data/news.2007.en.shuffled.deduped', 'r').read().split('\n')
print(corpus[:2])

['¿ Robert J. Spagnoletti, attorney general: $22,903**', '32: A "red phone" direct line.']


In [None]:
count = 0
with open('/content/drive/My Drive/Datasets/NLP/Lang Tech/lab 1/corpus2007.tsv','w') as g:
    with open('/content/drive/My Drive/Datasets/NLP/Lang Tech/lab 1/news.2007.en.shuffled.deduped', 'r') as f:
        for j, line in enumerate(f):
            line = ' '.join(nltk.word_tokenize(re.sub(r'[^\w\d\s]+', '', line)))  # clean and tokenize - remove punctuation and special characters
            g.write(line+'\n')
            count += i+len(line)
print(j, count)

In [None]:
count = 0
with open('/content/drive/My Drive/Datasets/NLP/Lang Tech/lab 1/corpus2018.tsv','w') as g:
    with open('/content/drive/My Drive/Datasets/NLP/Lang Tech/lab 1/news.2018.en.shuffled.deduped', 'r') as f:
        for j, line in enumerate(f):
            line = ' '.join(nltk.word_tokenize(re.sub(r'[^\w\d\s]+', '', line)))  # clean and tokenize - remove punctuation and special characters
            g.write(line+'\n')
            count += i+len(line)
print(j, count)

# PPMI

In [36]:
correlation = pd.DataFrame(index=['correlation', 'coverage'])

In [39]:
ppmi2007 = pd.read_csv('data/ppmi2007.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['corpus2007'] = compute_corr(data['similarity'].values, ppmi2007['similarity'].values)+[ppmi2007.shape[0]]

ppmi2018 = pd.read_csv('data/ppmi2018.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['corpus2018'] = compute_corr(data['similarity'].values, ppmi2018['similarity'].values)+[ppmi2018.shape[0]]

ppmi2007_alpha = pd.read_csv('data/ppmi2007_alpha.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['corpus2007_alpha'] = compute_corr(data['similarity'].values, ppmi2007_alpha['similarity'].values)+[ppmi2007_alpha.shape[0]]

ppmi2018_alpha = pd.read_csv('data/ppmi2018_alpha.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['corpus2018_alpha'] = compute_corr(data['similarity'].values, ppmi2018_alpha['similarity'].values)+[ppmi2018_alpha.shape[0]]

correlation

Unnamed: 0,corpus2007,corpus2018,corpus2007_alpha,corpus2018_alpha
correlation,0.9751,0.9767,0.9284,0.9383
coverage,153.0,180.0,153.0,180.0


# Embedding

In [13]:
correlation = pd.DataFrame(index=['correlation', 'coverage'])

In [14]:
word2vec2007 = pd.read_csv('data/word2vec2007.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['word2vec2007'] = compute_corr(data['similarity'].values, word2vec2007['similarity'].values)+[len(word2vec2007)]

word2vec2018 = pd.read_csv('data/word2vec2018.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['word2vec2018'] = compute_corr(data['similarity'].values, word2vec2018['similarity'].values)+[len(word2vec2018)]

fasttext2007 = pd.read_csv('data/fasttext2007.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['fasttext2007'] = compute_corr(data['similarity'].values, fasttext2007['similarity'].values)+[len(fasttext2007)]

fasttext2018 = pd.read_csv('data/fasttext2018.tsv', sep='\t', names=['word1','word2','similarity'])
correlation['fasttext2018'] = compute_corr(data['similarity'].values, fasttext2018['similarity'].values)+[len(fasttext2018)]

correlation

Unnamed: 0,word2vec2007,word2vec2018,fasttext2007,fasttext2018
correlation,0.9958,0.9959,0.9961,0.996
coverage,202.0,203.0,203.0,203.0
