In [None]:
import w2vAida
import analogy_completion
import pandas as pd
import numpy as np
import operator

In [5]:
## CONFIG
# Embedding model to evaluate. Model/vocabulary paths are only configured for
# "s2v" and "aida" in this notebook; no paths are configured for "w2v".
model_version = "s2v"   # one of s2v, aida, or w2v
# Gold standard to evaluate against. "AGS" selects the AGS file; any other
# value selects the SAT file.
gold_set = "SAT" # one of SAT, AGS
# NOTE(review): no cell visible in this notebook reads this flag.
run_completion_experiment = False
# Guards the ranking-experiment cell.
run_ranking_experiment = True
# Used to label the log and result file names ("NoMinority" vs. "Minority").
exclude_minority_vectores = False
# Enables the per-word / per-prototype console output of the vector-filtering cell.
verbose = True

In [6]:
## logging
# Configure the root logger to write a fresh log file for this model/gold-set
# combination. The cell is safe to re-run: file handlers left over from an
# earlier execution are detached and closed first, so messages are not
# written several times and no file handles leak.
import logging
from logging.handlers import RotatingFileHandler
import sys

log_path = "/opt3/home/lofi/github/SimilarityAndAnalogy/analogy/btw17/log/"
log_filename = model_version+"_"+gold_set+"_"+("NoMinority" if exclude_minority_vectores else "Minority")+".log"

# The root logger is used so that plain logging.debug(...) calls end up in the file.
log = logging.getLogger('')
log.setLevel(logging.DEBUG)
# Named log_format (not `format`) so the builtin format() is not shadowed.
log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# Optional: mirror log output to the notebook.
#ch = logging.StreamHandler(sys.stdout)
#ch.setFormatter(log_format)
#log.addHandler(ch)

# Detach file handlers from a previous run of this cell (RotatingFileHandler
# is a FileHandler subclass, so both variants are covered).
for stale_handler in list(log.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        log.removeHandler(stale_handler)
        stale_handler.close()

# Optional: rotating log files instead of overwriting a single file.
#fh = logging.handlers.RotatingFileHandler(log_path+log_filename, maxBytes=(1048576*5), backupCount=7)
fh = logging.FileHandler(log_path+log_filename, mode="w")
fh.setFormatter(log_format)

log.addHandler(fh)

In [7]:
### VERSION on IS69
# Resolve the model file, vocabulary file and word/prototype delimiter for the
# configured model, and the gold-data file for the configured gold set.
if (model_version == "s2v"):
    model_file = "/opt3/home/pratima/thesis_final/models/s2v.model.bin"
    vocab_file = "/opt3/home/lofi/word2vec_models/s2v.vocab"
    delimeter = "|"
elif (model_version == "aida"):
    model_file = "/opt3/home/pratima/thesis_final/models/aida.model.bin"
    vocab_file = "/opt3/home/lofi/word2vec_models/aida.vocab"
    delimeter = ":"
else:
    # Fail here instead of continuing with undefined (or, after a re-run,
    # stale) paths and hitting a confusing error when the model is loaded.
    raise ValueError(
        "no model files configured for model_version={!r}; "
        "expected 's2v' or 'aida'".format(model_version))
logging.debug("version: "+model_version)
# Any gold_set value other than "AGS" selects the SAT file.
if (gold_set == "AGS"):
    golddata_file = "/opt3/home/lofi/github/SimilarityAndAnalogy/analogy/testData/AGS/AGS-V02.txt"
else:
    golddata_file = "/opt3/home/lofi/github/SimilarityAndAnalogy/analogy/testData/SAT/crowdruns/SAT_AGS_format.csv"


In [8]:
### LOCAL VERSION

#model_file = "D:/data/analogy/pratima_w2v_models/aida.model.bin"
#model_file = "D:/data/analogy/pratima_w2v_models/s2v.model.bin"
#vocab_file = "D:/data/analogy/pratima_w2v_models/aida.vocab"
#golddata_file = "../testData/AGS/AGS-V02.txt"
#output_file = "./result/test.txt"

In [9]:
### LOAD GOLD DATA

def loadGoldData_AGS(dataset):
    """Read a space-separated gold-standard file into a list of rows.

    Lines starting with ":" and blank (whitespace-only) lines are skipped.
    Every remaining line is split on single spaces and its first nine fields
    are kept. Elsewhere in this notebook fields 0-3 are used as the four
    analogy words (a1, a2, b1, b2), field 4 as the reference rating and
    field 8 as the difficulty label.

    Parameters
    ----------
    dataset : str
        Path of the gold-data file.

    Returns
    -------
    list of list of str
        One nine-element list per data line, in file order.

    Raises
    ------
    ValueError
        If a data line has fewer than nine space-separated fields.
    """
    n_fields = 9
    simGold = []
    with open(dataset, 'r') as file:
        for line_no, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            # Blank lines must be tested on the stripped text; a check for a
            # leading "\n" can never match once the line has been stripped.
            if not line or line.startswith(":"):
                continue
            splits = line.split(" ")
            if len(splits) < n_fields:
                raise ValueError(
                    "{}: line {} has {} fields, expected at least {}: {!r}".format(
                        dataset, line_no, len(splits), n_fields, line))
            simGold.append(splits[:n_fields])
    return simGold


# Parse the configured gold file. The same parser is used for both gold sets;
# the SAT file name suggests it is stored in the AGS format.
analogydataset = loadGoldData_AGS(golddata_file)

In [None]:
### Load the trained model.
# The delimiter that separates a word from its prototype suffix in the
# vocabulary is model-specific ("|" for s2v, ":" for aida) and is passed
# through to the loader.
model=w2vAida.Word2Vec.load_word2vec_format(fname=model_file, binary=True, fvocab=vocab_file, delimeter=delimeter)

In [None]:
## run word statistics for the current model
# Map every word that occurs in a challenge (first four fields of each row)
# to the list of its vocabulary entries, stored as [vocabulary_key, count].
full_words = {}
for challange in analogydataset:
    for challenge_word in challange[:4]:
        full_words.setdefault(challenge_word, [])

# One pass over the vocabulary: split each multi-prototype key into its main
# word and prototype part, then look the main word up directly instead of
# comparing it against every challenge word.
for key, value in model.vocab.items():
    main_keyword = key.split(delimeter)[0]
    prototypes_of_word = full_words.get(main_keyword)
    if prototypes_of_word is not None:
        prototypes_of_word.append([key, value.count])

In [3]:
## get rid of unwanted vectors
# Build all_words from full_words, optionally dropping "minority" prototypes,
# i.e. those whose count is below minority_ratio times the count of the word's
# most frequent prototype.
#
# - all_words is a new dict with new lists, so full_words stays intact and the
#   cell can be re-run without pruning an already pruned list again.
# - Filtering builds a new list; removing items from a list while iterating
#   over it skips the element following each removal.
# - Words without any prototype in the vocabulary are kept with an empty list
#   instead of crashing max().
# - Pruning only happens when exclude_minority_vectores is set, so that the
#   "Minority"/"NoMinority" label in the log and result file names matches
#   what was actually evaluated.
minority_ratio = 0.2
all_words = {}
for word, prototypes in full_words.items():
    if verbose:
        print(">>>"+word)
        for prototype in prototypes:
            print(prototype)
    if exclude_minority_vectores and prototypes:
        cutoff = minority_ratio * max(prototype[1] for prototype in prototypes)
        all_words[word] = [prototype for prototype in prototypes if prototype[1] >= cutoff]
    else:
        all_words[word] = list(prototypes)



NameError: name 'full_words' is not defined

In [None]:
# Quick sanity check of both similarity entry points on one example.
if model_version == "s2v":
    # s2v vocabulary keys carry an explicit prototype suffix.
    explicit_key_similarity = model.n_similarity(
        ['egg|noun', 'chick|noun'],
        ['larva|noun', 'insect|noun'],
    )
    print(explicit_key_similarity)
plain_word_similarity = model.n_similarity_new(
    ['egg', 'chick'],
    ['larva', 'insect'],
    all_words=all_words,
)
print(plain_word_similarity)

In [None]:
# Column layout of the raw per-pair results.
r_columns = ('a1', 'a2', 'b1', 'b2', 'ags_rating', 'model_rating', 'difficulty')
# Empty frame that receives the rows of the ranking experiment.
raw_result = pd.DataFrame(columns=r_columns)
# Target CSV for the raw results; the tag mirrors the minority-vector setting.
minority_tag = "NoMinority" if exclude_minority_vectores else "Minority"
result_file_name = "{}{}_{}.result.csv".format(log_path, model_version, minority_tag)

In [None]:
### RANKING EXPERIMENT
# Rate every gold-standard pair-of-pairs with the model and store the model
# rating next to the reference rating and the difficulty label.
if run_ranking_experiment:
    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copies the whole frame on
    # every call. Building the frame from scratch also means that re-running
    # this cell no longer duplicates rows, and every row gets its own index
    # label (written as the first CSV column).
    ranking_rows = []
    for value in analogydataset:
        r_similarity = model.n_similarity_new([value[0], value[1]],[value[2],value[3]], all_words=all_words)
        print("{}:{} :: {}:{}  >> {}    reference rating {}".format(value[0], value[1],
                                                                    value[2], value[3], r_similarity, value[4]))
        ranking_rows.append([value[0], value[1], value[2], value[3], value[4], r_similarity, value[8]])

    raw_result = pd.DataFrame(ranking_rows, columns=list(r_columns))
    raw_result.to_csv(result_file_name, encoding='utf-8')
              

In [None]:
# The reference ratings are read from the gold file as strings; convert the
# whole column at once.
raw_result['ags_rating'] = pd.to_numeric(raw_result['ags_rating'])
# Group by the source pair (a1, a2). The keys must be passed as a list: a
# tuple is interpreted as one single key by current pandas versions.
grouped = raw_result.groupby(["a1", "a2"])

In [1]:
# Define thresholds on the reference rating: rows rated >= correct_threshold
# count as definitely correct, rows rated <= incorrect_threshold as definitely
# incorrect; everything in between is neutral.
correct_threshold = 4
incorrect_threshold = 2

In [None]:
from six import string_types
from numpy import ndarray

def n_similarity_version2(model, all_words, one, two):
    """Find the best-scoring prototype combination for the pairs ``one`` and ``two``.

    Each of the four input words is expanded to all of its prototypes listed
    in ``all_words``. ``model.n_similarity`` is evaluated for every
    combination of one prototype per word, and the highest-scoring
    combination is returned.

    Parameters
    ----------
    model
        Object providing ``init_sims()`` and ``n_similarity(ws1, ws2)``.
    all_words : dict
        Maps a word to a list of ``[vocabulary_key, count]`` entries.
    one, two : sequence
        Two words each. An item may be a plain word or a ``(word, weight)``
        tuple; the weight is ignored.

    Returns
    -------
    list or float
        ``[similarity, [key1, key2, key3, key4]]`` for the best combination,
        or ``0.0`` if either side does not consist of exactly two words or
        any word has no prototype in ``all_words``.
    """
    model.init_sims()

    def _word_of(entry):
        # Accept both plain words and (word, weight) tuples.
        return entry[0] if isinstance(entry, (tuple, list)) else entry

    # The comparison is only defined for pair-vs-pair input.
    if len(one) != 2 or len(two) != 2:
        return 0.0

    # One bucket of vocabulary keys per input position. Buckets are filled by
    # exact word lookup; matching keys by substring would misfile words that
    # contain another input word (e.g. "egg" / "eggs").
    buckets = [
        [prototype[0] for prototype in all_words.get(_word_of(entry), [])]
        for entry in list(one) + list(two)
    ]
    if not all(buckets):
        return 0.0
    wone, wtwo, wthree, wfour = buckets

    intermediate = []
    # iterate over the cross product of all prototype variants of the four words
    for x1 in wone:
        for y1 in wtwo:
            for x2 in wthree:
                for y2 in wfour:
                    similarity = model.n_similarity([x1, y1], [x2, y2])
                    words = [x1, y1, x2, y2]
                    intermediate.append([similarity, words])
                    print("{} {}".format(similarity, words))

    # the first entry with the highest similarity wins
    return max(intermediate, key=lambda scored: scored[0])

In [None]:
# Try the function on a single example; prints its return value.
print(n_similarity_version2(model=model,one=['kernel','nut'],two=['thorn','stem'], all_words=all_words))


In [None]:
### more complex statistics: correct >> incorrect
# A challenge (one group of rows sharing a1/a2) counts as solved when every
# "definitely correct" row has a strictly higher model rating than every
# "definitely incorrect" row.

### create result file
result_columns=('name','model','difficulty')
result_file_name=log_path+model_version+"_"+("NoMinority" if exclude_minority_vectores else "Minority")+".result.final.csv"
# NOTE(review): result_file_name is computed here but `result` is never
# written to it in this cell - confirm whether a to_csv call was intended.

# Rows are collected in lists and the frames are built once;
# DataFrame.append was removed in pandas 2.0.
result_rows = []
count = 0
correct = 0
for name, group in grouped:
    correct_in_ags = group[group["ags_rating"] >= correct_threshold]
    incorrect_in_ags = group[group["ags_rating"] <= incorrect_threshold]

    # compare all correct rows to all incorrect rows
    violations = []
    for _, c_row in correct_in_ags.iterrows():
        for _, i_row in incorrect_in_ags.iterrows():
            # an incorrect row rated at least as high as a correct row breaks the challenge
            if i_row["model_rating"] >= c_row["model_rating"]:
                violations.extend([i_row, c_row])
    is_correct = not violations

    ## debug errors to console (once per challenge)
    if not is_correct:
        print(pd.DataFrame(violations, columns=list(r_columns)).to_string())

    count += 1
    if is_correct:
        correct += 1
    difficulty = group.iloc[0]["difficulty"]
    print(difficulty)
    result_rows.append([str(name).replace(",","").replace("(","").replace(")","").replace("'",""),
                        1 if is_correct else 0, difficulty])

result = pd.DataFrame(result_rows, columns=list(result_columns))
result["model"] = result["model"].astype(int)
# Without any challenge there is no meaningful ratio; report NaN instead of
# failing with a ZeroDivisionError.
overal_correctness = float(correct)/count if count else float("nan")
print(overal_correctness)

In [None]:

# Order of the difficulty categories on the x axis.
difficulty_labels=("advanced", "hard", "medium", "easy")

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

%matplotlib inline

## set up styles
sns.set(style="darkgrid")

# One bar per difficulty label, aggregated from the 0/1 "model" column
# (presumably the mean, i.e. the fraction of solved challenges - seaborn's
# default estimator; confirm for the installed version).
ax = sns.barplot(data=result, y="model", x="difficulty", ci=None, palette="deep", order=difficulty_labels)
ax.set_ylim(0,1)
ax.set_ylabel("sucessfully solved challanges [fraction]")
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=30)
## annotate every bar with its height as a percentage

for p in ax.patches:
    # x: the two x coordinates of the bar's bounding box (their mean is the bar centre)
    x=p.get_bbox().get_points()[:,0]
    # y: the upper y coordinate of the bounding box, i.e. the bar height
    y=p.get_bbox().get_points()[1,1]
    ax.annotate('{:1.1f}%'.format(y*100), (x.mean(), y), 
            ha='center', va='bottom')


##
plt.title("Performance of "+model_version+" on "+gold_set)
plt.show()

In [None]:
# Display the per-challenge result table (name, solved flag, difficulty).
result

In [None]:
# NOTE(review): this displays the complete gold dataset; consider showing
# only a slice to keep the notebook output small.
analogydataset