In [13]:
import pandas as pd
import numpy as np
import pylab as pl
from copy import deepcopy
from itertools import count
from sys import stdout

%matplotlib notebook

# Parameters
# `field` selects the dataset: it is spliced into the input file names
# "predictions_<field>.csv" and "<field>AIDPIDVIDANameVName.csv" when the
# data is loaded further down.
field = "Econ"

In [17]:
# Function definitions
########################

def NegLogLikelihood(vidList,simMat):
    '''
    Mean negative log-likelihood of one author's venues.

    vidList is a dataframe with 'AID' and 'VID' columns; only the first 'AID'
    value is read, so all rows are expected to belong to a single author.
    simMat is the dataframe of vid (rows) vs aid (columns)
    and corresponding probabilities.

    Returns the mean of -log(p) over the entries of vidList['VID'], where p is
    taken from that author's column of simMat. A venue listed several times
    contributes once per occurrence. A venue missing from simMat's index
    raises KeyError.
    '''

    # Column of simMat for this author, indexed by venue ID
    probs=simMat[vidList['AID'].iloc[0]]

    # One explicit label-based lookup for all venues: avoids a Python-level
    # loop and the label-vs-position ambiguity of probs[key] for integer keys
    venueProbs=probs.loc[vidList['VID'].values].values

    return np.mean(-np.log(venueProbs))

def VerboseShuffle(inVec):
    '''
    Shuffle inVec in place and report progress on stdout.

    inVec is an ndarray, a list, or a pandas Series (e.g. one column handed
    over by DataFrame.apply). Only the values are permuted; a Series keeps
    its index. Returns None.

    Calls are counted in the global iterator `shuffleCounter`, which is
    created on first use; every 100th call writes "#<n> " to stdout.
    Delete the global to restart the count.
    '''
    global shuffleCounter

    if isinstance(inVec,(np.ndarray,list)):
        np.random.shuffle(inVec)
    else:
        # np.random.shuffle on a Series swaps elements through Series[int],
        # which is label-based for integer indexes and slow; permute the
        # underlying values instead and write them back positionally.
        # NOTE(review): whether this reaches a parent DataFrame depends on
        # the caller passing a view of the column -- confirm for the
        # installed pandas version.
        inVec[:]=np.random.permutation(np.asarray(inVec))

    try:
        n=next(shuffleCounter)
    except NameError:
        # First call since the counter was (re)set: start counting at 1
        shuffleCounter=count(1)
        n=next(shuffleCounter)

    if n%100==0: stdout.write('#'+str(n)+" ")

In [4]:
# Loading in the data
#######################

# Extracting recommender system based scores:
# long table -> AID (rows) x VID (columns); pairs without a score become 0
df=pd.read_csv("predictions_"+field+".csv") # Recommender system based
recdf=df.pivot(index='AID',columns='VID',values='Citations').sort_index().fillna(0)

# Extracting relevance
# NOTE(review): this file name is hard-coded to "econ" instead of being
# derived from `field` like the other two inputs -- confirm this is intended.
reldf=pd.read_csv("distances_econ_filtered.csv").set_index("AID").sort_index().fillna(0)
# Keep only the venues known to the recommender, in the same column order
reldf=reldf[recdf.columns]
reldf.columns.name='VID'

# Transposing so it's easier to normalize later
# (from here on: rows are venues (VID), columns are authors (AID))
recdf=recdf.T
reldf=reldf.T

# Validation data, restricted to authors and venues present in the score tables
realdf=pd.read_csv(field+"AIDPIDVIDANameVName.csv")[['AID','VID']]
realdf=realdf[realdf.AID.isin(reldf.columns)]
realdf=realdf[realdf.VID.isin(reldf.index)]

# Reality checks
# After the transpose the index holds venues and the columns hold authors.
assert set(recdf.index)==set(reldf.index), "score tables cover different sets of venues"
# Index.equals fails cleanly (returns False) when the two tables have a
# different number of authors, where an elementwise == would raise instead.
assert recdf.columns.equals(reldf.columns), "score tables differ in their authors or author order"

In [41]:
# Converting to probabilities
###############################
# Column-wise softmax with temperature: p = exp(score/tau) / sum(exp(score/tau)),
# where the sum runs over the rows of each column.

# Temperature (must be positive)
taurel=.05
taurec=1

# Exponentiating
# The per-column maximum is subtracted first. Softmax is unchanged by a
# constant shift within a column, and the shift keeps every exponent <= 0,
# so np.exp cannot overflow to inf (which would turn the column into NaN
# after normalizing).
nreldf=np.exp((reldf-reldf.max())/taurel)
nrecdf=np.exp((recdf-recdf.max())/taurec)

# Normalizing so that every column sums to 1
nreldf=nreldf/nreldf.sum()
nrecdf=nrecdf/nrecdf.sum()

Generating baseline versions based on randomized probabilities
===================================

In [42]:
# Shuffling
# (this takes a while!)
########################
# Builds the randomized baselines by shuffling every column of a copy of the
# probability tables in place.
# NOTE(review): no random seed is set, so the baseline differs between runs.
# NOTE(review): this relies on DataFrame.apply handing VerboseShuffle a view
# of each column so that the in-place shuffle reaches rand_nreldf /
# rand_nrecdf -- confirm for the installed pandas version.

rand_nreldf=deepcopy(nreldf)
rand_nrecdf=deepcopy(nrecdf)

# Clearing the shuffle counter
# (pop with a default: no error when the counter does not exist, and no bare
# except hiding unrelated failures)
globals().pop('shuffleCounter',None)

stdout.write("Relevance scores. Shuffling author: ")
foo=rand_nreldf.apply(VerboseShuffle)  # return value is not used
print()
globals().pop('shuffleCounter',None)

stdout.write("Recommender System scores. Shuffling author: ")
foo=rand_nrecdf.apply(VerboseShuffle)  # return value is not used
print()
globals().pop('shuffleCounter',None)

Relevance scores. Shuffling author: #100 #200 #300 #400 #500 #600 #700 #800 #900 #1000 
Recommender System scores. Shuffling author: #100 #200 #300 #400 #500 #600 #700 #800 #900 #1000 


In [44]:
realdg=realdf.groupby('AID')

def _MeanNegLogLikelihood(simMat):
    '''
    Average over authors of NegLogLikelihood(author's rows, simMat).

    The groups are iterated explicitly instead of using groupby.apply, so
    each group is guaranteed to still carry its 'AID' column (which
    NegLogLikelihood reads) regardless of how apply treats grouping columns.
    '''
    perAuthor=pd.Series([NegLogLikelihood(grp,simMat) for _,grp in realdg])
    return perAuthor.mean()

# Comparing cases
# (lower values mean the observed author/venue pairs were given higher probability)
print("Recommender system:")
print("===================")
print("Actual: "+str(_MeanNegLogLikelihood(nrecdf)))
print("Randomized: "+str(_MeanNegLogLikelihood(rand_nrecdf)))

print()
print("Relevance:")
print("==========")
print("Actual: "+str(_MeanNegLogLikelihood(nreldf)))
print("Randomized: "+str(_MeanNegLogLikelihood(rand_nreldf)))

Recommender system:
Actual: 8.24574018781
Randomized: 6.73416902266

Relevance:
Actual: 7.47520453529
Randomized: 6.78484530143
