In [1]:
import pandas as pd
import numpy as np
import pylab as pl
from copy import deepcopy

%matplotlib notebook

# Parameters
# `field` names the research field being analysed; it is used further down
# to build input file names (e.g. field + "AIDPIDVIDANameVName.csv").
field = "Econ"

In [151]:
# Function definitions
########################

def NegLogLikelihood(vidList,simMat):
    '''
    Mean negative log-likelihood of one author's observed venues.

    Parameters
    ----------
    vidList : DataFrame with 'AID' and 'VID' columns (one row per observed
        author/venue pair). Only the first 'AID' value is used to pick the
        author, so all rows are expected to belong to the same author.
    simMat : DataFrame indexed by venue ID (rows) with one column per
        author ID, holding the probability of each venue for that author.

    Returns
    -------
    The mean of -log(p) over the rows of vidList, where p is the author's
    probability for the venue in that row. A venue that appears in several
    rows is counted once per row. The result is inf if any looked-up
    probability is 0.

    Raises
    ------
    KeyError if the author has no column in simMat.
    '''
    # Column of venue probabilities for this author
    probs = simMat[vidList['AID'].iloc[0]]

    # Explicit label-based lookup of all venues at once: plain `probs[vid]`
    # can fall back to positional indexing when vid is an integer and the
    # index is not, which would silently pick the wrong venue.
    venueProbs = probs.loc[list(vidList['VID'])].values

    return np.mean(-np.log(venueProbs))

In [142]:
# Loading in the data
#######################

# Extracting recommender system based scores
# The file name is derived from `field` so that every input in this cell
# refers to the same field as the validation data below.
df=pd.read_csv("predictions_"+field+".csv") # Recommender system based
recdf=df.pivot(index='AID',columns='VID',values='Citations').sort_index().fillna(0)

# Extracting relevance
# NOTE(review): the distances file uses the lower-cased field name
# ("distances_econ_filtered.csv" for field "Econ") -- assumed to hold for
# other fields as well; confirm against the data directory.
reldf=pd.read_csv("distances_"+field.lower()+"_filtered.csv").set_index("AID").sort_index().fillna(0)
reldf=reldf[recdf.columns]  # same venues, in the same order, as recdf
reldf.columns.name='VID'

# Transposing so it's easier to normalize later
# (from here on: rows = VID, columns = AID)
recdf=recdf.T
reldf=reldf.T

# Validation data, restricted to authors and venues present in the score tables
realdf=pd.read_csv(field+"AIDPIDVIDANameVName.csv")[['AID','VID']]
realdf=realdf[realdf.AID.isin(reldf.columns)]
realdf=realdf[realdf.VID.isin(reldf.index)]

# Reality checks
# After the transpose the index holds venues and the columns hold authors.
assert set(recdf.index)==set(reldf.index), "score tables cover different venues"
# Compare lengths first: an element-wise == between indexes of different
# length raises ValueError instead of failing the assertion cleanly.
assert len(recdf.columns)==len(reldf.columns) and np.all(recdf.columns==reldf.columns), \
    "score tables cover different authors (or in a different order)"

In [139]:
# Converting to probabilities
###############################
# Column-wise softmax with temperature: p = exp(x/tau) / sum(exp(x/tau)),
# computed separately for every column of reldf / recdf.

# Temperature (must be positive for the max-shift below to prevent overflow)
taurel=.1
taurec=.5

# Exponentiating
# Each column is first shifted by its own maximum so the largest exponent is
# 0. Without the shift, exp(x/tau) overflows to inf for moderately large
# scores and the normalization below turns the whole column into NaN. The
# shift multiplies numerator and denominator by the same factor, so the
# resulting probabilities are mathematically unchanged.
nreldf=np.exp((reldf-reldf.max())/taurel)
nrecdf=np.exp((recdf-recdf.max())/taurec)

# Normalizing (each column sums to 1)
nreldf=nreldf/nreldf.sum()
nrecdf=nrecdf/nrecdf.sum()

In [6]:
# Randomized versions
# (values are permuted independently within every column)
########################

def _shuffle_within_columns(frame, rng):
    '''
    Return a new DataFrame with the same index and columns as `frame`
    whose values are independently permuted within each column.
    `frame` itself is left untouched; `rng` supplies the randomness.
    '''
    # Work on a transposed copy so that every original column is one
    # contiguous ndarray row that can be shuffled in place.
    shuffled = frame.values.T.copy()
    for row in shuffled:
        rng.shuffle(row)
    return pd.DataFrame(shuffled.T, index=frame.index, columns=frame.columns)

# The shuffle is done on plain arrays rather than through
# DataFrame.apply(np.random.shuffle): apply does not support functions that
# mutate their argument, so that form is not guaranteed to change the frame,
# and shuffling a Series element by element is very slow.
# A dedicated seeded generator keeps the randomized baseline reproducible
# without touching the global numpy random state.
rand_rng = np.random.RandomState(0)
rand_nreldf=_shuffle_within_columns(nreldf, rand_rng)
rand_nrecdf=_shuffle_within_columns(nrecdf, rand_rng)

In [170]:
# Baseline case: score every author's observed venues against nreldf with
# NegLogLikelihood, then average the per-author scores.
realdg = realdf.groupby('AID')
(
    realdg
    .apply(lambda authorRows: NegLogLikelihood(authorRows, nreldf))
    .mean()
)

6.1602566701491579