# Data evaluation
Split the allnewMF.csv file into multiple groups, one per predicate.
Sort each group by subject so that binary search can be applied.

In [73]:
import pandas as pd
import re
from datetime import datetime
from datetime import timedelta
import numpy as np

# Float NaN, appended below as the placeholder for distances that are not computed.
NaN = float('nan')
# Pattern removed from labels when loading the meta-facts. Alternatives, in order:
#   1. two lowercase letters followed by '/' at the start (presumably a language
#      prefix such as "en/" -- TODO confirm),
#   2. parenthesised text (greedy: from the first '(' to the last ')'),
#   3. runs of whitespace,
#   4. the characters '_', ',' and '"'.
regex = re.compile(r"(^[a-z]{2}/)|(\(.*\))|(\s+)|([_,\"])")

# Loading generated metafacts

In [None]:
# Tab-separated file with the generated meta-facts (alternative input: allnewMFrestr).
filename = '../results/wikID/allnewMFgen.csv' #allnewMFrestr

allnewMF = pd.read_csv(filename, sep='\t')
# Missing cells become '-' so every temporal column can be compared as a string.
allnewMF = allnewMF.replace(np.nan, '-', regex=True)

# NOTE(review): this cleans every column of the frame, so the 'oldsubject' /
# 'oldobject' copies taken below appear to hold already-cleaned text rather
# than the raw labels -- confirm this is intended.
allnewMF = allnewMF.replace(regex, '')
allnewMF['oldsubject'] = allnewMF['subject']
allnewMF['oldobject'] = allnewMF['object']

# regex=True must be explicit: since pandas 2.0 ``Series.str.replace`` defaults
# to literal matching and raises ValueError for a compiled pattern otherwise.
allnewMF['subject'] = allnewMF['subject'].str.replace(regex, "", regex=True)
allnewMF['object'] = allnewMF['object'].str.replace(regex, "", regex=True)

# Drop meta-facts without any temporal information: the three columns
# concatenate to '---' exactly when all of them are the '-' placeholder.
allnewMF['test'] = allnewMF['inDateTime'] + allnewMF['after'] + allnewMF['before']
allnewMF = allnewMF[allnewMF['test'] != '---']

print(len(allnewMF))

# Util functions

In [75]:
def fix(st):
    """Zero-pad the year part of a date string to at least four digits.

    The year is everything before the first '-' that is not at index 0; the
    remainder of the string (including that '-') is kept unchanged. When there
    is no such dash the whole string is treated as the year.

    Raises ValueError if the year part cannot be converted with ``int``.
    """
    dash = st.find('-')
    # dash <= 0 covers both "no dash" (-1) and "leading dash" (0).
    year, tail = (st[:dash], st[dash:]) if dash > 0 else (st, '')
    return '%04d%s'%(int(year), tail)

In [76]:
""" string to date """
def str2date(dt):
    """Convert a date string to a ``datetime``.

    ``dt`` is first normalised with ``fix`` and then searched for, in order of
    preference, a ``Y-M-D``, ``Y-M`` or ``Y`` pattern; the first pattern that
    matches is parsed with ``datetime.strptime``.

    Returns None when ``dt`` is the '-' placeholder (or when nothing matches).
    Raises ValueError when ``fix`` or ``strptime`` rejects the text.
    """
    if dt == '-':
        return None
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    m = re.search(r'(\d{0,4}-\d{0,2}-\d{0,2})|(\d{0,4}-\d{0,2})|(\d{0,4})', fix(dt))
    if m is None:
        # Defensive only: the last alternative can match the empty string.
        print('weird format: ', dt)
        return None

    for group, fmt in ((1, '%Y-%m-%d'), (2, '%Y-%m'), (3, '%Y')):
        text = m.group(group)
        if text is not None:
            return datetime.strptime(text, fmt)
    return None

## Validating

In [77]:
def _interval_midpoint(bounds):
    """Return the midpoint of ``bounds`` (a pair of date strings) as a datetime.

    If one bound parses to None the other bound is returned; if both do,
    the result is None.
    """
    start, end = str2date(bounds[0]), str2date(bounds[1])
    if end is None:
        return start
    if start is None:
        return end
    # (end - start) / 2 lands between the bounds whichever one is earlier.
    return start + (end - start) / 2


def distanceInterval2(validDate, generatedDate, isTimestamp = False):
    """Distance, in years (days / 365), between the midpoints of two intervals.

    ``validDate`` and ``generatedDate`` are ``[start, end]`` pairs of date
    strings. A missing bound on either side falls back to the other bound of
    that pair (previously only the generated pair tolerated a missing start).
    ``isTimestamp`` is unused and kept for interface compatibility.

    Raises TypeError when a pair has no usable bound at all.
    """
    A = _interval_midpoint(validDate)
    B = _interval_midpoint(generatedDate)
    return abs((A - B).days)/365
           

def distanceTimestamp2(validDate, generatedDate):
    """Return the absolute gap between two date strings in years (days / 365)."""
    delta = str2date(validDate) - str2date(generatedDate)
    return abs(delta.days)/365
    
## case 4: timestamp included in an interval
def checkContains(validDate, generatedDate):
    """Tell whether ``generatedDate`` lies in ``[validDate[0], validDate[1]]``.

    Both bounds are inclusive and both comparisons are always evaluated.
    """
    not_before_start = validDate[0] <= generatedDate
    not_after_end = validDate[1] >= generatedDate
    return not_before_start & not_after_end

def distanceContains2(a, b):
    """Distance in years between timestamp ``b`` and the midpoint of interval ``a``.

    ``a`` is a pair of date strings and ``b`` a single date string. When ``b``
    does not fall inside ``a`` the function returns 0.0.
    NOTE(review): 0.0 is also the value of a perfect match; this looks like a
    leftover from the former similarity score -- confirm.
    """
    interval = [str2date(bound) for bound in a]
    point = str2date(b)
    if not checkContains(interval, point):
        return .0
    half_span = abs((interval[0] - interval[1]).days) / 2
    midpoint = interval[0] + timedelta(days=half_span)
    return abs((midpoint - point).days)/365

In [78]:
def cutYM(date):
    """Truncate every string in ``date`` to its first 7 characters (YYYY-MM)."""
    return list(map(lambda text: text[:7], date))

def cutY(date):
    """Truncate every string in ``date`` to its first 4 characters (YYYY)."""
    return list(map(lambda text: text[:4], date))

def distanceInterval(validDate, generatedDate):
    """Interval-to-interval distance at day, month and year precision.

    Both arguments are pairs of date strings. Returns a three-element list:
    the distance on the full strings, on the strings cut to YYYY-MM and on
    the strings cut to YYYY.
    """
    valid = [fix(bound) for bound in validDate]
    generated = [fix(bound) for bound in generatedDate]
    distances = []
    for truncate in (list, cutYM, cutY):
        distances.append(distanceInterval2(truncate(valid), truncate(generated)))
    return distances

def distanceTimestamp(validDate, generatedDate):
    """Timestamp-to-timestamp distance at day, month and year precision.

    Returns a three-element list: the distance on the full strings, on their
    first 7 characters and on their first 4 characters.
    """
    valid = fix(validDate)
    generated = fix(generatedDate)
    # A width of None keeps the whole string.
    return [distanceTimestamp2(valid[:width], generated[:width])
            for width in (None, 7, 4)]

def distanceContains(validDate, generatedDate):
    """Interval-versus-timestamp distance at day, month and year precision.

    ``validDate`` is a pair of date strings and ``generatedDate`` a single
    date string. Returns a three-element list computed on the full strings,
    on their first 7 characters and on their first 4 characters.
    """
    interval = [fix(bound) for bound in validDate]
    point = fix(generatedDate)
    # A width of None keeps the whole string.
    return [distanceContains2([bound[:width] for bound in interval], point[:width])
            for width in (None, 7, 4)]

In [79]:
def addYearsStr(date, years):
    """Shift the year part of a date string by ``years``.

    The string is normalised with ``fix`` first; everything from the first
    non-leading '-' onwards is kept as is. The resulting year is zero-padded
    to at least four digits.
    """
    padded = fix(date)
    cut = padded.find('-')
    if cut > 0:
        year_text, suffix = padded[:cut], padded[cut:]
    else:
        year_text, suffix = padded, ''
    shifted = int(year_text) + years
    return '%04d%s'%(int(shifted), suffix)

In [80]:
def handleStuff(sims, row, rowDF, predicate):
    """Score a generated meta-fact against a reference record with start/end dates.

    Appends ``(distances, Confidence, PCA Confidence, Head Coverage)`` to
    ``sims``. ``distances`` has 15 entries, i.e. five triples (day, month and
    year precision):
      1. primary distance, chosen from which dates are present,
      2. reference start vs. generated inDateTime,
      3. reference end vs. generated inDateTime,
      4. reference start vs. generated after,
      5. reference end vs. generated before.
    A triple that cannot be computed is filled with NaN. This includes the
    primary triple when the reference has neither start nor end, or when no
    generated date is set; that case used to raise UnboundLocalError.

    ``row`` supplies 'start' and 'end'; ``rowDF`` supplies 'inDateTime',
    'after', 'before' and the three confidence columns. '-' marks a missing
    date. ``predicate`` is currently unused.
    """
    inDateTime = rowDF['inDateTime']
    after = rowDF['after']
    before = rowDF['before']
    start = row['start']
    end = row['end']
    has_start = start != '-'
    has_end = end != '-'

    def timestamp_or_missing(reference, generated):
        # Three-precision timestamp distance, or NaNs if either side is missing.
        if reference != '-' and generated != '-':
            return distanceTimestamp(reference, generated)
        return [NaN, NaN, NaN]

    # Pick the generated value to compare: a timestamp (str) or an interval (list).
    if inDateTime != '-':
        generated = inDateTime
    elif after != '-' and before != '-':
        generated = [after, before]
    elif after != '-':
        generated = after
    elif before != '-':
        generated = before
    else:
        generated = None

    if generated is None or not (has_start or has_end):
        sim = [NaN, NaN, NaN]
    elif isinstance(generated, list):
        if has_start and has_end:
            sim = distanceInterval([start, end], generated)
        else:
            sim = distanceContains(generated, start if has_start else end)
    elif has_start and has_end:
        sim = distanceContains([start, end], generated)
    else:
        sim = distanceTimestamp(start if has_start else end, generated)

    sim = sim + timestamp_or_missing(start, inDateTime)
    sim = sim + timestamp_or_missing(end, inDateTime)
    sim = sim + timestamp_or_missing(start, after)
    sim = sim + timestamp_or_missing(end, before)

    sims.append((sim, float(rowDF['Confidence']), float(rowDF['PCA Confidence']), float(rowDF['Head Coverage'])))

In [81]:
def handleStuffTS(sims, row, rowDF):
    """Score a generated meta-fact against a reference record with a single date.

    Appends ``(distances, Confidence, PCA Confidence, Head Coverage)`` to
    ``sims``. ``distances`` has 15 entries, i.e. five triples (day, month and
    year precision):
      1. primary distance, chosen from which generated dates are present,
      2. reference date vs. generated inDateTime,
      3. reference date vs. generated after,
      4. reference date vs. generated before,
      5. always NaN (keeps the same width as ``handleStuff``).
    A triple that cannot be computed is filled with NaN. This includes the
    primary triple when ``row['date']`` is '-' or no generated date is set;
    that case used to raise UnboundLocalError.
    """
    date = row['date']
    inDateTime = rowDF['inDateTime']
    after = rowDF['after']
    before = rowDF['before']

    def timestamp_or_missing(generated):
        # Three-precision timestamp distance, or NaNs if either side is missing.
        if date != '-' and generated != '-':
            return distanceTimestamp(date, generated)
        return [NaN, NaN, NaN]

    if date == '-':
        sim = [NaN, NaN, NaN]
    elif inDateTime != '-':
        sim = distanceTimestamp(date, inDateTime)
    elif after != '-' and before != '-':
        sim = distanceContains([after, before], date)
    elif after != '-':
        sim = distanceTimestamp(date, after)
    elif before != '-':
        sim = distanceTimestamp(date, before)
    else:
        sim = [NaN, NaN, NaN]

    sim = sim + timestamp_or_missing(inDateTime)
    sim = sim + timestamp_or_missing(after)
    sim = sim + timestamp_or_missing(before)
    # Padding triple so the vector has as many entries as handleStuff's.
    sim = sim + [NaN, NaN, NaN]

    sims.append((sim, float(rowDF['Confidence']), float(rowDF['PCA Confidence']), float(rowDF['Head Coverage'])))

In [82]:
def Log(logfile, rowDF, wikidata, similarity):
    """Write one log line describing a validated meta-fact.

    The line holds the rule and its three scores, the subject/object ids, the
    generated fact with its dates, the ``wikidata`` text, the tab-joined
    distances ``similarity[0]`` and finally ``similarity[1]`` and
    ``similarity[2]``.
    """
    columns = ('rule', 'PCA Confidence', 'Head Coverage', 'Confidence',
               'subID', 'objID', 'subject', 'predicate', 'object',
               'inDateTime', 'after', 'before')
    fields = [rowDF[name] for name in columns]
    fields.append(wikidata)
    fields.append('\t'.join(map(str, similarity[0])))
    fields.append(str(similarity[1]))
    fields.append(str(similarity[2]))
    logfile.write('%s\t%s\t%s\t%s\t%s\t%s\t %s %s %s %s %s %s\t%s\t%s\t%s\t%s\n' % tuple(fields))

In [83]:
"""
datetype
0: timestamp
1: time interval
"""
def saveNewInputFile(file, rowDF, datetype):
    """Write the meta-fact in ``rowDF`` to ``file`` as one tab-separated line.

    With ``datetype == 0`` the date is written as a single 'inDateTime' field;
    any other value writes an 'after' / 'before' pair. In both cases the first
    date is ``inDateTime``, or ``after`` when ``inDateTime`` is '-'.
    """
    anchor = rowDF['inDateTime']
    after = rowDF['after']
    before = rowDF['before']
    if not anchor != '-':
        anchor = after
    if datetype == 0:
        dateInfo = 'inDateTime\t%s'%(anchor)
    else:
        dateInfo = 'after\t%s\tbefore\t%s'%(anchor, before)
    fields = (rowDF['oldsubject'], rowDF['predicate'], rowDF['oldobject'], dateInfo)
    file.write('-\t%s\t%s\t%s\t%s\n' % fields)

### TS Predicate

In [85]:
def ts_predicate(predicate, validateFile, subjectLabel, objectLabel):
    """Validate generated meta-facts of ``predicate`` against timestamp references.

    Reads ``../../data_tocompare/<validateFile>.csv`` (tab-separated, with a
    'date' column), groups its rows by ``subjectLabel`` and, for every subject,
    compares the generated meta-facts whose 'subID' equals that subject with
    the reference objects (``objectLabel``). Reference objects listed more
    than once for a subject are treated as ambiguous and reported as no-match.

    Writes ``<predicate>.csv`` (meta-facts with confidence >= 0.7) plus the
    ``-log``, ``-tsts-log``, ``-tsti-log`` and ``-log-nmatch`` files in the
    current directory.

    Returns ``(sims, sims_tsts, sims_tsti)``.

    The last subject group of the reference file is validated as well; it was
    previously dropped because groups were only processed on a subject change.
    """
    df = allnewMF[allnewMF['predicate'] == predicate]
    print('Records to validate', len(df))
    df = df.sort_values('subject')
    validDF = pd.read_csv('../../data_tocompare/%s.csv'%(validateFile), sep='\t')
    #validDF = validDF.replace(regex, '')
    validDF = validDF.sort_values(subjectLabel)
    print('Valid Records', len(validDF))
    cs = 0   # subjects with at least one generated meta-fact
    co = 0   # matched subject-object pairs
    cot = 0  # generated meta-facts examined
    sims = []
    sims_tsti = []
    sims_tsts = []

    # ``with`` closes every output file even if validation raises.
    with open('%s.csv'%(predicate), 'w') as file, \
            open('%s-log.csv'%(predicate), 'w') as logfile, \
            open('%s-tsts-log.csv'%(predicate), 'w') as logfile_tsts, \
            open('%s-tsti-log.csv'%(predicate), 'w') as logfile_tsti, \
            open('%s-log-nmatch.csv'%(predicate), 'w') as logNoMatch:

        def validateSubject(subject, objects):
            # Compare all generated meta-facts of ``subject`` with ``objects``.
            nonlocal cs, co, cot
            partialDF = df[df['subID'] == subject]
            cot += len(partialDF)
            if len(partialDF) == 0:
                return
            cs += 1
            for _, rowDF in partialDF.iterrows():
                mfObj = rowDF['objID']
                if mfObj in objects and not objects[mfObj]['duplicated']:
                    # match
                    co += 1
                    wikidata = 'WD: %s'%(objects[mfObj]['date'])
                    handleStuffTS(sims, objects[mfObj], rowDF)
                    if rowDF['after'] != '-' and rowDF['before'] != '-':
                        Log(logfile_tsti, rowDF, wikidata, sims[-1])
                        sims_tsti.append(sims[-1])
                    else:
                        Log(logfile_tsts, rowDF, wikidata, sims[-1])
                        sims_tsts.append(sims[-1])

                    if sims[-1][1] >= 0.7: #meta-facts whose conf. are >=0.7
                        saveNewInputFile(file, rowDF, 0) # 0: timestamp, 1: time interval
                    Log(logfile, rowDF, wikidata, sims[-1])
                else:
                    # no match
                    if mfObj in objects and objects[mfObj]['duplicated']:
                        duplicatedInfo = "%s\t%s"%(objects[mfObj]['date'], objects[mfObj]['extras'])
                    else:
                        duplicatedInfo = " \t "
                    logNoMatch.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(rowDF['rule'],rowDF['subject'],
                                     rowDF['predicate'],rowDF['object'],
                                     rowDF['subID'],rowDF['objID'],
                                     rowDF['inDateTime'],rowDF['after'],
                                     rowDF['before'], duplicatedInfo))

        started = False
        lastSubject = None
        objects = {}
        for _, row in validDF.iterrows():
            subject = row[subjectLabel]
            if started and lastSubject != subject:
                validateSubject(lastSubject, objects)
                objects = {}
            started = True
            lastSubject = subject

            obj = row[objectLabel]
            if obj in objects:
                # Same object listed again: ambiguous, remember the extra date.
                objects[obj]['duplicated'] = True
                objects[obj].setdefault('extras', []).append(row['date'])
            else:
                objects[obj] = {'date': row['date'], 'duplicated': False}

        # Flush the final group, which no subject change will trigger.
        if started:
            validateSubject(lastSubject, objects)

    print("Total iqual subject %d"%cs)
    print("Total iqual subject-object %d"%co)
    print("Total nomatch subject-object %d"%(cot-co))
    return sims, sims_tsts, sims_tsti

### TI Predicate

In [86]:
def ti_predicate(predicate, validateFile, subjectLabel, objectLabel):
    """Validate generated meta-facts of ``predicate`` against interval references.

    Reads ``../../data_tocompare/<validateFile>.csv`` (comma-separated, with
    'start' and 'end' columns; missing values become '-'), groups its rows by
    ``subjectLabel`` and, for every subject, compares the generated meta-facts
    whose 'subID' equals that subject with the reference objects
    (``objectLabel``). Reference objects listed more than once for a subject
    are treated as ambiguous and reported as no-match.

    Writes ``<predicate>.csv`` (meta-facts with confidence >= 0.7) plus the
    ``-log``, ``-tsts-log``, ``-titi-log``, ``-tsti-log`` and ``-log-nmatch``
    files in the current directory.

    Returns ``(sims, sims_tsts, sims_tsti, sims_titi)``.

    The last subject group of the reference file is validated as well; it was
    previously dropped because groups were only processed on a subject change.
    """
    df = allnewMF[allnewMF['predicate'] == predicate]
    print('Records to validate', len(df))
    df = df.sort_values('subject')
    validDF = pd.read_csv('../../data_tocompare/%s.csv'%(validateFile), sep=',')
    #validDF = validDF.replace(regex, '')
    validDF = validDF.sort_values(subjectLabel)
    validDF = validDF.replace(np.nan, '-', regex=True)
    print('Valid Records', len(validDF))
    cs = 0   # subjects with at least one generated meta-fact
    co = 0   # matched subject-object pairs
    cot = 0  # generated meta-facts examined
    sims = []
    sims_tsts = []
    sims_titi = []
    sims_tsti = []

    # ``with`` closes every output file even if validation raises.
    with open('%s.csv'%(predicate), 'w') as file, \
            open('%s-log.csv'%(predicate), 'w') as logfile, \
            open('%s-tsts-log.csv'%(predicate), 'w') as logfile_tsts, \
            open('%s-titi-log.csv'%(predicate), 'w') as logfile_titi, \
            open('%s-tsti-log.csv'%(predicate), 'w') as logfile_tsti, \
            open('%s-log-nmatch.csv'%(predicate), 'w') as logNoMatch:

        def validateSubject(subject, objects):
            # Compare all generated meta-facts of ``subject`` with ``objects``.
            nonlocal cs, co, cot
            partialDF = df[df['subID'] == subject]
            cot += len(partialDF)
            if len(partialDF) == 0:
                return
            cs += 1
            for _, rowDF in partialDF.iterrows():
                mfObj = rowDF['objID']
                if mfObj in objects and not objects[mfObj]['duplicated']:
                    # match
                    co += 1
                    reference = objects[mfObj]
                    wikidata = 'WD: %s %s'%(reference['start'], reference['end'])
                    handleStuff(sims, reference, rowDF, predicate)
                    generatedInterval = rowDF['after'] != '-' and rowDF['before'] != '-'
                    referenceInterval = reference['start'] != '-' and reference['end'] != '-'
                    if generatedInterval and referenceInterval:
                        Log(logfile_titi, rowDF, wikidata, sims[-1])
                        sims_titi.append(sims[-1])
                    elif generatedInterval or referenceInterval:
                        Log(logfile_tsti, rowDF, wikidata, sims[-1])
                        sims_tsti.append(sims[-1])
                    else:
                        Log(logfile_tsts, rowDF, wikidata, sims[-1])
                        sims_tsts.append(sims[-1])

                    if sims[-1][1] >= 0.7: #meta-facts whose conf. are >=0.7
                        saveNewInputFile(file, rowDF, 1) # 0: timestamp, 1: time interval
                    Log(logfile, rowDF, wikidata, sims[-1])
                else:
                    # no match
                    if mfObj in objects and objects[mfObj]['duplicated']:
                        duplicatedInfo = "%s %s %s"%(objects[mfObj]['start'], objects[mfObj]['end'], objects[mfObj]['extras'])
                    else:
                        duplicatedInfo = "*"
                    logNoMatch.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(rowDF['rule'],rowDF['subject'],
                                     rowDF['predicate'],rowDF['object'],
                                     rowDF['subID'],rowDF['objID'],
                                     rowDF['inDateTime'],rowDF['after'],
                                     rowDF['before'], duplicatedInfo))

        started = False
        lastSubject = None
        objects = {}
        for _, row in validDF.iterrows():
            subject = row[subjectLabel]
            if started and lastSubject != subject:
                validateSubject(lastSubject, objects)
                objects = {}
            started = True
            lastSubject = subject

            obj = row[objectLabel]
            if obj in objects:
                # Same object listed again: ambiguous, remember the extra interval.
                objects[obj]['duplicated'] = True
                objects[obj].setdefault('extras', []).append('%s %s'%(row['start'], row['end']))
            else:
                objects[obj] = {'start': row['start'], 'end': row['end'], 'duplicated': False}

        # Flush the final group, which no subject change will trigger.
        if started:
            validateSubject(lastSubject, objects)

    print("Total iqual subject %d"%cs)
    print("Total iqual subject-object %d"%co)
    print("Total nomatch subject-object %d"%(cot-co))

    return sims, sims_tsts, sims_tsti, sims_titi

## hasChild

In [None]:
# Validate 'hasChild' meta-facts against hasChild.csv, matching on pId / childId.
simsHasChild, simsHasChild_tsts, simsHasChild_tsti = ts_predicate('hasChild', 'hasChild', 'pId', 'childId')

In [88]:
# Number of hasChild matches whose primary (full-precision) distance is at most 1.
c = sum(1 for x in simsHasChild if x[0][0] <= 1)
print(c)

81


## directed

In [None]:
# Validate 'directed' meta-facts against created.csv, matching on personId / workId.
simsDirected, simsDirected_tsts, simsDirected_tsti = ts_predicate('directed', 'created', 'personId', 'workId')

In [None]:
# Number of directed matches whose primary (full-precision) distance is exactly 0.
c = sum(1 for x in simsDirected if x[0][0] == 0)
print(c)

## isMarriedTo

In [None]:
# Validate 'isMarriedTo' meta-facts against isMarriedTo.csv, matching on pId / spId.
simsIsMarriedTo, simsIsMarriedTo_tsts, simsIsMarriedTo_tsti, simsIsMarriedTo_titi = ti_predicate('isMarriedTo', 'isMarriedTo', 'pId', 'spId')

In [None]:
# Number of isMarriedTo timestamp-vs-interval matches whose primary distance is exactly 0.
c = sum(1 for x in simsIsMarriedTo_tsti if x[0][0] == 0)
print(c)

## isAffiliatedTo

In [None]:
# Validate 'isAffiliatedTo' meta-facts against isAffiliatedTo.csv, matching on pId / affId.
simsIsAffiliatedTo, simsIsAffiliatedTo_tsts, simsIsAffiliatedTo_tsti, simsIsAffiliatedTo_titi = ti_predicate('isAffiliatedTo', 'isAffiliatedTo',
                                                                                                             'pId', 'affId')

In [None]:
# Number of isAffiliatedTo matches whose primary (full-precision) distance is exactly 0.
c = sum(1 for x in simsIsAffiliatedTo if x[0][0] == 0)
print(c)

## playsFor

In [None]:
# Validate 'playsFor' meta-facts; note that the reference file is
# isAffiliatedTo.csv (matched on pId / affId), not a playsFor-specific file.
simsPlaysFor, simsPlaysFor_tsts, simsPlaysFor_tsti, simsPlaysFor_titi = ti_predicate('playsFor', 'isAffiliatedTo',
                                                                                     'pId', 'affId')

In [None]:
# Number of playsFor matches whose primary (full-precision) distance is exactly 0.
c = sum(1 for x in simsPlaysFor if x[0][0] == 0)
print(c)

## worksAt

In [None]:
# Validate 'worksAt' meta-facts against worksAt.csv, matching on pId / compId.
simsWorksAt, simsWorksAt_tsts, simsWorksAt_tsti, simsWorksAt_titi = ti_predicate('worksAt', 'worksAt',
                                                                                 'pId', 'compId')

In [None]:
# Number of worksAt matches whose primary (full-precision) distance is exactly 0.
c = sum(1 for x in simsWorksAt if x[0][0] == 0)
print(c)

## Create structure to analyse the data

In [28]:
# Results per predicate, in the order [all, tsts, tsti] for the timestamp
# predicates (hasChild, directed) and [all, tsts, tsti, titi] for the
# interval predicates.
d = {'hasChild':       [simsHasChild,simsHasChild_tsts,simsHasChild_tsti],
     'directed':       [simsDirected,simsDirected_tsts,simsDirected_tsti],
     'isMarriedTo':    [simsIsMarriedTo,simsIsMarriedTo_tsts,simsIsMarriedTo_tsti,simsIsMarriedTo_titi],
     'isAffiliatedTo': [simsIsAffiliatedTo,simsIsAffiliatedTo_tsts,simsIsAffiliatedTo_tsti,simsIsAffiliatedTo_titi],
     'playsFor':       [simsPlaysFor,simsPlaysFor_tsts,simsPlaysFor_tsti,simsPlaysFor_titi],
     'worksAt':        [simsWorksAt,simsWorksAt_tsts,simsWorksAt_tsti,simsWorksAt_titi]}

In [29]:
import gzip as gz
import pickle as pkl

In [30]:
# Persist the per-predicate results; ``with`` closes the archive even if
# pickling fails part-way.
with gz.open('similarities_dbp.pkl.gz','wb') as f:
    pkl.dump(d,f)

In [None]:
# Print, for every predicate, how many matches were scored in total.
for name, groups in d.items():
    print(name, len(groups[0]))

In [71]:
def normalizeByColumn(data):
    """Scale every distance column of ``data`` by that column's maximum.

    ``data`` is a sequence of 4-tuples whose first element is a list of
    distances (all lists of the same length); the other three elements are
    passed through unchanged. NaN entries are ignored when taking the maximum
    and stay NaN in the result.

    Returns a list of 4-tuples with the scaled distance lists. An empty input
    yields an empty list, and a column whose maximum is 0 is left untouched
    instead of being turned into NaN by a 0/0 division.
    """
    data = list(data)
    if not data:
        return []
    a, b, c, d = zip(*data)
    a = np.array(a, dtype=np.float64)

    # NOTE: outlier clipping (mean + 2*std) was previously sketched here and
    # is intentionally not applied.
    col_max = np.nanmax(a, axis=0)
    # Dividing by 1 keeps an all-zero column at 0 rather than producing NaN.
    col_max = np.where(col_max == 0, 1.0, col_max)
    a = a / col_max
    return list(zip(a.tolist(), b, c, d))

In [None]:
similarity = []
similarity_tsts = []
similarity_tsti = []
similarity_titi = []

# d[key] is [all, tsts, tsti] or [all, tsts, tsti, titi]; zip stops at the
# shorter sequence, so timestamp predicates simply contribute no titi rows.
# Every group is skipped when empty (previously only tsti/titi were guarded,
# so an empty 'all' or 'tsts' list crashed the normalisation).
targets = (similarity, similarity_tsts, similarity_tsti, similarity_titi)
for key in d:
    for target, group in zip(targets, d[key]):
        if len(group) > 0:
            target.extend(normalizeByColumn(group))

print(len(similarity))
print(len(similarity_tsts))
print(len(similarity_tsti))
print(len(similarity_titi))

In [None]:
# Persist the normalised, pooled results; ``with`` closes the archive even if
# pickling fails part-way.
with gz.open('totalSimilarities_dbp.pkl.gz','wb') as f:
    pkl.dump((similarity, similarity_tsts, similarity_tsti, similarity_titi),f)