In [1]:
import pandas as pd
from tqdm import tqdm
from collections import Counter
import numpy as np
import pickle as pkl
import random
from dateutil.parser import parse as parseDate

In [2]:
df = pd.read_csv('./data/attendShort.csv')

d2LevelNames = pkl.load(open('./data/d2LevelNames.pkl', 'rb'))
d3LevelNames = pkl.load(open('./data/d3LevelNames.pkl', 'rb'))

d2LevelNamesReverse = {v: k for k, v in d2LevelNames.items()}
d3LevelNamesReverse = {v: k for k, v in d3LevelNames.items()}

In [3]:
notNullUsers = df.groupby('userId').agg({'date': 'count'}).reset_index()
notNullUsers = set(notNullUsers[notNullUsers['date'] > 10]['userId'])
dfNN = df[df['userId'].isin(notNullUsers)]

In [4]:
d3LevelCounts = {}

for k in tqdm(dfNN['d3LevelId'].unique()):
    userIds = dfNN[dfNN['d3LevelId'] == k]['userId'].unique()
    
    dfNNItem = dfNN[(dfNN['userId'].isin(userIds))&(dfNN['d3LevelId'] != k)]
    
    d3LevelCounts[k] = dict(Counter(dfNNItem['d3LevelId']))
    
for k in d3LevelCounts:
    s = sum(d3LevelCounts[k])
    d3LevelCounts[k] = {kk : v / s for kk, v in d3LevelCounts[k].items()}
    
for k in d3LevelCounts:
    
    vec = np.zeros(max(d3LevelCounts) + 1)
    for kk, v in d3LevelCounts[k].items():
        vec[kk] = v
        
    d3LevelCounts[k] = vec

100%|██████████| 453/453 [00:45<00:00,  9.88it/s]


In [5]:
probTotal = np.zeros(max(d3LevelCounts) + 1)

for k, v in dict(df.groupby('d3LevelId').agg({'date': 'count'})['date']).items():
    probTotal[k] = v
    
probTotal /= np.sum(probTotal)

In [74]:
def getAttends(userId: str):
    
    recs = df[df['userId'] == userId].to_dict(orient = 'records')
    
    return [
        {'d3LevelId': r['d3LevelId'], 'date': r['date']} for r in recs
    ]

In [171]:
# userId = 101387513
# n = 5
# randomChoice = True
# dateAttenuation = False

# DATE_ATTENUATION_PARAM = 0.9

# attends = getAttends(userId)

# for a in attends:
#     a['date'] = parseDate(a['date'])

# attends = sorted(attends, key = lambda x: x['date'])[::-1]

# probVec = np.array([d3LevelCounts[a['d3LevelId']] for a in attends])

# if dateAttenuation:
#     dateMult = []

#     m = 1
#     for i in range(0, len(attends)):
#         dateMult.append(m)
#         m *= DATE_ATTENUATION_PARAM
# else:
#     dateMult = np.ones(len(attends))

# probVec = probVec.transpose().dot(dateMult)
# probVec /= np.sum(probVec)

# probDiff = (probVec - probTotal)
# probDiff[np.where(probDiff > 0)] = 0
# probDiff = -probDiff
# probDiff /= probDiff.sum()

# def getSetProb(arr):
    
#     p = 1.0

#     for i in range(len(arr)):
#         for j in range(i + 1, len(arr)):
#             p *= d3LevelCounts[arr[i]][arr[j]]
            
#     return p

# if randomChoice:
#     reses = [
#         np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
#         for i in range(10)
#     ]
    
#     reses = [(r, getSetProb(r)) for r in reses]
        
#     res = sorted(reses, key = lambda x: x[1])[0][0]
# else:
#     res = np.argsort(probDiff)[::-1][:n]

In [233]:
def recomendForUser(
    userId: int, 
    best: int = 5, 
    rare: int = 5,
    randomChoice = False,
    dateAttenuation = True
):
    
    return recomendForUserBest(
        userId, best
    ) + recomendForUserRare(
        userId, rare,
#         randomChoice = randomChoice, dateAttenuation = dateAttenuation
    )

def recommendFromVector(vec, n: int = 5, randomChoice = False):
    
    if randomChoice:
        res = np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
    else:
        res = np.argsort(probVec)[::-1][:n]
        
    return res

def recommendVectorBest(attends: list, x dateAttenuation = False):
    
    for a in attends:
        a['date'] = parseDate(a['date'])

    attends = sorted(attends, key = lambda x: x['date'])[::-1]

    probVec = np.array([d3LevelCounts[a['d3LevelId']] for a in attends])

    if dateAttenuation:
        dateMult = []

        m = 1
        for i in range(0, len(attends)):
            dateMult.append(m)
            m *= DATE_ATTENUATION_PARAM
    else:
        dateMult = np.ones(len(attends))

    probVec = probVec.transpose().dot(dateMult)
    probVec /= np.sum(probVec)

    return probVec

def recommendForUserRare(userId: str, n: int = 5, randomChoice = False, dateAttenuation = False):
    
    attends = getAttends(userId)

    for a in attends:
        a['date'] = parseDate(a['date'])

    attends = sorted(attends, key = lambda x: x['date'])[::-1]

    probVec = np.array([d3LevelCounts[a['d3LevelId']] for a in attends])

    if dateAttenuation:
        dateMult = []

        m = 1
        for i in range(0, len(attends)):
            dateMult.append(m)
            m *= DATE_ATTENUATION_PARAM
    else:
        dateMult = np.ones(len(attends))

    probVec = probVec.transpose().dot(dateMult)
    probVec /= np.sum(probVec)

    probDiff = (probVec - probTotal)
    probDiff[np.where(probDiff > 0)] = 0
    probDiff = -probDiff
    probDiff /= probDiff.sum()

    def getSetProb(arr):

        p = 1.0

        for i in range(len(arr)):
            for j in range(i + 1, len(arr)):
                p *= d3LevelCounts[arr[i]][arr[j]]

        return p

    if randomChoice:
        reses = [
            np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
            for i in range(10)
        ]

        reses = [(r, getSetProb(r)) for r in reses]

        res = sorted(reses, key = lambda x: x[1])[0][0]
    else:
        res = np.argsort(probDiff)[::-1][:n]
    
    return list(res)

def recommendForUserNew():
    return None

In [234]:
recomendForUser(101387513)

[6, 124, 16, 73, 5, 9, 77, 3, 79, 46]