In [1]:
import pandas as pd
from tqdm import tqdm
from collections import Counter
import numpy as np
import pickle as pkl
import random
from dateutil.parser import parse as parseDate
from datetime import datetime
import matplotlib.pyplot as plt

DATE_ATTENUATION_PARAM = 0.9

In [2]:
import pymongo

db = pymongo.MongoClient()['ltcLongevity']

In [3]:
df = pd.read_csv('./data/attendShort.csv')

d2LevelNames = pkl.load(open('./data/d2LevelNames.pkl', 'rb'))
d3LevelNames = pkl.load(open('./data/d3LevelNames.pkl', 'rb'))

d2LevelNamesReverse = {v: k for k, v in d2LevelNames.items()}
d3LevelNamesReverse = {v: k for k, v in d3LevelNames.items()}

In [4]:
notNullUsers = df.groupby('userId').agg({'date': 'count'}).reset_index()
notNullUsers = set(notNullUsers[notNullUsers['date'] > 10]['userId'])
dfNN = df[df['userId'].isin(notNullUsers)]

In [5]:
d3LevelCounts = {}

for k in tqdm(dfNN['d3LevelId'].unique()):
    userIds = dfNN[dfNN['d3LevelId'] == k]['userId'].unique()
    
    dfNNItem = dfNN[(dfNN['userId'].isin(userIds))&(dfNN['d3LevelId'] != k)]
    
    d3LevelCounts[k] = dict(Counter(dfNNItem['d3LevelId']))
    
for k in d3LevelCounts:
    s = sum(d3LevelCounts[k])
    d3LevelCounts[k] = {kk : v / s for kk, v in d3LevelCounts[k].items()}
    
for k in d3LevelCounts:
    
    vec = np.zeros(max(d3LevelCounts) + 1)
    for kk, v in d3LevelCounts[k].items():
        vec[kk] = v
        
    d3LevelCounts[k] = vec

100%|██████████| 453/453 [00:44<00:00, 10.18it/s]


In [6]:
dfNN = dfNN.sort_values('date')

In [7]:
d3LevelCounts = np.zeros((max(dfNN['d3LevelId'].unique()) + 1, max(dfNN['d3LevelId'].unique()) + 1))

for userId in tqdm(dfNN['userId'].unique()):
    
    acts = list(dfNN[dfNN['userId'] == userId]['d3LevelId'])
    
    for a1 in range(len(acts)):
        for a2 in range(a1 + 1, len(acts)):
            d3LevelCounts[acts[a1], acts[a2]] += 1

100%|██████████| 48382/48382 [07:29<00:00, 107.52it/s]


In [8]:
d3LevelCounts = {k: d3LevelCounts[k, :] for k in range(d3LevelCounts.shape[0])}

In [9]:
d3LevelCounts = {k : v / v.sum() for k, v in d3LevelCounts.items()}

In [10]:
probTotal = np.zeros(max(d3LevelCounts) + 1)

for k, v in dict(df.groupby('d3LevelId').agg({'date': 'count'})['date']).items():
    probTotal[k] = v
    
probTotal /= np.sum(probTotal)

In [11]:
def getAttends(userId: str):
    
    recs = df[df['userId'] == userId].to_dict(orient = 'records')
    
    return [
        {'d3LevelId': r['d3LevelId'], 'date': r['date']} for r in recs
    ]

In [12]:
# userId = 101387513
# n = 5
# randomChoice = True
# dateAttenuation = False


# attends = getAttends(userId)

# for a in attends:
#     a['date'] = parseDate(a['date'])

# attends = sorted(attends, key = lambda x: x['date'])[::-1]

# probVec = np.array([d3LevelCounts[a['d3LevelId']] for a in attends])

# if dateAttenuation:
#     dateMult = []

#     m = 1
#     for i in range(0, len(attends)):
#         dateMult.append(m)
#         m *= DATE_ATTENUATION_PARAM
# else:
#     dateMult = np.ones(len(attends))

# probVec = probVec.transpose().dot(dateMult)
# probVec /= np.sum(probVec)

# probDiff = (probVec - probTotal)
# probDiff[np.where(probDiff > 0)] = 0
# probDiff = -probDiff
# probDiff /= probDiff.sum()

# def getSetProb(arr):
    
#     p = 1.0

#     for i in range(len(arr)):
#         for j in range(i + 1, len(arr)):
#             p *= d3LevelCounts[arr[i]][arr[j]]
            
#     return p

# if randomChoice:
#     reses = [
#         np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
#         for i in range(10)
#     ]
    
#     reses = [(r, getSetProb(r)) for r in reses]
        
#     res = sorted(reses, key = lambda x: x[1])[0][0]
# else:
#     res = np.argsort(probDiff)[::-1][:n]

In [13]:
dfm = pd.DataFrame(db.usersV2.find({}, {'userId': 1, 'gender': 1, 'dateBirth': 1, '_id': False}))
dfm['age'] = ((datetime(year = 2023, month = 1, day = 1) - dfm['dateBirth'])).apply(lambda x: x.days / 365.25)

In [15]:
def ageClassFunction(user = None, userId = None):
    if user is None:
        user = db.usersV2.find_one({'userId': userId})
    
    age = (datetime.now() - user['dateBirth']).days / 365.25
    if age < 60:
        return '<60'
    if age > 90:
        return '>90'
    return '%d-%d'%(age // 10  * 10, (age // 10 + 1)  * 10)
        

class ModelMeta:
    
    def __init__(self, db = None):
        
        self.db = db
        
        return None
        
    def fit(self, dfm, classFunction):
        
        self.classFunction = classFunction
        
        self.classes = set()

        self.userClasses = {}
        users = list(self.db.usersV2.find({}))
        
        for user in tqdm(users):
            cls = self.classFunction(user = user)
            self.classes.add(cls)
            self.userClasses[user['userId']] = cls
            
        dfClasses = pd.DataFrame([{'userId': k, 'cls': v} for k, v in self.userClasses.items()])
        dfClasses = df.merge(dfClasses, on = 'userId', how = 'inner')
        dictClasses = dfClasses.groupby('cls').agg({'d3LevelId': list}).to_dict(orient = 'index')
        
        for k in dictClasses:
            vec = np.zeros(max(d3LevelCounts) + 1)

            for i in dictClasses[k]['d3LevelId']:
                vec[i] += 1

            dictClasses[k] = vec / np.sum(vec)
            
        self.dictClasses = dictClasses
        
        return None
    
    def __call__(self, userId: str):
        cls = self.classFunction(userId = userId)
        return self.dictClasses[cls]
    
mm = ModelMeta(db)
mm.fit(dfm, ageClassFunction)

100%|██████████| 52338/52338 [00:00<00:00, 419953.56it/s]


In [16]:
def multiplyProbVecs(v1, v2):
    vm = v2 / (1 - v2)
    v1 *= vm
    v1 = v1 / v1.sum()
    return v1

In [17]:
def recomendForUser(
    userId: int, 
    best: int = 5, 
    rare: int = 5,
    randomChoice = False,
    dateAttenuation = True
):
    
    attends = getAttends(userId)
    for a in attends:
        a['date'] = parseDate(a['date'])
        
    metaVec = mm(userId)
    
    probVecBest = recommendVectorBest(attends, dateAttenuation = dateAttenuation)
    probVecRare = recommendVectorRare(attends, dateAttenuation = dateAttenuation)
    
#     print(metaVec)
#     print(probVecBest)
#     print(probVecRare)

    for a in attends:
        probVecBest[a['d3LevelId']] = 0
        probVecRare[a['d3LevelId']] = 0

    recBest = recommendFromVector(multiplyProbVecs(probVecBest, metaVec))
    recRare = recommendFromVector(multiplyProbVecs(probVecRare, metaVec))
    
    return recBest + recRare

def recommendFromVector(probVec, n: int = 5, randomChoice = False):
    
    if randomChoice:
        res = np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
    else:
        res = np.argsort(probVec)[::-1][:n]
        
#         def getSetProb(arr):

#         p = 1.0

#         for i in range(len(arr)):
#             for j in range(i + 1, len(arr)):
#                 p *= d3LevelCounts[arr[i]][arr[j]]

#         return p

#     if randomChoice:
#         reses = [
#             np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
#             for i in range(10)
#         ]

#         reses = [(r, getSetProb(r)) for r in reses]

#         res = sorted(reses, key = lambda x: x[1])[0][0]
#     else:
#         res = np.argsort(probDiff)[::-1][:n]
        
    return list(res)

def recommendVectorBest(attends: list, dateAttenuation = False):

    attends = sorted(attends, key = lambda x: x['date'])[::-1]

    probVec = np.array([d3LevelCounts[a['d3LevelId']] for a in attends])

    if dateAttenuation:
        dateMult = []

        m = 1
        for i in range(0, len(attends)):
            dateMult.append(m)
            m *= DATE_ATTENUATION_PARAM
    else:
        dateMult = np.ones(len(attends))

    probVec = probVec.transpose().dot(dateMult)
    probVec /= np.sum(probVec)

    return probVec

def recommendVectorRare(attends: list, dateAttenuation = False):
    
    attends = sorted(attends, key = lambda x: x['date'])[::-1]

    probVec = np.array([d3LevelCounts[a['d3LevelId']] for a in attends])

    if dateAttenuation:
        dateMult = []

        m = 1
        for i in range(0, len(attends)):
            dateMult.append(m)
            m *= DATE_ATTENUATION_PARAM
    else:
        dateMult = np.ones(len(attends))

    probVec = probVec.transpose().dot(dateMult)
    probVec /= np.sum(probVec)

    probDiff = (probVec - probTotal)
    probDiff[np.where(probDiff > 0)] = 0
    probDiff = -probDiff
    probDiff /= probDiff.sum()
    
    return probDiff

def recommendForUserNew():
    return None

In [18]:
userId = 101387513

print('----Посещенные курсы----')
for i in set([a['d3LevelId'] for a in getAttends(userId)]):
    print(i, d3LevelNamesReverse[i])

print('----Рекомендованные курсы----')
for i in recomendForUser(101387513):
    print(i, d3LevelNamesReverse[i])

----Посещенные курсы----
77 ОНЛАЙН Английский язык
----Рекомендованные курсы----
6 ОНЛАЙН Мастер-класс по уходу за кожей в зрелом возрасте
16 ОНЛАЙН Гимнастика
32 ОНЛАЙН Суставная гимнастика
5 ОНЛАЙН Краеведение и онлайн-экскурсии
73 ОНЛАЙН История искусства
9 Скандинавская ходьба
6 ОНЛАЙН Мастер-класс по уходу за кожей в зрелом возрасте
3 ОФП
16 ОНЛАЙН Гимнастика
79 Настольный теннис
