In [32]:
import pandas as pd
from tqdm import tqdm
from collections import Counter
import numpy as np
import pickle as pkl
import random
from dateutil.parser import parse as parseDate
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import logging
import os

DATE_ATTENUATION_PARAM = 0.9

In [3]:
import pymongo

db = pymongo.MongoClient()['ltcLongevity']

In [4]:
df = pd.DataFrame(db.attends.find({}, {'userId': 1, 'd1LevelId': 1, 'd2LevelId': 1, 'd3LevelId': 1, 'date': 1}))

In [7]:
df = df[['userId', 'd1LevelId', 'd2LevelId', 'd3LevelId', 'date']]

In [10]:
df = df[df['d1LevelId'].notna()]
df = df[df['d2LevelId'].notna()]
df = df[df['d3LevelId'].notna()]

In [11]:
df['userId'] = df['userId'].astype('int')
df['d1LevelId'] = df['d1LevelId'].astype('int')
df['d2LevelId'] = df['d2LevelId'].astype('int')
df['d3LevelId'] = df['d3LevelId'].astype('int')

In [22]:
d2Id2Idx = {k: i for i, k in enumerate(df['d2LevelId'].unique())}
d3Id2Idx = {k: i for i, k in enumerate(df['d3LevelId'].unique())}

In [18]:
notNullUsers = df.groupby('userId').agg({'date': 'count'}).reset_index()
notNullUsers = set(notNullUsers[notNullUsers['date'] > 10]['userId'])
dfNN = df[df['userId'].isin(notNullUsers)]

In [5]:
# d3LevelCounts = {}

# for k in tqdm(dfNN['d3LevelId'].unique()):
#     userIds = dfNN[dfNN['d3LevelId'] == k]['userId'].unique()
    
#     dfNNItem = dfNN[(dfNN['userId'].isin(userIds))&(dfNN['d3LevelId'] != k)]
    
#     d3LevelCounts[k] = dict(Counter(dfNNItem['d3LevelId']))
    
# for k in d3LevelCounts:
#     s = sum(d3LevelCounts[k])
#     d3LevelCounts[k] = {kk : v / s for kk, v in d3LevelCounts[k].items()}
    
# for k in d3LevelCounts:
    
#     vec = np.zeros(max(d3LevelCounts) + 1)
#     for kk, v in d3LevelCounts[k].items():
#         vec[kk] = v
        
#     d3LevelCounts[k] = vec

100%|██████████| 453/453 [00:44<00:00, 10.18it/s]


In [19]:
dfNN = dfNN.sort_values('date')

In [23]:
d3LevelCounts = np.zeros((len(d3Id2Idx), len(d3Id2Idx)))

for userId in tqdm(dfNN['userId'].unique()):
    
    acts = list(dfNN[dfNN['userId'] == userId]['d3LevelId'])
    
    for a1 in range(len(acts)):
        for a2 in range(a1 + 1, len(acts)):
            d3LevelCounts[d3Id2Idx[acts[a1]], d3Id2Idx[acts[a2]]] += 1

100%|██████████| 48379/48379 [09:27<00:00, 85.27it/s] 


In [24]:
d3LevelCounts = {k: d3LevelCounts[k, :] for k in range(d3LevelCounts.shape[0])}

In [25]:
d3LevelCounts = {k : v / v.sum() for k, v in d3LevelCounts.items()}

In [26]:
probTotal = np.zeros(len(d3Id2Idx))

for k, v in dict(df.groupby('d3LevelId').agg({'date': 'count'})['date']).items():
    probTotal[d3Id2Idx[k]] = v
    
probTotal /= np.sum(probTotal)

In [31]:
pkl.dump(d3Id2Idx, open('./model/d3Id2Idx.pkl', 'wb'))
pkl.dump(d3LevelCounts, open('./model/d3LevelCounts.pkl', 'wb'))
pkl.dump(probTotal, open('./model/probTotal.pkl', 'wb'))

In [12]:
# userId = 101387513
# n = 5
# randomChoice = True
# dateAttenuation = False


# attends = getAttends(userId)

# for a in attends:
#     a['date'] = parseDate(a['date'])

# attends = sorted(attends, key = lambda x: x['date'])[::-1]

# probVec = np.array([d3LevelCounts[a['d3LevelId']] for a in attends])

# if dateAttenuation:
#     dateMult = []

#     m = 1
#     for i in range(0, len(attends)):
#         dateMult.append(m)
#         m *= DATE_ATTENUATION_PARAM
# else:
#     dateMult = np.ones(len(attends))

# probVec = probVec.transpose().dot(dateMult)
# probVec /= np.sum(probVec)

# probDiff = (probVec - probTotal)
# probDiff[np.where(probDiff > 0)] = 0
# probDiff = -probDiff
# probDiff /= probDiff.sum()

# def getSetProb(arr):
    
#     p = 1.0

#     for i in range(len(arr)):
#         for j in range(i + 1, len(arr)):
#             p *= d3LevelCounts[arr[i]][arr[j]]
            
#     return p

# if randomChoice:
#     reses = [
#         np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
#         for i in range(10)
#     ]
    
#     reses = [(r, getSetProb(r)) for r in reses]
        
#     res = sorted(reses, key = lambda x: x[1])[0][0]
# else:
#     res = np.argsort(probDiff)[::-1][:n]

In [29]:
dfm = pd.DataFrame(db.usersV2.find({}, {'userId': 1, 'gender': 1, 'dateBirth': 1, '_id': False}))
dfm['age'] = ((datetime(year = 2023, month = 1, day = 1) - dfm['dateBirth'])).apply(lambda x: x.days / 365.25)

In [95]:
def ageClassFunction(user = None, userId = None):
    if user is None:
        user = db.usersV2.find_one({'userId': userId})
    
    age = (datetime.now() - user['dateBirth']).days / 365.25
    if age < 60:
        return '<60'
    if age > 90:
        return '>90'
    return '%d-%d'%(age // 10  * 10, (age // 10 + 1)  * 10)
        

class ModelMeta:
    
    def __init__(self, db = None, d3Id2Idx = None, modelPath = None):
        
        if (db is None):
            raise Exception('Model need DB connection to work')
            
        self.db = db
        self.d3Id2Idx = d3Id2Idx
        
        if modelPath is not None:
            self.d3Id2Idx = pkl.load(open(f'{modelPath}/d3Id2Idx.pkl', 'rb'))
            self.classFunction = pkl.load(open(f'{modelPath}/classFunction.pkl', 'rb'))
            self.dictClasses = pkl.load(open(f'{modelPath}/dictClasses.pkl', 'rb'))
        
        return None
        
    def fit(self, dfm, classFunction):
        
        self.classFunction = classFunction
        
        self.classes = set()

        self.userClasses = {}
        users = list(self.db.usersV2.find({}))
        
        for user in tqdm(users):
            cls = self.classFunction(user = user)
            self.classes.add(cls)
            self.userClasses[user['userId']] = cls
            
        dfClasses = pd.DataFrame([{'userId': k, 'cls': v} for k, v in self.userClasses.items()])
        dfClasses = df.merge(dfClasses, on = 'userId', how = 'inner')
        dictClasses = dfClasses.groupby('cls').agg({'d3LevelId': list}).to_dict(orient = 'index')
        
        for k in dictClasses:
            vec = np.zeros(len(self.d3Id2Idx))

            for i in dictClasses[k]['d3LevelId']:
                vec[self.d3Id2Idx[i]] += 1

            dictClasses[k] = vec / np.sum(vec)
            
        self.dictClasses = dictClasses
        
        return None
    
    def save(self, modelPath):
        try:
            os.mkdir(modelPath)
        except:
            logging.exception('Unable to create folder for model')
            
        pkl.dump(self.classFunction, open(f'{modelPath}/classFunction.pkl', 'wb'))
        pkl.dump(self.dictClasses, open(f'{modelPath}/dictClasses.pkl', 'wb'))
        pkl.dump(self.d3Id2Idx, open(f'{modelPath}/d3Id2Idx.pkl', 'wb'))
    
    def __call__(self, userId: str):
        cls = self.classFunction(userId = userId)
        return self.dictClasses[cls]
    
mm = ModelMeta(db, d3Id2Idx)
mm.fit(dfm, ageClassFunction)

100%|██████████| 52338/52338 [00:00<00:00, 392165.82it/s]


In [96]:
mm.save('./model/modelAge')

ERROR:root:Unable to create folder for model
Traceback (most recent call last):
  File "/tmp/ipykernel_667078/1344200430.py", line 62, in save
    os.mkdir(modelPath)
FileExistsError: [Errno 17] File exists: './model/modelAge'


In [39]:
def multiplyProbVecs(v1, v2):
    vm = v2 / (1 - v2)
    v1 *= vm
    v1 = v1 / v1.sum()
    return v1

In [93]:
def getAttends(userId: str):
    
    recs = df[df['userId'] == userId].to_dict(orient = 'records')
    
    return [
        {'d3LevelId': r['d3LevelId'], 'date': r['date']} for r in recs
    ]

class ModelMain():
    
    def __init__(self, d3LevelCounts = None, probTotal = None, d3Id2Idx = None, modelPath = None):
        self.d3LevelCounts = d3LevelCounts
        self.d3Id2Idx = d3Id2Idx
        self.probTotal = probTotal
        
        if modelPath is not None:
            self.d3LevelCounts = pkl.load(open(f'{modelPath}/d3LevelCounts.pkl', 'rb'))
            self.probTotal = pkl.load(open(f'{modelPath}/probTotal.pkl', 'rb'))
            self.d3Id2Idx = pkl.load(open(f'{modelPath}/d3Id2Idx.pkl', 'rb'))
        
    def recomendForUser(
        self,
        userId: int, 
        best: int = 5, 
        rare: int = 5,
        randomChoice = False,
        dateAttenuation = True
    ):

        attends = getAttends(userId)
#         for a in attends:
#             a['date'] = parseDate(a['date'])

        metaVec = mm(userId)

        probVecBest = self.recommendVectorBest(attends, dateAttenuation = dateAttenuation)
        probVecRare = self.recommendVectorRare(attends, dateAttenuation = dateAttenuation)

        for a in attends:
            probVecBest[self.d3Id2Idx[a['d3LevelId']]] = 0
            probVecRare[self.d3Id2Idx[a['d3LevelId']]] = 0


        return probVecBest, probVecRare

    def recommendFromVector(probVec, n: int = 5, randomChoice = False):

        if randomChoice:
            res = np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
        else:
            res = np.argsort(probVec)[::-1][:n]

    #         def getSetProb(arr):

    #         p = 1.0

    #         for i in range(len(arr)):
    #             for j in range(i + 1, len(arr)):
    #                 p *= d3LevelCounts[arr[i]][arr[j]]

    #         return p

    #     if randomChoice:
    #         reses = [
    #             np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
    #             for i in range(10)
    #         ]

    #         reses = [(r, getSetProb(r)) for r in reses]

    #         res = sorted(reses, key = lambda x: x[1])[0][0]
    #     else:
    #         res = np.argsort(probDiff)[::-1][:n]

        return list(res)

    def recommendVectorBest(self, attends: list, dateAttenuation = False):

        attends = sorted(attends, key = lambda x: x['date'])[::-1]

        probVec = np.array([self.d3LevelCounts[self.d3Id2Idx[a['d3LevelId']]] for a in attends])

        if dateAttenuation:
            dateMult = []

            m = 1
            for i in range(0, len(attends)):
                dateMult.append(m)
                m *= DATE_ATTENUATION_PARAM
        else:
            dateMult = np.ones(len(attends))

        probVec = probVec.transpose().dot(dateMult)
        probVec /= np.sum(probVec)

        return probVec

    def recommendVectorRare(self, attends: list, dateAttenuation = False):

        attends = sorted(attends, key = lambda x: x['date'])[::-1]

        probVec = np.array([self.d3LevelCounts[self.d3Id2Idx[a['d3LevelId']]] for a in attends])

        if dateAttenuation:
            dateMult = []

            m = 1
            for i in range(0, len(attends)):
                dateMult.append(m)
                m *= DATE_ATTENUATION_PARAM
        else:
            dateMult = np.ones(len(attends))

        probVec = probVec.transpose().dot(dateMult)
        probVec /= np.sum(probVec)

        probDiff = (probVec - probTotal)
        probDiff[np.where(probDiff > 0)] = 0
        probDiff = -probDiff
        probDiff /= probDiff.sum()

        return probDiff

    def recommendForUserNew(self):
        return None
    
    def save(self, modelPath):
        try:
            os.mkdir(modelPath)
        except:
            logging.exception('Unable to create folder for model')
            
        pkl.dump(self.d3LevelCounts, open(f'{modelPath}/d3LevelCounts.pkl', 'wb'))
        pkl.dump(self.probTotal, open(f'{modelPath}/probTotal.pkl', 'wb'))
        pkl.dump(self.d3Id2Idx, open(f'{modelPath}/d3Id2Idx.pkl', 'wb'))
    
modelMain = ModelMain(d3LevelCounts, probTotal, d3Id2Idx)

In [75]:
def recommendFromVector(probVec, n: int = 5, randomChoice = False):

    if randomChoice:
        res = np.random.choice(list(range(probVec.shape[0])), n, p = probVec, replace = False)
    else:
        res = np.argsort(probVec)[::-1][:n]
        
    return list(res)

In [76]:
probVecBest, probVecRare = modelMain.recomendForUser(101387513)
metaVec = mm(101387513)

In [77]:
recBest = recommendFromVector(multiplyProbVecs(probVecBest, metaVec))
recRare = recommendFromVector(multiplyProbVecs(probVecRare, metaVec))

In [84]:
d3Id2IdxReverse = {v : k for k, v in d3Id2Idx.items()}

recBest = [d3Id2IdxReverse[r] for r in recBest]
recRare = [d3Id2IdxReverse[r] for r in recRare]

{'_id': ObjectId('6470b3890ddba6d6eda3e5ae'),
 'd0LevelName': 'Для ума',
 'd1LevelId': 649,
 'd1LevelName': 'Игры',
 'd2LevelId': 651,
 'd2LevelName': 'Интеллектуальные игры',
 'd3LevelId': 1042,
 'd3LevelName': 'ОНЛАЙН Английский язык',
 'online': True,
 'certificate': False,
 'title': 'Занятия по изучению правил игр, основанных на применении интеллекта и эрудиции, а также участие в них.'}

In [91]:
userId = 101387513

print('----Посещенные курсы----')
for i in set([a['d3LevelId'] for a in getAttends(userId)]):
    print(i, db.activities.find_one({'d3LevelId': i})['d3LevelName'])

print('----Рекомендованные курсы----')
for i in recBest:
    print(i, db.activities.find_one({'d3LevelId': int(i)})['d3LevelName'])
    
for i in recRare:
    print(i, db.activities.find_one({'d3LevelId': int(i)})['d3LevelName'])

----Посещенные курсы----
1154 ОНЛАЙН Английский язык
----Рекомендованные курсы----
1113 ОНЛАЙН Мастер-класс по уходу за кожей в зрелом возрасте
1083 ОНЛАЙН Гимнастика
1090 ОНЛАЙН Суставная гимнастика
1421 ОНЛАЙН Краеведение и онлайн-экскурсии
1162 ОНЛАЙН История искусства
139 Скандинавская ходьба
1113 ОНЛАЙН Мастер-класс по уходу за кожей в зрелом возрасте
104 ОФП
1083 ОНЛАЙН Гимнастика
151 Настольный теннис


In [98]:
db.attends.create_index([('userId', 1)])

'userId_1'