In [1]:
import json
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
users = pd.read_csv('users.csv')
follows = pd.read_csv('follows.csv')
# activate = pd.read_csv('activate.csv')
follows.drop_duplicates(inplace=True)
follows = follows.sample(500000)

解决用户等启动问题之一: 基于排行榜的推荐, 诸如:

1. 用户是否是大v；
2. 用户被关注量排行, 或被关注量与关注量之比排行；
3. 用户是否被举报, 被举报的次数；
4. 用户是否在黑名单里, 被拉黑名单的次数；
5. 用户每天的登录次数, 及频率；
6. 用户每天的发帖, 点赞, 转发, 评论的次数和频率；
7. 用户帖子, 评论被点赞, 转发和评论的次数和频率, 及大v用户的数量和占比；

对于冷启动的问题, 并不局限于以上思路....

用户的浏览行为, 比如:

1. 用户看了, 点赞了, 转发, 评论某个用户；
2. 关注过该用户的还关注了；

这里以仅有的数据, 用户关注量和被关注量数据为例

In [3]:
def get_flg(val):
    try:
        return json.loads(val)['flg']
    except:
        pass

In [4]:
def get_flw(val):
    try:
        return json.loads(val)['flw']
    except:
        pass

先过滤掉很少关注别人, 或很少被关注的用户, 过滤掉不活跃的用户:
    
这里涉及到如何定义不活跃的用户, 或者活跃的用户:
1. 多少天内连续登录, 每天登录频率, 每天发帖, 点赞, 转发, 评论的次数和频率, 连续几天内的发帖, 点赞, 转发和频率的总次数和频率等
2. 一周内至少登录4次, 发5个帖子?

In [5]:
# users = users[users['key'].str.contains('|'.join(activate['uid']))]

In [6]:
# users['flg'] = users['val'].apply(get_flg)
# users['flw'] = users['val'].apply(get_flw)
users['flg'].fillna(0, inplace=True)
users['flw'].fillna(0, inplace=True)
users['flg'] = users['flg'].astype(int)
users['flw'] = users['flw'].astype(int)
users['flw'] = abs(users['flw'])
users.loc[users['flg'] != 0, 'score'] = users.loc[users['flg'] != 0, 'flw'] / users.loc[users['flg'] != 0, 'flg']
users['score'].fillna(0, inplace=True)

In [7]:
users.sort_values('flw', ascending=False)

Unnamed: 0,uname,nickname,ousername,lang,status,cm,flw,flg,infl,ico,shbpst,lkbpst,score
1237563,mtg4america,Marjorie Taylor Greene,MTG4America,en,a,,2108058798,319962,1.0,group47/getter/2021/07/03/15/92afd860-4202-bac...,337646.0,,6.588466e+03
2475649,realstewpeters,Stew Peters,RealStewPeters,en,a,,2096365366,10628,1.0,group18/getter/2021/11/07/02/46ef6db8-74c4-515...,167956.0,,1.972493e+05
2475937,biakicis,Bia Kicis,Biakicis,en,a,,2077271705,4,1.0,group14/tw/2021/09/06/20/de8221f9-a843-7f0c-b2...,23275.0,,5.193179e+08
1237456,rudygiuliani,Rudy W. Giuliani,RudyGiuliani,en,a,,2064195887,78628,1.0,group1/tw/2021/07/07/12/05116288-96b7-7af3-9f0...,48404.0,,2.625268e+04
1237600,zerohedge,ZeroHedge,zerohedge,en,a,,1963155208,36838,1.0,group44/tw/2021/08/18/19/b2f6a473-9525-fd79-95...,35787.0,,5.329158e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
985709,mariakyturner,mariakyturner,mariakyTurner,en_us,,,0,0,,group49/origin/2021/09/12/17/10456b30-12bb-a94...,,,0.000000e+00
985708,jennifer9iadams,jennifer9iadams,jennifer9iadams,en_us,,,0,0,,group22/origin/2021/09/12/17/e93a6209-f2bd-181...,,,0.000000e+00
985707,d6mariaturner,d6Mariaturner,d6Mariaturner,en_us,,,0,0,,group38/origin/2021/09/12/17/c81145b8-3e11-1e8...,,,0.000000e+00
985706,donna17jones,donna17jones,donna17jones,en_us,,,0,0,,group2/origin/2021/09/13/16/475d71cb-6416-19aa...,,,0.000000e+00


In [8]:
users.sort_values('score', ascending=False, inplace=True)

In [9]:
users['rank'] = range(1, len(users) + 1)

In [10]:
users

Unnamed: 0,uname,nickname,ousername,lang,status,cm,flw,flg,infl,ico,shbpst,lkbpst,score,rank
1237588,julie_kelly,julie_kelly,julie_kelly,en,a,,756000330,1,1.0,group35/getter/2021/08/18/21/4c90e89b-11a3-059...,1137.0,,7.560003e+08,1
2475887,kristinoem,Kristi Noem,KristiNoem,en,a,,1811729701,3,1.0,group38/tw/2021/08/23/01/282d5e81-5a8f-af97-7e...,3316.0,,6.039099e+08,2
2475937,biakicis,Bia Kicis,Biakicis,en,a,,2077271705,4,1.0,group14/tw/2021/09/06/20/de8221f9-a843-7f0c-b2...,23275.0,,5.193179e+08,3
1237543,azaudit,Arizona Audit,AZAudit,en,qs,,324731303,1,1.0,group16/origin/2021/07/29/14/aaf2fa0f-015a-6a8...,1241.0,,3.247313e+08,4
1237429,cristinalaila,CristinaLaila,CristinaLaila,en,qs,,241212798,1,1.0,group37/origin/2021/08/30/17/ae2f4c7b-a906-28a...,1079.0,,2.412128e+08,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933727,th8003th,th8003th,th8003th,en,a,,0,0,,,,,0.000000e+00,2476049
933724,freezed,freezed,freezed,en,qs,,0,0,,,,,0.000000e+00,2476050
933723,kporterg,kporterg,kporterg,en,qs,,0,0,,,,,0.000000e+00,2476051
933722,malchut,Malchut,Malchut,en,qs,,0,0,,,,,0.000000e+00,2476052


解决完冷启动问题, 以下准备从两个方向来做推荐:
1. 基于item-based的用户推荐；
2. 基于SVD矩阵分解的用户推荐；

1. 基于item-based的用户推荐

将用户看成要被推荐的物品, 建立各个用户之间的关注与被关注的关系, 形成一个矩阵图

先过滤掉稀疏, 很少关注别人, 或很少被关注的用户, 过滤掉不活跃的用户;

In [11]:
# follows = follows[follows['uid'].str.contains('|'.join(activate['uid'])) | follows['fid'].str.contains('|'.join(activate['uid']))]

1. 对于新注册的用户, 也就是冷启动问题, 可以通过排行榜推荐
2. 一旦用户有了关注列表, 下一次推荐就可以使用item-based的来推荐

In [12]:
train_data, test_data = train_test_split(follows, test_size=0.1, random_state=0)

基于item-based具体的逻辑如下:

1. 现有一个需要推荐的用户, 找到该用户已经关注的用户列表；
2. 针对这已经关注的用户列表, 我们要找到与每个被关注的用户相似的用户推荐该用户；
3. 这边衡量相似的用户, 使用的是jaccard集合相似度, 即每个被关注用户的用户列表与每个待推荐用户的用户列表的交集与并集之比, 该值越大, 该用户越与被关注用户相似, 也就越会被推荐；
4. 得到针对每个被关注用户和待推荐用户列表的共现矩阵后, 需要计算每个待推荐用户的被推荐的可能性, 方法是每个被关注用户的jaccard值和的平均值即为待推荐用户的score；
5. 得到每个待推荐用户的score后, 然后按照从高到底的顺序排序, 给到TOPN的推荐列表

In [13]:
import Recommender as Recommenders
is_model = Recommenders.item_similarity_recommender()
is_model.create(follows, 'uid', 'fid')
user_id = list(test_data.uid)[0]
user_items = is_model.get_user_items(user_id)

In [14]:
is_model.recommend(user_id, 10)

Unnamed: 0,uid,fid,score,rank
0,grykon,sharonday5,0.009804,1
1,grykon,david_lane,0.008772,2
2,grykon,stuartrgause,0.006944,3
3,grykon,estellp,0.004505,4
4,grykon,jonnywo,0.004505,5
5,grykon,debmoreno123,0.004505,6
6,grykon,rsanterre44,0.004274,7
7,grykon,geniesmith,0.003876,8
8,grykon,touli,0.003333,9
9,grykon,kimnak,0.003145,10


推荐系统的评估指标有如下几种:

1. 用户满意度
2. 预测准确性
3. 覆盖率
4. 多样性
5. 新颖性
6. 惊喜度
7. 信任度
8. 实时性
9. 健壮性
10. 商业目标

我们要做的就是最大化预测准确性, 使得覆盖率, 多样性, 新颖性大于某个预设的阈值.

而在这里, 对于推荐关注用户的评估就简单地选择一预测准确性和覆盖率为度量标准, 其它指标相对该任务不是最重要的, 有些也不是很适合

而对于预测准确性的度量, 可选择的有评分预测和TOPN推荐的召回和准确率

1. 评分预测: 有RMSE和MAE, 用于验证推荐的准确性的, 但存在很明显的缺陷是我们的推荐不是单单是衡量推荐的, 都是用户follow的精度, 还需要考虑召回等
2. TOPN推荐: 一般使用的是准确率和召回率, 也就是在推荐的N长列表中, 用户实际采纳的准确性和召回的情况

覆盖率其中一种定义:

为每个用户推荐的用户列表的并集 / 待推荐的用户列表

In [15]:
test_dict = dict()
for user_id in list(test_data.uid):
    test_dict.update({user_id:is_model.get_user_items(user_id)})

In [16]:
# test_list = ['jake7qa4', 'user1_qa4']
# for user_id in test_list:
#     test_dict.update({user_id:is_model.get_user_items(user_id)})

待推荐的用户需要先获取到关注的用户列表, 然后根据这个用户列表来进行推荐, 按理来说, 越往后的推荐应该越准确

In [17]:
def PrecisionRecall(test, N):
    hit = 0
    n_recall = 0
    n_precision = 0
    for user, items in test.items():
        rank = is_model.recommend(user, N)
        hit += len(set(rank['fid']) & set(items))
        n_recall += len(set(items))
        n_precision += N
    return hit / (1.0 * n_recall), hit / (1.0 * n_precision)

In [18]:
def Coverage(test, N):
    recomm_list = []
    for user, items in test.items():
        rank = is_model.recommend(user, N)
        recomm_list.append(set(rank['fid']))
    for n, recom_s in enumerate(recomm_list):
        if n == 0:
            temp = recom_s
        else:
            temp = temp.union(recom_s)

    return len(temp) / len(is_model.get_all_items_train_data())

In [None]:
recalls = []
precisions = []
coverages = []
for n in range(10, 31, 10):
    recall, precision = PrecisionRecall(test_dict, n)
    coverage = Coverage(test_dict, n)
    recalls.append(recall)
    precisions.append(precision)
    coverages.append(coverage)
    
final_recall, final_precision, final_coverage = sum(recalls) / len(range(10, 31, 10)), sum(precisions) / len(range(10, 31, 10)), sum(coverages) / len(range(10, 31, 10))

In [None]:
final_recall, final_precision, final_coverage

2. 基于矩阵分解SVD的用户推荐

基于相似度的推荐虽然比较简单, 但是对于用户比较多的时候, 计算时间就比较长, 因为每个推荐用户都需要遍历数据集, 故而现在普遍使用SVD来做基础的推荐

矩阵分解, 顾名思义在这里是对每个关注用户的打分, 这边就以这个用户被关注量 / 用户总关注量作为该分值

矩阵分解的思路是, 找到跟该推荐用户相似的用户所关注的对象, 根据相似度的排序进行用户的推荐

这边推荐的用户默认是存在被他人关注关系的用户, 而对于没有任何人关注他的则不会被推荐

In [20]:
uid_f_counts = pd.DataFrame(follows['uid'].value_counts())
uid_f_counts.reset_index(inplace=True)
uid_f_counts.columns = ['uid', 'uid_f_counts']
uid_f_counts = dict(uid_f_counts.values)

In [21]:
fid_f_counts = pd.DataFrame(follows['fid'].value_counts())
fid_f_counts.reset_index(inplace=True)
fid_f_counts.columns = ['fid', 'fid_f_counts']
fid_f_counts = dict(fid_f_counts.values)

In [22]:
for i in follows.index:
    follows.loc[i, 'fid_flw_count'] = fid_f_counts[follows.loc[i]['fid']]
    follows.loc[i, 'uid_flg_count'] = uid_f_counts[follows.loc[i]['uid']]

In [23]:
follows['frac_f_count'] = follows['fid_flw_count'] / follows['uid_flg_count']

同样这里待推荐的用户也需要先有关注列表, 也就是不是新注册用户

In [24]:
from scipy.sparse import coo_matrix

small_set = follows
uid_codes = small_set.uid.drop_duplicates().reset_index()
fid_codes = small_set.fid.drop_duplicates().reset_index()
uid_codes.rename(columns={'index':'uid_index'}, inplace=True)
fid_codes.rename(columns={'index':'fid_index'}, inplace=True)
uid_codes['u_index_value'] = list(uid_codes.index)
fid_codes['f_index_value'] = list(fid_codes.index)
small_set = pd.merge(small_set, uid_codes, how='left')
small_set = pd.merge(small_set, fid_codes, how='left')
mat_candidate = small_set[['u_index_value', 'f_index_value', 'frac_f_count']]
data_array = mat_candidate.frac_f_count.values
row_array = mat_candidate.u_index_value.values
col_array = mat_candidate.f_index_value.values

data_sparse = coo_matrix((data_array, (row_array, col_array)), dtype=float)

In [25]:
data_sparse

<182470x193109 sparse matrix of type '<class 'numpy.float64'>'
	with 500000 stored elements in COOrdinate format>

In [26]:
data_sparse.toarray()

array([[0.5       , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00714286, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [27]:
import math as mt
from scipy.sparse.linalg import *
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix

In [28]:
def compute_svd(urm, K):
    U, s, Vt = svds(urm, K)

    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i,i] = mt.sqrt(s[i])

    U = csc_matrix(U, dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)
    
    return U, S, Vt

def compute_estimated_matrix(urm, U, S, Vt, uTest, N):
    rightTerm = S * Vt
    max_recommendation = N # N <= urm.shape[1]
    estimatedRatings = np.zeros(shape=(urm.shape[0], urm.shape[1]), dtype=np.float16)
    recomendRatings = np.zeros(shape=(urm.shape[0], max_recommendation), dtype=np.float16)
    for userTest in uTest:
        prod = U[userTest, :] * rightTerm
        estimatedRatings[userTest, :] = prod.todense()
        recomendRatings[userTest, :] = (estimatedRatings[userTest, :]).argsort()[:max_recommendation]
    return recomendRatings

In [35]:
small_set

Unnamed: 0,uid,fid,rank,fid_flw_count,uid_flg_count,frac_f_count,uid_index,u_index_value,fid_index,f_index_value
0,susanthompsonw4,bobbielangford,1.0,1.0,2.0,0.500000,6612435,0,6612435,0
1,lauracampbell20,cncdave,1.0,1.0,1.0,1.000000,9582650,1,9582650,1
2,fearlessnorm,iceberg2024,1.0,1.0,140.0,0.007143,5969422,2,5969422,2
3,lenastanley,pitdiesel,1.0,3.0,35.0,0.085714,2830816,3,2830816,3
4,afonsocg,carlazambelli,1.0,691.0,1.0,691.000000,3420045,4,3420045,4
...,...,...,...,...,...,...,...,...,...,...
499995,doyleweaver,blazeerb,1.0,1.0,314.0,0.003185,285538,1751,285336,193108
499996,starblazer,desertrose_az,1.0,7.0,34.0,0.205882,5466897,3800,5239879,5306
499997,lisa7259,newsmax,1.0,3065.0,1.0,3065.000000,6236731,182468,6734670,31
499998,gabrielagcs30,realpfigueiredo,1.0,294.0,1.0,294.000000,4870315,182469,2769555,4122


In [29]:
K=100
urm = data_sparse

U, S, Vt = compute_svd(urm, K)

In [36]:
uTest = [182468]

uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, 4)

In [38]:
for user in uTest:
    print("Recommendation for user with user id {}".format(user))
    rank_value = 1
    for i in uTest_recommended_items[user, :]:
        fid_details = pd.DataFrame(small_set[small_set.u_index_value == i].drop_duplicates('u_index_value')['fid'], columns=['fid'])
        print("The number {} recommended user is {}".format(rank_value, list(fid_details['fid'])[0]))
        rank_value += 1

Recommendation for user with user id 182468
The number 1 recommended user is mtg4america
The number 2 recommended user is robcarsonshow
The number 3 recommended user is aliancabrasil
The number 4 recommended user is lucky100301


In [39]:
train_data, test_data = train_test_split(small_set, test_size=0.1, random_state=0)

In [40]:
test_dict = dict()
for user_id in list(test_data.uid):
    test_dict.update({user_id:is_model.get_user_items(user_id)})

In [41]:
def PrecisionRecall(test, small_set, urm, U, S, Vt, uTest, N):
    hit = 0
    n_recall = 0
    n_precision = 0

    uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, N)
    for user, items in zip(uTest, test.values()):
        recom_list = []
        for i in uTest_recommended_items[user, :]:
            recom_list.append(small_set[small_set.u_index_value == i].drop_duplicates('u_index_value')['fid'].values[0])
        hit += len(set(recom_list) & set(items))
        n_recall += len(set(items))
        n_precision += N
    return hit / (1.0 * n_recall), hit / (1.0 * n_precision)

In [42]:
def Coverage(test, urm, U, S, Vt, uTest, N):
    uTest_recommended_items = compute_estimated_matrix(urm, U, S, Vt, uTest, N)
    recom_lists = []
    for user in uTest:
        recom_list = []
        for i in uTest_recommended_items[user, :]:
            recom_list.append(small_set[small_set.u_index_value == i].drop_duplicates('u_index_value')['fid'].values[0])
        recom_lists.append(set(recom_list))
        
    for n, recom_s in enumerate(recom_lists):
        if n == 0:
            temp = recom_s
        else:
            temp = temp.union(recom_s)

    return len(temp) / len(is_model.get_all_items_train_data())

In [43]:
uTest = test_data['u_index_value'].tolist()

In [None]:
recalls = []
precisions = []
coverages = []
for n in range(10, 31, 10):
    recall, precision = PrecisionRecall(test_dict, small_set, urm, U, S, Vt, uTest, n)
    coverage = Coverage(test_dict, urm, U, S, Vt, uTest, n)
    recalls.append(recall)
    precisions.append(precision)
    coverages.append(coverage)
    
final_recall, final_precision, final_coverage = sum(recalls) / len(range(10, 31, 10)), sum(precisions) / len(range(10, 31, 10)), sum(coverages) / len(range(10, 31, 10))

In [None]:
final_recall, final_precision, final_coverage

In [45]:
import recmetrics
from surprise import Reader, SVD, Dataset
from surprise.model_selection import train_test_split

In [46]:
reader = Reader(rating_scale=(0, int(small_set['frac_f_count'].max()) + 1))
data = Dataset.load_from_df(small_set[['u_index_value', 'f_index_value', 'frac_f_count']], reader)
trainset, testset = train_test_split(data, test_size=0.25)

In [47]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7faa64294f70>

In [48]:
test = algo.test(testset)
test = pd.DataFrame(test)
test.drop("details", inplace=True, axis=1)
test.columns = ['uid', 'fid', 'actual', 'predictions']

In [49]:
print(recmetrics.mse(test.actual, test.predictions))
print(recmetrics.rmse(test.actual, test.predictions))

8780576.314148713
2963.203724712277


In [50]:
cf_model = test.groupby(['uid', 'fid'])['predictions'].max().to_frame()

In [51]:
# cf_model = test.pivot_table(index='uid', columns='fid', values='predictions').fillna(0)

In [52]:
def get_users_predictions(user_id, n, model):
    recommended_items = pd.DataFrame(model.loc[user_id])
    recommended_items.columns = ["predicted_rating"]
    recommended_items = recommended_items.sort_values('predicted_rating', ascending=False)    
    recommended_items = recommended_items.head(n)
    return recommended_items.index.tolist()

In [53]:
def PrecisionRecall(test, small_set, N, model):
    hit = 0
    n_recall = 0
    n_precision = 0

    for user_id in test['uid']:
        preds = get_users_predictions(user_id, N, model)
        actuals = small_set[small_set['u_index_value'] == user_id]['f_index_value'].tolist()
        hit += len(set(preds) & set(actuals))
        n_recall += len(set(actuals))
        n_precision += N
    return hit / (1.0 * n_recall), hit / (1.0 * n_precision)

In [54]:
def Coverage(test, small_set, N, model):
    recom_list = []
    for user_id in test['uid']:
        preds = get_users_predictions(user_id, N, model)
        recom_list.append(set(preds))
        
    for n, recom_s in enumerate(recom_list):
        if n == 0:
            temp = recom_s
        else:
            temp = temp.union(recom_s)

    return len(temp) / len(small_set['fid'].unique().tolist())

In [55]:
recalls = []
precisions = []
coverages = []
for n in range(5, 31, 5):
    recall, precision = PrecisionRecall(test, small_set, n, cf_model)
    coverage = Coverage(test, small_set, n, cf_model)
    recalls.append(recall)
    precisions.append(precision)
    coverages.append(coverage)
    
final_recall, final_precision, final_coverage = sum(recalls) / len(range(5, 31, 5)), sum(precisions) / len(range(5, 31, 5)), sum(coverages) / len(range(5, 31, 5))

In [56]:
final_recall, final_precision, final_coverage

(0.10202328910323744, 0.3658219333333333, 0.29677108092666143)

无论是item-based的, 还是svd, 其本质都是一样的, 都是找到待推荐用户关注的这部分用户相似用户的关注列表进行推荐