In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from collections import Counter
import seaborn as sns

In [2]:
course_chapter_items_df = pd.read_csv('./data/course_chapter_items.csv')
course_df = pd.read_csv('./data/courses.csv')
users_df = pd.read_csv('./data/users.csv')
subgroups_df = pd.read_csv('./data/subgroups.csv')

train_group_df = pd.read_csv('./data/train_group.csv')
test_seen_group_df = pd.read_csv('./data/test_seen_group.csv')
val_seen_group_df = pd.read_csv('./data/val_seen_group.csv')
test_unseen_group_df = pd.read_csv('./data/test_unseen_group.csv')
val_unseen_group_df = pd.read_csv('./data/val_unseen_group.csv')

train_df = pd.read_csv('./data/train.csv')
test_seen_df = pd.read_csv('./data/test_seen.csv')
val_seen_df = pd.read_csv('./data/val_seen.csv')
test_unseen_df = pd.read_csv('./data/test_unseen.csv')
val_unseen_df = pd.read_csv('./data/val_unseen.csv')

In [3]:
def check_row_number_with_unique_amount(df, df_name, column):
    """Assert that ``column`` holds a distinct value on every row of ``df``.

    Args:
        df: DataFrame to inspect.
        df_name: Human-readable name of ``df``; only used in the messages.
        column: Label of the column whose values must all be distinct.

    Raises:
        AssertionError: If the number of rows differs from the number of
            distinct values in ``column``.
    """
    row_count = len(df.index)
    distinct_count = len(pd.unique(df.loc[:, column]))
    assert row_count == distinct_count, (
        f'dataframe {df_name} row number != unique {column} amount'
    )
    # Only reached when the check passed; echo both counts for the reader.
    print(
        f'dataframe {df_name} row number == unique {column} amount '
        f'({row_count} == {distinct_count})'
    )

def check_unique_items_are_same_set(df1, df1_name, df1_column, df2, df2_name, df2_column):
    """Assert that two columns contain exactly the same set of distinct values.

    Args:
        df1, df2: DataFrames to compare.
        df1_name, df2_name: Human-readable names used in the messages.
        df1_column, df2_column: Column labels whose distinct values are compared.

    Raises:
        AssertionError: If any value appears in one column but not the other.
    """
    left_label = f'unique {df1_name}.{df1_column}'
    right_label = f'unique {df2_name}.{df2_column}'

    left_values = set(df1.loc[:, df1_column].to_list())
    right_values = set(df2.loc[:, df2_column].to_list())

    # Values present on exactly one side; empty when the sets agree.
    mismatched = left_values ^ right_values
    assert not mismatched, f'{left_label} & {right_label} are different'
    print(f'{left_label} & {right_label} are the same')


## Use LightFM model dataset

### Build a JSON file in the format ("user_id", "subgroup")

In [4]:
# users_df       -> columns used: user_id, interests
# train_group_df -> columns used: user_id, subgroup

##### build {user_id : interest}

In [5]:
# Inspect column names, dtypes and non-null counts of the user table.
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130566 entries, 0 to 130565
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   user_id            130566 non-null  object
 1   gender             85371 non-null   object
 2   occupation_titles  29056 non-null   object
 3   interests          82756 non-null   object
 4   recreation_names   31935 non-null   object
dtypes: object(5)
memory usage: 5.0+ MB


In [6]:
# Lookup table: subgroup display name -> numeric subgroup_id.
# Built with dict(zip(...)) so that no loop variable rebinds the builtin `id`
# (or leaves a stray `name`) in the notebook namespace.
subgroups2idx = dict(zip(subgroups_df["subgroup_name"], subgroups_df["subgroup_id"]))

subgroups2idx

{'更多生活品味': 1,
 '護膚保養與化妝': 2,
 '平面設計': 3,
 '繪畫與插畫': 4,
 '電腦繪圖': 5,
 '應用設計': 6,
 '求職': 7,
 '英文': 8,
 '手作小物': 9,
 'DJ': 10,
 '更多音樂': 11,
 '更多藝術': 12,
 '烹飪料理與甜點': 13,
 '壓力舒緩': 14,
 '運動': 15,
 '親子教育': 16,
 '手工印刷': 17,
 '手工書': 18,
 '動態攝影': 19,
 '素描': 20,
 '樂器': 21,
 '色彩學': 22,
 '字體設計': 23,
 '手寫字': 24,
 '動態設計': 25,
 '音樂理論': 26,
 '刺繡': 27,
 '日文': 28,
 '心靈成長與教育': 29,
 '音樂創作': 30,
 '氣球': 31,
 '程式入門': 32,
 '程式語言': 33,
 '網頁前端': 34,
 '文書處理': 35,
 '角色設計': 36,
 '資料彙整': 37,
 '介面設計': 38,
 '網頁設計': 39,
 '商業攝影': 40,
 '網站架設': 41,
 '更多程式': 42,
 '程式思維': 43,
 '手機程式開發': 44,
 '韓文': 45,
 '更多手作': 46,
 '更多語言': 47,
 '歐洲語言': 48,
 '人聲': 49,
 '個人品牌經營': 50,
 '更多職場技能': 51,
 '網頁後端': 52,
 '後製剪輯': 53,
 '產品設計': 54,
 '靈性發展': 55,
 '影視創作': 56,
 '資料科學': 57,
 '軟體程式開發與維護': 58,
 '職場溝通': 59,
 '表演藝術': 60,
 '創業': 61,
 '文學': 62,
 '區塊鏈': 63,
 '金融商品': 64,
 '文案': 65,
 '數位行銷': 66,
 '設計理論': 67,
 'AI 人工智慧': 68,
 '投資觀念': 69,
 '理財': 70,
 '社會科學': 71,
 '社群行銷': 72,
 '影像創作': 73,
 '遊戲開發': 74,
 '程式理財': 75,
 '量化交易': 76,
 '更多設計': 77,
 '獨立接案': 78,
 '寵

In [7]:
# Replace missing values with the sentinel string "None" so every user has a
# string in the `interests` column.
df = users_df.fillna("None")
interest_df = df["interests"]
userid_df = df["user_id"]

# One entry per user in both lists, in the same row order.
# The previous loop used `extend(interest.split(', '))`, which adds more than
# one element for a user whose interests string contains ', ' and would shift
# every later interest onto the wrong user. Taking the columns as-is keeps
# the two lists aligned by construction (and no longer shadows builtin `id`).
users_list = userid_df.to_list()
interest_list = interest_df.to_list()

In [8]:
# Sanity check: both lists should have the same number of entries.
len(users_list), len(interest_list)

(130566, 130566)

In [9]:
# Peek at the first user id and the first raw interests string.
users_list[0], interest_list[0]

('54ccaa73a784960a00948687',
 '職場技能_創業,藝術_電腦繪圖,設計_介面設計,設計_動態設計,設計_平面設計,投資理財_投資觀念,行銷_數位行銷,藝術_角色設計,藝術_繪畫與插畫,職場技能_個人品牌經營')

In [10]:
# Map each user id to its raw interests string.
# Assumes users_list and interest_list are index-aligned (same length, same order).
user_info = dict(zip(users_list, interest_list))

In [12]:
# Subgroup ids that the coarse interest labels below expand to.
investment = [69, 70, 75, 76, 87]
crypto_coin = [63, 64, 66]

# Interest names that are not looked up in subgroups2idx but expanded by hand.
# An empty list means the interest is skipped.
special_interest_ids = {
    "篆刻": [],
    "西班牙文": [47, 48],
    "更多投資理財": investment + [81],
    "比特幣": investment + crypto_coin,
}

# `a` maps user_id -> list of subgroup ids for every user that has interests.
a = {}
for key, raw_interests in user_info.items():
    # Users without interests carry the "None" fill value; they get the [0]
    # placeholder and are left out of `a`. Accepting an existing [0] as well
    # makes the cell safe to re-run (the old check crashed on the second run
    # because it called .split on the placeholder list).
    if raw_interests == "None" or raw_interests == [0]:
        user_info[key] = [0]
        continue

    name_list = []
    for interest in raw_interests.split(","):
        # Entries look like "<group>_<subgroup name>"; keep the subgroup name.
        interest_name = interest.split("_")[1]
        if interest_name in special_interest_ids:
            # extend() adds the ids themselves. The previous
            # `append(id for id in ...)` stored a generator object instead.
            name_list.extend(special_interest_ids[interest_name])
        else:
            name_list.append(subgroups2idx[interest_name])
    a[key] = name_list

# Downstream code reads info[0], so keep the list-of-one-dict shape. The old
# loop appended the very same dict once per user, which only wasted memory.
info = [a]

In [13]:
# One (user_id, list-of-subgroup-ids) record per user, turned into a two-column frame.
interest_records = [(uid, subgroup_ids) for uid, subgroup_ids in info[0].items()]
df = pd.DataFrame(interest_records, columns=["user_id", "interests"])

In [14]:
# Preview on the first five users only: spread each interests list over one
# column per position, then melt back to one (user_id, interest) row per
# entry, dropping the padding NaNs and the helper position column.
interests_wide_preview = (
    df.head()
    .set_index('user_id')['interests']
    .apply(pd.Series)
    .reset_index()
)
interests_long_preview = (
    interests_wide_preview
    .melt(id_vars=['user_id'], value_name='interests')
    .dropna()
    .drop(columns=['variable'])
)
interests_long_preview[:5]

Unnamed: 0,user_id,interests
0,54ccaa73a784960a00948687,61.0
1,54dca4456d7d350900e86bae,25.0
2,54e421bac5c9c00900cd8d47,3.0
3,54e961d4c5c9c00900cd8d84,70.0
4,54e9b744c5c9c00900cd8d8a,34.0


In [15]:
# Users without any subgroup get the placeholder string "0" so that
# .split(' ') below always receives a string.
# Use a dedicated name for this temporary: the old code stored it in
# `train_df`, overwriting the frame loaded from train.csv with unrelated data.
train_group_filled = train_group_df.fillna("0")
train_id = train_group_filled["user_id"].to_list()
train_subgroup = train_group_filled["subgroup"].to_list()

# user_id -> list of subgroup id strings (space-separated in the CSV).
# A comprehension keeps the loop variables local, so builtin `id` is not shadowed.
train_info = {
    uid: subgroups.split(' ')
    for uid, subgroups in zip(train_id, train_subgroup)
}

In [16]:
# One (user_id, list-of-subgroup-strings) record per training user.
train_records = [(uid, subgroups) for uid, subgroups in train_info.items()]
train_df = pd.DataFrame(train_records, columns=["user_id", "subgroup"])

In [17]:
# Spread each user's subgroup list over one column per position ...
train_subgroups_wide = (
    train_df
    .set_index('user_id')['subgroup']
    .apply(pd.Series)
    .reset_index()
)
# ... then melt to one (user_id, subgroup) row per interaction, dropping the
# NaN padding of shorter lists and the helper position column.
train_data = (
    train_subgroups_wide
    .melt(id_vars=['user_id'], value_name='subgroup')
    .dropna()
    .drop(columns=['variable'])
)

In [18]:
import json
from lightfm.data import Dataset

# Register every user id and subgroup label seen in the training interactions
# so the dataset can assign them internal indices.
dataset = Dataset()
dataset.fit(train_data['user_id'], train_data['subgroup'])



In [19]:
# Report how many distinct users and items the fitted dataset knows about.
interaction_shape = dataset.interactions_shape()
num_users, num_items = interaction_shape
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 59737, num_items 92.


In [20]:
# Build the sparse user x item interaction matrix (and its weight matrix)
# from the (user_id, subgroup) pairs.
train_pairs = zip(train_data['user_id'], train_data['subgroup'])
training_data, weights = dataset.build_interactions(train_pairs)

print(repr(training_data))

<59737x92 sparse matrix of type '<class 'numpy.int32'>'
	with 235302 stored elements in COOrdinate format>


In [21]:
from lightfm import LightFM

# Train a WARP-loss LightFM model on the interaction matrix.
# random_state pins the pseudo-random generator LightFM uses for parameter
# initialisation and data shuffling; without it every Restart & Run All
# produced a different model and different predictions.
model = LightFM(loss='warp', random_state=42)
model.fit(training_data, epochs=100, num_threads=1)

<lightfm.lightfm.LightFM at 0x18da51654f0>

### Prediction

In [22]:
# Inspect column names, dtypes and non-null counts of the test_seen group table.
test_seen_group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7205 entries, 0 to 7204
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   7205 non-null   object
 1   subgroup  7205 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 112.7+ KB


In [23]:
test_id = test_seen_group_df["user_id"].to_list()
test_subgroup = test_seen_group_df["subgroup"].to_list()

# user_id -> subgroup value from the test_seen group file.
# dict(zip(...)) replaces the manual loop, which rebound the builtin `id`
# at notebook scope.
test_seen_info = dict(zip(test_id, test_subgroup))


In [24]:
# One (user_id, subgroup) record per test_seen user.
test_seen_records = [(uid, subgroup) for uid, subgroup in test_seen_info.items()]
test_seen_daf = pd.DataFrame(test_seen_records, columns=["user_id", "subgroup"])

In [25]:
# Display the test_seen frame (pandas truncates the middle rows).
test_seen_daf

Unnamed: 0,user_id,subgroup
0,5c6e7a8d6d180f002084a746,1
1,5e11926d54511e0d1440d024,1
2,59a1bbeb3ba5a507005d94bf,1
3,5a11254d516eb50051f99473,1
4,56702e0f13ef621200899d90,1
...,...,...
7200,5a181ca3152204001e1bf92b,1
7201,59c4b97840fc9f001e640b09,1
7202,5f998423f92f815dbbeb9130,1
7203,5a115497516eb50051f99539,1


#### Build the test_seen dataset

In [26]:
# Register the test_seen user ids and subgroup values in their own Dataset.
# NOTE(review): this is a separate Dataset object from the one built on the
# training data, so its internal indices presumably do not line up with the
# trained model's -- confirm before using it for evaluation.
testseen_dataset = Dataset()
testseen_dataset.fit(test_seen_daf['user_id'], test_seen_daf['subgroup'])

In [27]:
# Build the sparse interaction matrix for the test_seen pairs.
# Note that this rebinds `weights`, replacing the training weight matrix.
test_pairs = zip(test_seen_daf['user_id'], test_seen_daf['subgroup'])
test_data, weights = testseen_dataset.build_interactions(test_pairs)

print(repr(test_data))

<7205x1 sparse matrix of type '<class 'numpy.int32'>'
	with 7205 stored elements in COOrdinate format>


In [28]:
def sample_recommendation(model, data, user_ids, user_id, top_k=4):
    """Return the ``top_k`` best-scoring subgroup ids for each requested user.

    Args:
        model: Fitted LightFM model; its ``predict`` method is used for scoring.
        data: Unused; kept so existing calls keep working.
        user_ids: Iterable of LightFM *internal* user indices (the indices
            assigned by the notebook-level ``dataset``), one per user to score.
        user_id: Unused; kept so existing calls keep working. The previous
            implementation silently overwrote it with its loop variable.
        top_k: Number of subgroups to return per user (default 4, the value
            that used to be hard-coded).

    Returns:
        A list with one entry per element of ``user_ids``; each entry is a
        list of at most ``top_k`` subgroup ids as strings, best score first.

    Notes:
        Relies on the notebook-level ``dataset`` (the LightFM ``Dataset`` the
        model was trained with) and ``subgroups_df``.
    """
    # LightFM scores items by internal index. Translate those indices back to
    # the labels the dataset was fitted with instead of assuming that index i
    # corresponds to row i of subgroups_df, which is not how indices are assigned.
    item_index_map = dataset.mapping()[2]
    labels_by_index = [None] * len(item_index_map)
    for label, index in item_index_map.items():
        labels_by_index[index] = str(label)

    # Only labels that are real subgroup ids may be recommended; this filters
    # out placeholder items such as the "0" used for users without subgroups.
    known_subgroups = {str(subgroup_id) for subgroup_id in subgroups_df["subgroup_id"]}
    is_real_subgroup = np.array(
        [label in known_subgroups for label in labels_by_index], dtype=bool
    )

    # Score every item the model knows about (previously a fixed arange(91),
    # which left out the last item of the 92-item model).
    item_indices = np.arange(len(labels_by_index), dtype=np.int32)

    recommend_list = []
    for internal_user in user_ids:
        scores = model.predict(internal_user, item_indices)
        ranked = np.argsort(-scores)             # best score first
        ranked = ranked[is_real_subgroup[ranked]]
        recommend_list.append([labels_by_index[i] for i in ranked[:top_k]])

    return recommend_list

In [29]:
# Row position -> raw user id; used later to label the rows of predict.csv.
user_id = test_seen_group_df["user_id"].to_dict()

# model.predict needs the internal user indices assigned when `dataset` was
# fitted on the training data. The previous code passed the DataFrame row
# positions 0..n-1, which scored unrelated training users.
user_index_map = dataset.mapping()[0]
unknown_users = [uid for uid in user_id.values() if uid not in user_index_map]
if unknown_users:
    raise KeyError(
        f"{len(unknown_users)} test_seen users are missing from the training "
        f"dataset, e.g. {unknown_users[:3]}"
    )

# Same order as `user_id`, so position i in users_list belongs to user_id[i].
users_list = [user_index_map[uid] for uid in user_id.values()]

In [None]:
# Score every entry of users_list; the result holds one list of recommended
# subgroup ids (as strings) per entry, in the same order.
recommend_list = sample_recommendation(model, test_data, users_list, user_id)

In [31]:
import csv
# Write one row per user: the raw user id and its space-separated recommendations.
# newline="" is what the csv module requires of the file object; without it
# Windows writes an empty line after every row. The encoding is pinned so the
# file does not depend on the platform default.
with open("predict.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "subgroup"])
    for i, recommended in enumerate(recommend_list):
        writer.writerow([user_id[i], " ".join(recommended)])