In [1]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix
import csv
import ml_metrics

In [2]:
# Load the raw tables from ./data.
users_df = pd.read_csv('./data/users.csv')
subgroups_df = pd.read_csv('./data/subgroups.csv')
# Missing course fields are replaced by empty strings right after loading.
course_df_original = pd.read_csv('./data/courses.csv').fillna('')

# Per-user purchase tables; later cells read their "user_id" and "course_id"
# columns, where "course_id" is a space-separated string of course ids.
train_df = pd.read_csv('./data/train.csv')
test_seen_df = pd.read_csv('./data/test_seen.csv')
val_seen_df = pd.read_csv('./data/val_seen.csv')

In [3]:
# Index label -> course_id. courses.csv is read without an index column, so the
# labels are the default integer row index.
id2course_mapping = course_df_original["course_id"].to_dict()
# Inverse lookup: course_id -> index label (for duplicate ids the last row wins).
course2id_mapping = dict(zip(id2course_mapping.values(), id2course_mapping.keys()))

In [4]:
# Load the pre-built course feature table; missing values become empty strings.
course_df = pd.read_csv('./combination1231.csv').fillna('')
# Prepend the course ids taken from courses.csv as the first column.
# NOTE(review): this relies on both files listing courses in the same row
# order (the Series is aligned on the row index) — confirm.
course_df.insert(0, 'course_id', course_df_original['course_id'])

In [5]:
# Strip HTML markup from every description.
# The whole column is assigned at once instead of writing cell by cell through
# chained indexing (`course_df['description'][i] = ...`): chained assignment
# raises SettingWithCopyWarning and does not write through at all when pandas
# Copy-on-Write is enabled.
# NOTE(review): 'html' lets BeautifulSoup pick whichever HTML parser is
# installed, so the extracted text can differ between environments.
course_df['description'] = course_df['description'].map(
    lambda raw_html: BeautifulSoup(raw_html, 'html').text
)

# Remove newlines, '&<any single char>;' sequences, '--' optionally followed by
# '&', and tabs. The replacements are regex-based and apply to every string
# cell in the frame, not only to the description column.
course_df = (
    course_df
    .replace('\n', '', regex=True)
    .replace('&.;', '', regex=True)
    .replace("--&?", "", regex=True)
    .replace("\t", "", regex=True)
)

# `fillna` is the cleaned frame that the following cells work on
# (the name is kept because later cells reference it).
fillna = course_df.fillna('')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course_df['description'][i] = s1


In [6]:
def combination(name, n):
    """Repeat ``name`` followed by a single space ``n`` times.

    Args:
        name: Text to repeat.
        n: Number of repetitions (the weight of the field).

    Returns:
        ``name + ' '`` concatenated ``n`` times, or the empty string when
        ``n`` is smaller than 1.
    """
    extra_copies = n - 1
    if extra_copies > 0:
        return ''.join(name + ' ' for _ in range(n))
    if extra_copies == 0:
        return name + ' '
    return ''
def combination_count(x, cn,ti,g,sg,gsg,t,d,wl,rt,rbg,tg, kcn,kti,kg,ksg,kgsg,kt,kd,kwl,krt,krbg,ktg):
    """Build one weighted text for a course row.

    Each field of ``x`` is repeated as many times as its weight (via
    ``combination``) and the pieces are concatenated in a fixed order, so a
    larger weight makes the field's terms count more in a bag-of-words model.

    Args:
        x: A course row supporting ``x[column_name]`` lookup.
        cn, ti, g, sg, gsg, t, d, wl, rt, rbg, tg: Weights for the columns
            course_name, teacher_intro, groups, sub_groups, groups+subgroups,
            topics, description, will_learn, required_tools,
            recommended_background and target_group.
        kcn, kti, kg, ksg, kgsg, kt, kd, kwl, krt, krbg, ktg: Weights for the
            columns kw_name, kw_intro, kw_group, kw_sub, kw_gs, kw_topics,
            kw_desc, kw_will, kw_tool, kw_recommend and kw_target.

    Returns:
        The concatenated text; fields with a weight below 1 contribute nothing.
    """
    # (column, weight) pairs in output order. A single table replaces the
    # previous copy-pasted lines and the unused per-column local variables.
    weighted_columns = (
        ('course_name', cn),
        ('teacher_intro', ti),
        ('groups', g),
        ('sub_groups', sg),
        ('groups+subgroups', gsg),
        ('topics', t),
        ('description', d),
        ('will_learn', wl),
        ('required_tools', rt),
        ('recommended_background', rbg),
        ('target_group', tg),
        ('kw_name', kcn),
        ('kw_intro', kti),
        ('kw_group', kg),
        ('kw_sub', ksg),
        ('kw_gs', kgsg),
        ('kw_topics', kt),
        ('kw_desc', kd),
        ('kw_will', kwl),
        ('kw_tool', krt),
        ('kw_recommend', krbg),
        ('kw_target', ktg),
    )
    return ''.join(combination(x[column], weight) for column, weight in weighted_columns)

def create_combination(x):
    """Return the weighted text of course row ``x`` using the tuned weights."""
    # Adjust weights
    # Order: course_name, teacher_intro, groups, sub_groups, groups+subgroups,
    # topics, description, will_learn, required_tools, recommended_background,
    # target_group.
    field_weights = (1, 2, 1, 2, 1, 0, 0, 1, 1, 1, 1)
    # Same column order, for the matching kw_* keyword columns.
    keyword_weights = (1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1)
    return combination_count(x, *field_weights, *keyword_weights)


# One weighted text per course row, stored in a new 'combination' column.
fillna['combination'] = fillna.apply(create_combination, axis=1)

In [7]:
# Read the stop-word list: one entry per line, blank lines ignored.
# The context manager closes the file handle, which the bare open() call leaked.
with open('baidu_stopwords_plus.txt', encoding='utf8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file if line.strip() != '']

In [8]:
# TF-IDF vectorizer using the default tokenizer and the custom stop-word list.
tfidf = TfidfVectorizer(tokenizer=None, stop_words=stopwords)
# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(fillna['combination'])
# Dense copy of the sparse TF-IDF matrix (one row per course).
tfidf_matrix_nd = tfidf_matrix.toarray()
# NOTE(review): the next cell loads this same file as `item_embed_FM`, so on a
# fresh run the "FM" embeddings are just this TF-IDF matrix. If a separately
# trained embedding file was intended, this save overwrites it — confirm.
np.save('item_embeddings_course.npy', tfidf_matrix_nd)



In [9]:
# Load the item embeddings from disk.
# NOTE(review): this is the path the TF-IDF matrix was saved to in the previous
# cell — confirm this is the intended embedding source.
item_embed_FM = np.load('item_embeddings_course.npy')
# Keep only the first 728 rows; presumably the number of courses, so the rows
# line up with the TF-IDF matrix — TODO confirm and replace the magic number.
item_embed_FM = item_embed_FM[:728]

In [10]:
# Item features: TF-IDF columns followed by the loaded embeddings scaled by 1/2
# (both arrays must have the same number of rows).
concatenate_item_embed_nd = np.concatenate([tfidf_matrix_nd, item_embed_FM/2], axis = 1)
concatenate_item_embed = csr_matrix(concatenate_item_embed_nd)
# Compute the pairwise item-item similarity matrix.
# NOTE(review): linear_kernel returns plain dot products; that equals cosine
# similarity only when every row has unit L2 norm, which the concatenated rows
# are not guaranteed to have — confirm the dot product is what is wanted.
cosine_sim2 = linear_kernel(concatenate_item_embed, concatenate_item_embed)

In [11]:
def get_new_recommend(haved_courses_list, cosine_sim, top = 50):
    """Recommend courses similar to the ones a user already owns.

    Every course is scored by the sum of its similarities to the owned
    courses. The highest-scoring courses the user does not own yet are
    returned; ties keep ascending course-index order.

    Reads the module-level ``course2id_mapping`` (course_id -> row index) and
    ``course_df`` (its 'course_id' column turns indices back into ids).

    Args:
        haved_courses_list: Course ids the user already owns. An id listed
            several times is counted several times.
        cosine_sim: Square item-item similarity matrix, indexable by row.
        top: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top`` course ids, best match first.

    Raises:
        KeyError: If an owned course id is missing from ``course2id_mapping``.
    """
    owned_indices = [course2id_mapping[course_id] for course_id in haved_courses_list]
    # A set gives O(1) membership tests in the filtering loop below.
    owned_lookup = set(owned_indices)

    # Accumulate one similarity row per owned course. Rows are added one after
    # the other so the floating-point summation order is unchanged.
    scores = np.zeros(len(course2id_mapping), dtype=float)
    for idx in owned_indices:
        row = np.asarray(cosine_sim[idx], dtype=float)
        scores[:row.shape[0]] += row

    # Descending score; a stable sort keeps lower indices first among ties.
    ranked_indices = np.argsort(-scores, kind='stable')

    recommend_indices = []
    for course_idx in ranked_indices:
        # Stop scanning as soon as enough recommendations were collected.
        if len(recommend_indices) >= top:
            break
        # Skip courses the user already has.
        if course_idx not in owned_lookup:
            recommend_indices.append(int(course_idx))

    return course_df['course_id'].iloc[recommend_indices].tolist()

# Validate the accuracy of our approach

In [12]:
# user_id -> list of course ids bought in the training split
# (the "course_id" column holds space-separated ids).
train_haved_purchased_course = {}
for seen_user_id, course_ids in zip(train_df["user_id"], train_df["course_id"]):
    train_haved_purchased_course.setdefault(seen_user_id, []).extend(course_ids.split(' '))

# user_id -> list of course ids bought in the validation split.
# Missing values are turned into empty strings first, and users without any
# purchase get no entry.
val_course = {}
val_seen_df_fillna = val_seen_df.fillna("")
for seen_user_id, course_ids in zip(val_seen_df_fillna["user_id"], val_seen_df_fillna["course_id"]):
    if len(course_ids) == 0:
        continue
    val_course.setdefault(seen_user_id, []).extend(str(token) for token in course_ids.split(' '))

In [13]:
# Score the recommender on the validation users with MAP@50.
answers, predictions, map50s = [], [], []
for user_id in val_seen_df["user_id"]:
    # Recommend from the user's training purchases only.
    prediction_idxs = get_new_recommend(train_haved_purchased_course[user_id], cosine_sim2, top = 50)
    # Users without validation purchases are scored against an empty answer list.
    answer_idxs = val_course.get(user_id, [])
    predictions.append(prediction_idxs)
    answers.append(answer_idxs)
    map50s.append(ml_metrics.mapk(predicted= [prediction_idxs], actual= [answer_idxs], k = 50))

# Mean of the per-user scores.
print(np.mean(map50s))
# The same metric computed in one call over all users.
map50 = ml_metrics.mapk(predicted= predictions, actual= answers, k = 50)

0.07197245330068001


# Predict and Save

In [14]:
predict_user = test_seen_df["user_id"].to_list()

# user_id -> every course bought in the training and validation splits.
# A missing "course_id" is treated as "no purchase" instead of crashing on
# float.split, matching how the validation cell handles it. Every user still
# gets an entry (possibly empty), so later lookups by user_id succeed.
seen_user_haved_purchased_course = {}
for purchases_df in (train_df, val_seen_df):
    course_id_column = purchases_df["course_id"].fillna("")
    for seen_user_id, course_ids in zip(purchases_df["user_id"], course_id_column):
        purchased = seen_user_haved_purchased_course.setdefault(seen_user_id, [])
        if len(course_ids) > 0:
            purchased.extend(course_ids.split(' '))

In [15]:
# Write the top-50 recommendations of every test user to the submission file:
# one row per user, with the course ids joined by single spaces.
predict_users = test_seen_df["user_id"].to_list()
# newline="" is required by the csv module; without it the writer's "\r\n"
# line endings are translated again on Windows and produce blank rows.
with open("course_predict50.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "course_id"])
    for user_id in predict_users:
        recommend = " ".join(get_new_recommend(seen_user_haved_purchased_course[user_id], cosine_sim2, top = 50))
        writer.writerow([user_id, recommend])