# Megogo

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql.functions import col
from pyspark import sql, SparkConf, SparkContext

from multiprocessing import Pool
import multiprocessing
from functools import partial

from lightfm.data import Dataset
from lightfm import LightFM
from collections import Counter

import pandas as pd
import numpy as np
import json
import copy
import os
from tqdm.auto import tqdm

%load_ext Cython




# Train test split

In [2]:
def check_path(path):
    os.system("if [ ! -d " + path + " ]; then mkdir -p " + path + "; fi")

In [3]:
DATA_PATH = 'data/'
SAVE_PATH = 'save/'

In [4]:
train_full = pd.read_csv(DATA_PATH + 'train_data_full.csv')
train_full.head(2)

Unnamed: 0,session_start_datetime,user_id,user_ip,primary_video_id,video_id,vod_type,session_duration,device_type,device_os,player_position_min,player_position_max,time_cumsum_max,video_duration,watching_percentage
0,2018-07-01 00:00:02.135,21603820,27241033,9583642,9583642,svod,688,web,Windows_10,6940,6940,93,8198,0.0839
1,2018-07-01 00:00:02.232,35636970,10887511,24645936,24645936,advod,3174,mobile,android,599,3173,0,5297,0.599


In [5]:
proportion_train = int(0.8*len(train_full))
train = train_full.loc[:proportion_train, :]
test = train_full.loc[proportion_train+1:, :]

In [6]:
#save split local
check_path(SAVE_PATH)

In [7]:
train.to_csv(SAVE_PATH+'train_80_20.csv', index=None)
test.to_csv(SAVE_PATH+'test_80_20.csv', index=None)

In [8]:
del train
del test
del train_full

# Model Spark ALS

In [9]:
train_path = SAVE_PATH+'train_80_20.csv'

In [10]:
conf = SparkConf().setAppName("test")
conf = (conf.setMaster('local[6]')
        .set('spark.executor.memory', '25G')
        .set('spark.driver.memory', '25G')
        .set('spark.driver.maxResultSize', '25G'))
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)

In [11]:
#in "local" write number of cpu to use

spark = SparkSession.builder.master("local[6]").appName("test").getOrCreate()

data_schema = StructType([
    StructField('session_start_datetime',TimestampType(), False),
    StructField('user_id',IntegerType(), False),
    StructField('user_ip',IntegerType(), False),
    StructField('primary_video_id',IntegerType(), False),
    StructField('video_id',IntegerType(), False),
    StructField('vod_type',StringType(), False),
    StructField('session_duration',IntegerType(), False),
    StructField('device_type',StringType(), False),
    StructField('device_os',StringType(), False),
    StructField('player_position_min',LongType(), False),
    StructField('player_position_max',LongType(), False),
    StructField('time_cumsum_max',LongType(), False),
    StructField('video_duration',IntegerType(), False),
    StructField('watching_percentage',FloatType(), False)
])
final_stat = spark.read.csv(
    train_path, header=True, schema=data_schema
).cache()

In [12]:
final_stat.limit(3).toPandas()

Unnamed: 0,session_start_datetime,user_id,user_ip,primary_video_id,video_id,vod_type,session_duration,device_type,device_os,player_position_min,player_position_max,time_cumsum_max,video_duration,watching_percentage
0,2018-07-01 00:00:02.135,21603820,27241033,9583642,9583642,svod,688,web,Windows_10,6940,6940,93,8198,0.0839
1,2018-07-01 00:00:02.232,35636970,10887511,24645936,24645936,advod,3174,mobile,android,599,3173,0,5297,0.599
2,2018-07-01 00:00:06.961,78312976,15427448,25397362,23346676,advod,3054,tv,samsung,599,3052,3032,3052,1.0


In [13]:
#for selecting users
# ids = [21603820, 78312976, 53477088]
# users_spark = final_stat.filter(final_stat.user_id.isin(ids)).select('user_id').distinct()

## Train ALS

In [14]:
ratings = (final_stat
    .select(
        'user_id',
        'primary_video_id',
        'watching_percentage',
    )
).repartition(200).cache()

In [15]:
%%time
als = ALS(rank=175, maxIter=10,
          implicitPrefs=True,
          regParam=1,
          alpha=50,
          userCol="user_id", itemCol="primary_video_id", ratingCol="watching_percentage",
          numUserBlocks=32, numItemBlocks=32,
          coldStartStrategy="drop")
model = als.fit(ratings)

CPU times: user 344 ms, sys: 85.9 ms, total: 430 ms
Wall time: 25min 49s


## predict ALS

In [16]:
%%time
userRecsDf = model.recommendForAllUsers(25).cache()
userRecsDf.count()

CPU times: user 101 ms, sys: 4.98 ms, total: 106 ms
Wall time: 5min 52s


In [17]:
#userSubsetRecs = model.recommendForUserSubset(users_spark, 35).cache()

In [18]:
userRecs = userRecsDf.toPandas()
userRecs.shape

(370865, 2)

In [19]:
predicted_dict = userRecs.set_index('user_id').to_dict('index')
predicted_dict = {str(user_id):[r[0] for r in recs['recommendations']] for user_id, recs in predicted_dict.items()}
#len(predicted_dict)

## save predict 

In [20]:
with open(SAVE_PATH+'ALS_predicted_for_all_users_on_full_175_rank.json', 'w') as f:
    json.dump(predicted_dict, f)

In [21]:
try:
    del userRecs
    del userRecsDf
    del model
    del data_schema
    del ratings
except:
    pass

# LightFM

### prepera dataset

In [22]:
train = pd.read_csv(SAVE_PATH+'train_80_20.csv')
test = pd.read_csv(SAVE_PATH+'test_80_20.csv')

In [23]:
# recommend for new users
raw_sub = [4201568, 10062382, 17141087, 3657616, 19624341,
           32222690, 6011714, 4874001, 3551683, 10976832, 
           30564130, 3696132, 18189717, 27740348, 2677761,
           19901318, 28309169, 3011924, 26106566, 29646969,
           19134359, 29114276, 25397362, 27230332, 11843490,
           28765172, 21172094, 18846225, 22241900, 762848,
           13914469, 8817934, 26997030, 16742579, 32970046]

In [24]:
train_iter = train[['user_id', 'primary_video_id', 'watching_percentage']].to_dict('records')
dataset = Dataset()
dataset.fit((x['user_id'] for x in train_iter),
            (x['primary_video_id'] for x in train_iter))

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

(interactions, weights) = dataset.build_interactions(((x['user_id'], x['primary_video_id'], x['watching_percentage'])
                                                      for x in train_iter))

Num users: 370865, num_items 7099.


In [25]:
def sample_recommendation(model, dataset, inv_map, user_map_to_int, user_id, k):
    
    user_id_int = user_map_to_int[user_id]
    n_users, n_items = dataset.interactions_shape()     
    scores = model.predict(user_id_int, np.arange(n_items))     
    top_items = np.argsort(-scores)[:k]   
    result = []
    for x in top_items:
        result.append(inv_map[x])
    return result

## Train LightFM

In [26]:
model = LightFM(loss='warp', no_components = 110)
model.fit(interactions=interactions, sample_weight = weights, epochs=40, num_threads=7)

<lightfm.lightfm.LightFM at 0x7f2720134400>

In [27]:
# dict of items
item_map_to_int = dataset.mapping()[2]
item_map = {v: k for k, v in item_map_to_int.items()}
user_map_to_int = dataset.mapping()[0]
user_map = {v: k for k, v in user_map_to_int.items()}

## Predict LightFM

In [28]:
users_in_test = test.user_id.unique()
result_test_to_submit = {}

for user_uid in tqdm(users_in_test):
    try:
        user_cat = user_map_to_int[user_uid]
        
        # perform inference
        recs = sample_recommendation(model, dataset, item_map, user_map_to_int, user_uid, 55)
    
        # drop scores and transform model's internal elelemnt category to element_uid for every prediction
        # also convert np.uint64 to int so it could be json serialized later
        result_test_to_submit[int(user_uid)] = [int(i) for i in recs]


    except LookupError:
        result_test_to_submit[int(user_uid)] = raw_sub




### save predict

In [29]:
with open(SAVE_PATH+'LightFM_predict.json', 'w') as f:
    json.dump(result_test_to_submit, f)

# Filtration

In [30]:
# with open(SAVE_PATH+'ALS_predicted_for_all_users_on_full_175_rank.json', 'r') as f:
#     predicted_dict = json.load(f)

In [31]:
users_in_test = test.user_id.unique()
users_in_train = train.user_id.unique()
known_users = list(set(users_in_train).intersection(set(users_in_test)))
unknown_users = list((set(users_in_test))-set(users_in_train))

In [32]:
predict_ALS = {}
for user in known_users:
    predict_ALS[user] = predicted_dict[str(user)]
for user in unknown_users:
    predict_ALS[user] = raw_sub

In [33]:
del predicted_dict

# Users preferences

In [34]:
train_path = SAVE_PATH+'train_80_20.csv'

In [35]:
df_tr = pd.read_csv(train_path)
df_tr.loc[df_tr.primary_video_id != df_tr.video_id, 'is_serial'] = 1

In [36]:
temp = df_tr.groupby(['user_id', 'primary_video_id', 'is_serial']).size().reset_index()
temp = temp.groupby(['user_id', 'is_serial']).size().reset_index()

In [37]:
users_who_have_watched_movies = set(temp.loc[temp.is_serial == 0, 'user_id'].unique())
users_who_have_watched_serials = set(temp.loc[temp.is_serial == 1, 'user_id'].unique())

In [38]:
users_who_watch_only_movies = users_who_have_watched_movies - users_who_have_watched_serials
users_who_watch_only_serials = users_who_have_watched_serials - users_who_have_watched_movies
user_who_watch_both = users_who_have_watched_serials.intersection(users_who_have_watched_movies)

In [39]:
with open('users_movies_only.txt', 'w') as f:
    for item in users_who_watch_only_movies:
        f.write("%s\n" % item)
        
with open('users_serials_only.txt', 'w') as f:
    for item in users_who_watch_only_serials:
        f.write("%s\n" % item)
        
with open('user_both.txt', 'w') as f:
    for item in user_who_watch_both:
        f.write("%s\n" % item)

In [40]:
del df_tr
del temp
del users_who_have_watched_movies
del users_who_have_watched_serials

# Create a filter

In [41]:
text_file = open("users_serials_only.txt", "r")
lines = text_file.read().split('\n')
text_file.close()
serials_u = set([int(i) for i in lines[:-1]])

text_file = open("users_movies_only.txt", "r")
lines = text_file.read().split('\n')
text_file.close()
films_u = set([int(i) for i in lines[:-1]])

text_file = open("user_both.txt", "r")
lines = text_file.read().split('\n')
text_file.close()
both_u = set([int(i) for i in lines[:-1]])

In [42]:
meta = pd.read_csv(DATA_PATH+'video_meta_data_full.csv')
series_in_video = meta.groupby('primary_video_id').count()['video_id']

In [43]:
films = set(meta[meta.type.isin(['FILM','FILM3D','MULTFILM', 'SHOWFILM'])]['primary_video_id'])
notfilms = set(meta[~meta.type.isin(['FILM','FILM3D', 'MULTFILM', 'SHOWFILM'])]['primary_video_id'])
mult = set(meta[meta.type.isin(['MULTFILM', 'MULTSERIAL'])]['primary_video_id'])

In [44]:
# postprocessing (filterring recommendations)
    
def check(u_to_observ, result_test_to_submit):
    
    to_observ = result_test_to_submit[u_to_observ]
    try:
        if u_to_observ in films_u:
            for item in copy.deepcopy(to_observ):
                if item in notfilms:
                    to_observ.remove(item)
                    continue
                if np.sum(np.array(train_f[str(u_to_observ)])==item)>=1:
                    to_observ.remove(item)

        elif u_to_observ in serials_u:
            for item in copy.deepcopy(to_observ):
                if item in films:
                    to_observ.remove(item)
                    continue
                if np.sum(np.array(train_f[str(u_to_observ)])==item)==series_in_video[item]:
                    to_observ.remove(item)
        else:
            for item in copy.deepcopy(to_observ):
                if (item in films) and (np.sum(np.array(train_f[str(u_to_observ)])==item)>=1):
                    to_observ.remove(item)
                    continue
                if (item in notfilms) and (np.sum(np.array(train_f[str(u_to_observ)])==item)==series_in_video[item]):
                    to_observ.remove(item)
        return(to_observ)
    
    except LookupError:
        return(to_observ)

### Transform train and test to dict

In [45]:
def batch_helper(users, test):
    test = test[test.user.isin(users)]
    one_res_dic = {}
    for user in users:
        one_res_dic[str(user)] = list(test[test.user==user].item)
    return one_res_dic

def batch_dic(test):
    users = list(test.user.unique())
    cpus_to_use = (multiprocessing.cpu_count() - 1)
    one_chunk = int(len(users)/cpus_to_use)
    chunks = [users[i*one_chunk:(i+1)*one_chunk] for i in range(cpus_to_use)]
    for u in users[(cpus_to_use+1)*one_chunk:]:
           chunks[-1].append(u)

    result = []
    result_dic = {}
    p = Pool()
    result = p.map(partial(batch_helper, test=test), chunks)

    for one_res in result:
           result_dic.update(one_res)

    return result_dic

In [46]:
train = train[['user_id', 'primary_video_id', 'watching_percentage']]
train.columns = ['user', 'item', 'rate']

In [47]:
%%time
train_f = batch_dic(train)

CPU times: user 8.33 s, sys: 4.47 s, total: 12.8 s
Wall time: 2min 57s


#### Filter for LightFM

In [48]:
result_test_to_submit_LightFM = {}
for k in tqdm(result_test_to_submit.keys()):
    result_test_to_submit_LightFM[k] = check(k, result_test_to_submit)




#### Filter for SparkALS

In [49]:
result_test_to_submit_SparkALS = {}
for k in tqdm(predict_ALS.keys()):
    result_test_to_submit_SparkALS[k] = check(k, predict_ALS)




### Merge 2 results

In [50]:
lightfm = result_test_to_submit_LightFM
als = result_test_to_submit_SparkALS

In [71]:
for user in unknown_users:
    als[user] = raw_sub
    lightfm[user] = raw_sub
    
for k in als.keys():
    if len(als[k]) == 0:
        als[k] = raw_sub

for k in lightfm.keys():
    if len(lightfm[k]) == 0:
        lightfm[k] = raw_sub

In [72]:
test_users = users_in_test

In [61]:
res = []
for k in list(test_users):
    try: 
        a = [str(el) for el in als[k]]
        res.append(' '.join(a))
    except KeyError:
        a = [str(el) for el in raw_sub]
        res.append(' '.join(a))
        
sub1 = pd.DataFrame({'user_id':list(test_users), 'primary_video_id':res})

In [62]:
sub1.head(2)

Unnamed: 0,primary_video_id,user_id
0,32222690 11269852 29646969 18189717 10567708 3...,109804426
1,5794535 5446025 22575620 5760990 18369026 2277...,22096103


In [63]:
res = []
for k in list(test_users):
    try: 
        a = [str(el) for el in lightfm[k]]
        res.append(' '.join(a))
    except KeyError:
        a = [str(el) for el in raw_sub]
        res.append(' '.join(a))
        
sub2 = pd.DataFrame({'user_id':list(test_users), 'primary_video_id':res})

In [64]:
sub2.head(2)

Unnamed: 0,primary_video_id,user_id
0,3696132 2677761 32222690 28765172 18763922 277...,109804426
1,5794535 2677761 27740348 3696132 11839626 2876...,22096103


In [65]:
def make_int(sub):
    res = []
    for row in sub:
        res.append([int(item) for item in row])
    return res

In [66]:
users1 = sub1['user_id']
users2 = sub2['user_id']

sub1 = [i.split(' ') for i in sub1.primary_video_id.values]
sub2 = [i.split(' ') for i in sub2.primary_video_id.values]

sub1 = make_int(sub1)
sub2 = make_int(sub2)

result1 = {}
for j in range(len(users1)):
    user = users1[j]
    result1[user] = {}
    for i in range(len(sub1[j])):
        result1[user][sub1[j][i]] = (200-i)
        
result2 = {}
for j in range(len(users2)):
    user = users2[j]
    result2[user] = {}
    for i in range(len(sub2[j])):
        result2[user][sub2[j][i]] = (200-i)

In [67]:
final_result = {}

for k in tqdm(result1.keys()):
    A = Counter(result1[k])
    B = Counter(result2[k])
    a = dict(A+B)
    final_result[str(k)] = sorted(a, key=a.get, reverse=True)[:10]




# Metric

In [73]:
%%cython
def average_precision(
        dict data_true,
        dict data_predicted,
        const unsigned long int k
) -> float:
    cdef:
        unsigned long int n_items_predicted
        unsigned long int n_items_true
        unsigned long int n_correct_items
        unsigned long int item_idx

        double average_precision_sum
        double precision

        set items_true
        list items_predicted

    if not data_true:
        raise ValueError('data_true is empty')

    average_precision_sum = 0.0

    for key, items_true in data_true.items():
        items_predicted = data_predicted.get(key, [])

        n_items_true = len(items_true)
        n_items_predicted = min(len(items_predicted), k)

        if n_items_true == 0 or n_items_predicted == 0:
            continue

        n_correct_items = 0
        precision = 0.0

        for item_idx in range(n_items_predicted):
            if items_predicted[item_idx] in items_true:
                n_correct_items += 1
                precision += <double>n_correct_items / <double>(item_idx + 1)

        average_precision_sum += <double>precision / <double>min(n_items_true, k)

    return average_precision_sum / <double>len(data_true)

def metric(true_data, predicted_data, k=20):
    true_data_set = {k: set(v) for k, v in true_data.items()}

    return average_precision(true_data_set, predicted_data, k=k)

## Filter test for relevant 

In [74]:
test.head(2)

Unnamed: 0,session_start_datetime,user_id,user_ip,primary_video_id,video_id,vod_type,session_duration,device_type,device_os,player_position_min,player_position_max,time_cumsum_max,video_duration,watching_percentage
1,2018-09-13 12:46:34.980,22096103,45934161,5794535,32408296,advod,434,mobile,iOS 9.3.5,0,405,0,405,1.0
2,2018-09-13 12:46:36.188,66879139,65943854,14497595,14497595,advod,9426,android_tv,android,557,5815,0,5817,1.0


In [75]:
test = test[test.watching_percentage >= 0.5]

In [76]:
test = test[['user_id', 'primary_video_id', 'watching_percentage']]
test.columns = ['user', 'item', 'rate']

In [77]:
test_f = batch_dic(test)

## Local validation

In [78]:
metric(test_f,final_result, k=10)

0.0546808932682842

## Save to submit

In [80]:
res = []
for k in list(final_result.keys()):
    a = final_result[k]
    res.append(' '.join(str(a)))

In [81]:
sub_csv = pd.DataFrame({'user_id':list(final_result.keys()), 'primary_video_id':res})
sub_csv.to_csv('super_final_submit.csv', index=False)