In [1]:
import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit, BaseMethod
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer
import pandas as pd
from cornac.hyperopt import Discrete, Continuous
from cornac.hyperopt import GridSearch, RandomSearch

from harmonic_mean import HarmonicMean
from serendipity_wrapper import Serendipity
from combined_eval_method import CombinedBaseMethod
from new_random_search import NewRandomSearch

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.


In [2]:
def df_to_tuplelist(df):

    # transform into tuples
    tuple_list = list(df.itertuples(index=False, name=None))

    # rearrange
    for i in range(len(tuple_list)):
        tuple_list[i] = (str(tuple_list[i][1]), str(tuple_list[i][0]), float(tuple_list[i][2]))

    return tuple_list

def df_to_tuplelist_pair(df):
    # make into tuples
    cats_data = list(df.itertuples(index=False, name=None))

    # unzip the tuple into 2 lists
    cat_ids, texts = zip(*cats_data)

    cat_ids = (list(cat_ids))
    texts = list(texts)

    for i in range(len(texts)):
        texts[i] = texts[i].replace('\n', ' ')

    return cat_ids, texts

In [3]:
STRAT_OR_LSUO = 'strat'

data_dir = '../data/output'
train_df = pd.read_csv(f'{data_dir}/{STRAT_OR_LSUO}_train.csv', usecols=['catID', 'userID', 'like'])
validation_df = pd.read_csv(f'{data_dir}/{STRAT_OR_LSUO}_validation.csv', usecols=['catID', 'userID', 'like'])
test_df = pd.read_csv(f'{data_dir}/{STRAT_OR_LSUO}_test.csv', usecols=['catID', 'userID', 'like'])

like_data_train = df_to_tuplelist(train_df)
like_data_validation = df_to_tuplelist(validation_df)
like_data_test = df_to_tuplelist(test_df)


DESC_OR_DET = 'details'

aux_dir = '../data/auxiliary'
cats_df = pd.read_csv(f'{aux_dir}/cats.csv', usecols=['id', DESC_OR_DET])


cat_ids, texts = df_to_tuplelist_pair(cats_df)

In [4]:
#junyi's modifications
import numpy as np


# train test val filenames
trainfile = 'strat_train'
testfile = 'strat_test'
valfile = 'strat_validation'

# train test val datasets
train = pd.read_csv(f'{data_dir}/'+trainfile+'.csv')
test = pd.read_csv(f'{data_dir}/'+testfile+'.csv')
val = pd.read_csv(f'{data_dir}/'+valfile+'.csv')

# main data dataset
user = pd.read_csv(f'{aux_dir}/users.csv')
cat = pd.read_csv(f'{aux_dir}/cats.csv')
# interaction = pd.read_csv('model_data/auxiliary/interaction.csv')

# list of datasets
dfs = [train, test, val]

for i in range(len(dfs)):
    
    # for all
    dfs[i]['like'] = dfs[i]['like'].apply(lambda x: 1 if x else 0)
    
    # train
    if i == 0:
        dfs[i]['click'] = dfs[i]['click'].apply(lambda x: 1 if x else 0)
        dfs[i].drop(['Unnamed: 0', 'id','created_at', 'updated_at'], axis=1, inplace=True)
        dfs[i] = dfs[i][['userID', 'catID', 'like', 'dwell_time_ms', 'click']]
    
    elif i == 1 or i == 2:
        dfs[i].drop(['Unnamed: 0', 'id', 'dwell_time_ms', 'click', 'created_at', 'updated_at'], axis=1, inplace=True)
        dfs[i] = dfs[i][['userID', 'catID', 'like']]
        
train, test, val = dfs[0], dfs[1], dfs[2]

# checking dwell_time_ms in seconds
train['dwell_time_ms'].apply(lambda x: x/1000).describe()

# applying log transformation on dwell time to reduce impact of outliers
train['dwell_time_ms'].apply(lambda x: np.log(x)).describe()

train['log_dwell_time'] = train['dwell_time_ms'].apply(lambda x: np.log(x))
train.drop(['dwell_time_ms'], axis=1, inplace=True)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

train['norm_log_dwell_time'] = scaler.fit_transform(train[['log_dwell_time']])
train.drop(['log_dwell_time'], axis=1, inplace=True)
train.head()

train_weighted = train.copy()

# weights L, C, D for like, click, dwell_time
L = 0.5
C = 0.25
D = 0.25

train_weighted['rating'] = L*train_weighted['like'] + C*train_weighted['click'] + D* train_weighted['norm_log_dwell_time']

train_weighted.drop(['like','click','norm_log_dwell_time'], axis=1, inplace=True)

# convert to tuple
like_data_train = [tuple(train_weighted.iloc[i]) for i in range(len(train_weighted))]
like_data_test = [tuple(test.iloc[i]) for i in range(len(test))]
like_data_validation = [tuple(val.iloc[i]) for i in range(len(val))]

In [5]:
# Instantiate a TextModality, it makes it convenient to work with text auxiliary information
# For more details, please refer to the tutorial on how to work with auxiliary data
item_text_modality = TextModality(
    corpus=texts,
    ids=cat_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.5,
)

In [6]:
# Define an evaluation method to split feedback into train and test sets
bm = CombinedBaseMethod.from_splits(
train_data=like_data_train,
test_data=like_data_validation,
val_data=like_data_test,
verbose=True,
item_text=item_text_modality)

creating from splits
initialising Combined Base
rating_threshold = 1.0
exclude_unknowns = False
---
Training data:
Number of users = 104
Number of items = 400
Number of ratings = 3874
Max rating = 1.0
Min rating = 0.0
Global mean = 0.4
---
Test data:
Number of users = 96
Number of items = 187
Number of ratings = 479
Number of unknown users = 0
Number of unknown items = 3
---
Validation data:
Number of users = 100
Number of items = 196
Number of ratings = 488
---
Total users = 104
Total items = 403


In [7]:
# Instantiate CTR model
ctr = cornac.models.CTR(name='CTR_desc_strat_false_junyi_harm', k=250, max_iter=200, lambda_v=1)

# Use these for evaluation
eval_metrics = [
    HarmonicMean(
        10,
        Serendipity(),
        cornac.metrics.FMeasure(k=10),
        cornac.metrics.NCRR(),
        cornac.metrics.NDCG()
    ),
    Serendipity(),
    cornac.metrics.FMeasure(k=10),
    cornac.metrics.NCRR(),
    cornac.metrics.NDCG()
]

In [8]:
# Wrap MF model inside RandomSearch along with the searching space, try 30 times
rs_ctr = NewRandomSearch(
    model=ctr,
    space=[
        Discrete("k", [50, 75, 100, 150, 200]),
        Continuous("lambda_u", low=1e-4, high=1e1),
        Continuous("lambda_v", low=1e-4, high=1e1),
        Continuous("a", low=0.9, high=1),
        Continuous("b", low=0.0, high=0.1),
        Continuous("eta", low=0.001, high=0.1),
    ],
    metric=HarmonicMean(
        10,
        Serendipity(),
        cornac.metrics.FMeasure(k=10),
        cornac.metrics.NCRR(),
        cornac.metrics.NDCG()
    ),
    eval_method=bm,
    n_trails=30,
)

In [9]:
# Put everything together into an experiment and run it
cornac.Experiment(eval_method=bm, models=[rs_ctr], metrics=eval_metrics, user_based=False).run()


[RandomSearch_CTR_desc_strat_false_junyi_harm] Training started!
Evaluating: {'a': 0.904267111376383, 'b': 0.017835842630054898, 'eta': 0.09644248035592416, 'k': 75, 'lambda_u': 2.7644190625800724, 'lambda_v': 4.401998659388394}


  0%|          | 0/200 [00:00<?, ?it/s]

Learning completed!
Evaluating: {'a': 0.9072238975088809, 'b': 0.005138303213076967, 'eta': 0.019056349984648762, 'k': 200, 'lambda_u': 1.97920711074904, 'lambda_v': 6.828814638427304}


  0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 