# LightFM

* Hybrid model (content based and collaborative filtering)

## Installation
`conda install -c conda-forge lightfm`

In [34]:
from lightfm import LightFM
import numpy as np
import pandas as pd
from lightfm.datasets import fetch_movielens
import math

print("Libraries imported")

Libraries imported


In [3]:
# Fetch LightFM dataset movielens
data = fetch_movielens(min_rating=4.0)

for i in data:
    print(i)

train
test
item_features
item_feature_labels
item_labels


In [4]:
# Example of item_feature_labels
count = 0;
for i in data['item_feature_labels']:
    if(count <5):
        print(f'{count+1}: {i}')
        count+=1

1: Toy Story (1995)
2: GoldenEye (1995)
3: Four Rooms (1995)
4: Get Shorty (1995)
5: Copycat (1995)


In [5]:
print('TRAIN: ', repr(data['train']), '\n\nTEST: ', repr(data['test']),
     '\n\nTYPE: ', type(data['train']))

TRAIN:  <943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 49906 stored elements in COOrdinate format> 

TEST:  <943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 5469 stored elements in COOrdinate format> 

TYPE:  <class 'scipy.sparse.coo.coo_matrix'>


In [6]:
train = data['train']
test = data['test']

## Traditional collaborative filtering model 

In [7]:
NUM_THREADS = 2
EPOCHS = 50

# Create model
model = LightFM(loss='warp')

# Start training
%time model = model.fit(train, epochs=EPOCHS, num_threads=NUM_THREADS)

Wall time: 5.04 s


In [19]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score, train set
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(train_auc*100)}%')

# Compute and print the AUC score, test set
test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(test_auc*100)}%')

AUC: 96%
AUC: 93%


## Hybrid model

In [9]:
# Item features
item_features = data['item_features']

# Define model 
hybrid_model = LightFM(loss='warp')

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
hybrid_model = hybrid_model.fit(train,
                item_features=item_features,
                epochs=EPOCHS,
                num_threads=NUM_THREADS)

In [17]:
train_auc = auc_score(hybrid_model,
                      train,
                      item_features=item_features,
                      num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(train_auc*100)}%')

AUC: 96%


In [20]:
test_auc = auc_score(hybrid_model,
                    test,
                    train_interactions=train,
                    item_features=item_features,
                    num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(test_auc*100)}%')

AUC: 93%


## User recommendation

In [22]:
# Users, items
n_users, n_items = data['train'].shape

In [23]:
# User 351
liked_movies = data['item_labels'][data['train'].tocsr()[200].indices]

In [24]:
# User 351
scores = hybrid_model.predict(200, np.arange(n_items))
scores

array([-5.76857471, -7.0861845 , -6.53366566, ..., -8.78007603,
       -8.84602356, -8.85424137])

In [32]:
top_items = data['item_labels'][np.argsort(-scores)]

print('PREVIOUS LIKED MOVIES:') 
for i in liked_movies[:5]:
    print('*',i) 

print('\n') 

print('RECOMMENDED MOVIES:')
for x in top_items[:5]:
    print('*',x)

PREVIOUS LIKED MOVIES:
* Get Shorty (1995)
* Seven (Se7en) (1995)
* Usual Suspects, The (1995)
* Taxi Driver (1976)
* Brothers McMullen, The (1995)


RECOMMENDED MOVIES:
* Pulp Fiction (1994)
* Usual Suspects, The (1995)
* Taxi Driver (1976)
* GoodFellas (1990)
* Fargo (1996)


## External dataset

Dataset from [this](https://github.com/oschow/take-a-hike) repo.

In [250]:
# Data
data = 'https://raw.githubusercontent.com/oschow/take-a-hike/master/AllTrails/data/all_ratings_matrix.csv'
df = pd.read_csv(data)

# Features
data_features = 'https://raw.githubusercontent.com/oschow/take-a-hike/master/AllTrails/data/all_hikes_with_hike_id.csv'
df_features = pd.read_csv(data_features)

In [251]:
# Merge dataframes
df = pd.merge(df, df_features, on='hike_id')
df.head()

Unnamed: 0,hike_id,variable,value,hike_region,total_distance,elevation_gain,hike_difficulty,stars,loop,out_and_back,point_to_point,dog_friendly,kid_friendly,camping,waterfall,river,lake,wildflowers,wildlife,views
0,hike43,user1,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
1,hike43,user10076,5,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
2,hike43,user10196,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
3,hike43,user10349,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
4,hike43,user10351,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1


In [252]:
# Rename variable to user id
df['user_id'] = df['variable'] 
df = df.drop('variable', 1)

# Rename value to rating
df['rating'] = df['value'] 
df = df.drop('value', 1)

In [253]:
# Remove string
df['hike_id'] = df.hike_id.str.strip('hike')

df['user_id'] = df.hike_id.str.strip('user')

# Convert to int

df['hike_id'] = pd.to_numeric(df.hike_id, errors='coerce')

df['user_id'] = pd.to_numeric(df.user_id, errors='coerce')

In [254]:
print(f'Hike Id Type: {type(df.hike_id[0])}\nUser Id Type: {type(df.user_id[0])}')

Hike Id Type: <class 'numpy.int64'>
User Id Type: <class 'numpy.int64'>


In [255]:
# Change order of columns
df = df[['user_id', 'hike_id', 'rating', 'hike_region','total_distance','elevation_gain','hike_difficulty','stars','loop', 'out_and_back', 'point_to_point','dog_friendly', 'kid_friendly', 'camping', 'waterfall', 'river','lake', 'wildflowers', 'wildlife','views']]
df.head()

Unnamed: 0,user_id,hike_id,rating,hike_region,total_distance,elevation_gain,hike_difficulty,stars,loop,out_and_back,point_to_point,dog_friendly,kid_friendly,camping,waterfall,river,lake,wildflowers,wildlife,views
0,43,43,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
1,43,43,5,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
2,43,43,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
3,43,43,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1
4,43,43,4,Pike National Forest,5.5,732,2,4.3,1,0,0,1,1,0,0,0,0,1,1,1


In [288]:
def get_ratings(df):
    ratings = []
    df = df[['user_id', 'hike_id', 'rating']]
    
    for x in df.values:
        ratings.append(x)
    
    return ratings

In [300]:
def get_features(df):
    df = df.drop(['user_id', 'rating'], 1)
 
    features = []
    for x in df.values:
        features.append(x)
    
    return features

In [269]:
#df_pivot = df.pivot_table(index='variable',columns='hike_id',values='value')

In [309]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x[0] for x in get_ratings(df)),
            (x[1] for x in get_ratings(df)))

In [310]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 1487, num_items 1487.


## Supply additional id's with fit_partial()
Ff we don’t have all user and items ids at once, we can repeatedly call `fit_partial` to supply additional ids. In this case, we will use this capability to add some item feature mappings

In [311]:
dataset.fit_partial(items=(x[0] for x in get_features(df)),
                    item_features=(x[1] for x in get_features(df)))

## Building the interactions matrix

In [312]:
(interactions, weights) = dataset.build_interactions(((x[0], x[1])
                                                      for x in get_ratings(df)))

print(repr(interactions))

<1487x1487 sparse matrix of type '<class 'numpy.int32'>'
	with 23842 stored elements in COOrdinate format>


In [308]:
item_features = dataset.build_item_features(((x[0], [x[1]])
                                              for x in get_features(df)))
print(repr(item_features))

<1487x1815 sparse matrix of type '<class 'numpy.float32'>'
	with 2974 stored elements in Compressed Sparse Row format>


## Building a model

In [324]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM(loss='warp')
model.fit(interactions, item_features=item_features, epochs=10)

<lightfm.lightfm.LightFM at 0x1e0901d9848>

In [325]:
# AUC Score
train_auc = auc_score(model,
                      interactions,
                      item_features=item_features
                     ).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 0.95966834


In [335]:
scores = model.predict(df.user_id.values, df.hike_id.values, item_features=item_features)
top_items = df['hike_id'][np.argsort(-scores)]