# LightFM

* Hybrid model (content based and collaborative filtering)

## Installation
`conda install -c conda-forge lightfm`

In [34]:
from lightfm import LightFM
import numpy as np
import pandas as pd
from lightfm.datasets import fetch_movielens
import math

print("Libraries imported")

Libraries imported


In [3]:
# Fetch LightFM dataset movielens
data = fetch_movielens(min_rating=4.0)

for i in data:
    print(i)

train
test
item_features
item_feature_labels
item_labels


In [4]:
# Example of item_feature_labels
count = 0;
for i in data['item_feature_labels']:
    if(count <5):
        print(f'{count+1}: {i}')
        count+=1

1: Toy Story (1995)
2: GoldenEye (1995)
3: Four Rooms (1995)
4: Get Shorty (1995)
5: Copycat (1995)


In [5]:
print('TRAIN: ', repr(data['train']), '\n\nTEST: ', repr(data['test']),
     '\n\nTYPE: ', type(data['train']))

TRAIN:  <943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 49906 stored elements in COOrdinate format> 

TEST:  <943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 5469 stored elements in COOrdinate format> 

TYPE:  <class 'scipy.sparse.coo.coo_matrix'>


In [6]:
train = data['train']
test = data['test']

## Traditional collaborative filtering model 

In [7]:
NUM_THREADS = 2
EPOCHS = 50

# Create model
model = LightFM(loss='warp')

# Start training
%time model = model.fit(train, epochs=EPOCHS, num_threads=NUM_THREADS)

Wall time: 5.04 s


In [19]:
# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score, train set
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(train_auc*100)}%')

# Compute and print the AUC score, test set
test_auc = auc_score(model, test, train_interactions=train, num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(test_auc*100)}%')

AUC: 96%
AUC: 93%


## Hybrid model

In [9]:
# Item features
item_features = data['item_features']

# Define model 
hybrid_model = LightFM(loss='warp')

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
hybrid_model = hybrid_model.fit(train,
                item_features=item_features,
                epochs=EPOCHS,
                num_threads=NUM_THREADS)

In [17]:
train_auc = auc_score(hybrid_model,
                      train,
                      item_features=item_features,
                      num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(train_auc*100)}%')

AUC: 96%


In [20]:
test_auc = auc_score(hybrid_model,
                    test,
                    train_interactions=train,
                    item_features=item_features,
                    num_threads=NUM_THREADS).mean()
print(f'AUC: {math.trunc(test_auc*100)}%')

AUC: 93%


## User recommendation

In [22]:
# Users, items
n_users, n_items = data['train'].shape

In [23]:
# User 351
liked_movies = data['item_labels'][data['train'].tocsr()[200].indices]

In [24]:
# User 351
scores = hybrid_model.predict(200, np.arange(n_items))
scores

array([-5.76857471, -7.0861845 , -6.53366566, ..., -8.78007603,
       -8.84602356, -8.85424137])

In [32]:
top_items = data['item_labels'][np.argsort(-scores)]

print('PREVIOUS LIKED MOVIES:') 
for i in liked_movies[:5]:
    print('*',i) 

print('\n') 

print('RECOMMENDED MOVIES:')
for x in top_items[:5]:
    print('*',x)

PREVIOUS LIKED MOVIES:
* Get Shorty (1995)
* Seven (Se7en) (1995)
* Usual Suspects, The (1995)
* Taxi Driver (1976)
* Brothers McMullen, The (1995)


RECOMMENDED MOVIES:
* Pulp Fiction (1994)
* Usual Suspects, The (1995)
* Taxi Driver (1976)
* GoodFellas (1990)
* Fargo (1996)


## External dataset

Dataset from [this](https://github.com/oschow/take-a-hike) repo.

In [556]:
# Data
data = 'https://raw.githubusercontent.com/oschow/take-a-hike/master/AllTrails/data/all_ratings_matrix.csv'
df = pd.read_csv(data)

# Features
#data_features = 'https://raw.githubusercontent.com/oschow/take-a-hike/master/AllTrails/data/all_hikes_with_hike_id.csv'
data_features = 'https://raw.githubusercontent.com/oschow/take-a-hike/master/AllTrails/data/all_hikes_with_hike_name.csv'
df_features = pd.read_csv(data_features)

In [557]:
df_features.tail()

Unnamed: 0,hike_name,hike_region,total_distance,elevation_gain,hike_difficulty,stars,hike_id,loop,out_and_back,point_to_point,dog_friendly,kid_friendly,camping,waterfall,river,lake,wildflowers,wildlife,views
1482,Foothills Nature Trail to CCC Shelter,Roosevelt National Forest,2.0,498,1,3.0,hike1482,0,1,0,1,0,0,0,1,0,1,1,1
1483,Sinton Trail,"Roswell, Colorado",5.6,374,1,4.0,hike1483,0,1,0,1,1,0,0,0,0,1,1,1
1484,Uncompahgre River Walk,"Montrose, Colorado",14.4,411,1,3.0,hike1484,0,1,0,1,1,0,0,1,0,0,1,1
1485,Mayhoffer Trail Loop,"Superior, Colorado",10.8,872,1,2.0,hike1485,1,0,0,0,1,0,0,0,0,0,0,0
1486,River Ponds Trail Loop trail,"Fort Collins, Colorado",2.0,14,1,4.0,hike1486,1,0,0,1,0,0,0,0,1,1,1,1


In [558]:
df.head()

Unnamed: 0,hike_id,variable,value
0,hike43,user1,4
1,hike6,user1,4
2,hike137,user10,5
3,hike7,user100,4
4,hike33,user1000,4


In [559]:
# Merge dataframes
df = pd.merge(df, df_features, on='hike_id')
df.columns

Index(['hike_id', 'variable', 'value', 'hike_name', 'hike_region',
       'total_distance', 'elevation_gain', 'hike_difficulty', 'stars', 'loop',
       'out_and_back', 'point_to_point', 'dog_friendly', 'kid_friendly',
       'camping', 'waterfall', 'river', 'lake', 'wildflowers', 'wildlife',
       'views'],
      dtype='object')

In [560]:
df.head()

Unnamed: 0,hike_id,variable,value,hike_name,hike_region,total_distance,elevation_gain,hike_difficulty,stars,loop,...,point_to_point,dog_friendly,kid_friendly,camping,waterfall,river,lake,wildflowers,wildlife,views
0,hike43,user1,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
1,hike43,user10076,5,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
2,hike43,user10196,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
3,hike43,user10349,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
4,hike43,user10351,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1


In [561]:
# Rename variable to user id
df['user_id'] = df['variable'] 
df = df.drop('variable', 1)

# Rename value to rating
df['rating'] = df['value'] 
df = df.drop('value', 1)

In [562]:
# Remove string
df['hike_id'] = df.hike_id.str.strip('hike')

df['user_id'] = df.user_id.str.strip('user')

# Convert to int

df['hike_id'] = pd.to_numeric(df.hike_id, errors='coerce')

df['user_id'] = pd.to_numeric(df.user_id, errors='coerce')

In [563]:
print(f'Hike Id Type: {type(df.hike_id[0])}\nUser Id Type: {type(df.user_id[0])}')

Hike Id Type: <class 'numpy.int64'>
User Id Type: <class 'numpy.int64'>


In [564]:
df.head()

Unnamed: 0,hike_id,hike_name,hike_region,total_distance,elevation_gain,hike_difficulty,stars,loop,out_and_back,point_to_point,...,kid_friendly,camping,waterfall,river,lake,wildflowers,wildlife,views,user_id,rating
0,43,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,0,0,...,1,0,0,0,0,1,1,1,1,4
1,43,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,0,0,...,1,0,0,0,0,1,1,1,10076,5
2,43,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,0,0,...,1,0,0,0,0,1,1,1,10196,4
3,43,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,0,0,...,1,0,0,0,0,1,1,1,10349,4
4,43,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,0,0,...,1,0,0,0,0,1,1,1,10351,4


In [565]:
# Change order of columns
df = df[['user_id', 'hike_id', 'rating', 'hike_name', 'hike_region','total_distance','elevation_gain','hike_difficulty','stars','loop', 'out_and_back', 'point_to_point','dog_friendly', 'kid_friendly', 'camping', 'waterfall', 'river','lake', 'wildflowers', 'wildlife','views']]
df.head()

Unnamed: 0,user_id,hike_id,rating,hike_name,hike_region,total_distance,elevation_gain,hike_difficulty,stars,loop,...,point_to_point,dog_friendly,kid_friendly,camping,waterfall,river,lake,wildflowers,wildlife,views
0,1,43,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
1,10076,43,5,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
2,10196,43,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
3,10349,43,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1
4,10351,43,4,Spruce Mountain Trail,Pike National Forest,5.5,732,2,4.3,1,...,0,1,1,0,0,0,0,1,1,1


In [569]:
# Check for duplicate 
duplicate = df[df.duplicated()] 

print(duplicate)

# Drop duplicates
#df.drop_duplicates(keep=False,inplace=True)

Empty DataFrame
Columns: [user_id, hike_id, rating, hike_name, hike_region, total_distance, elevation_gain, hike_difficulty, stars, loop, out_and_back, point_to_point, dog_friendly, kid_friendly, camping, waterfall, river, lake, wildflowers, wildlife, views]
Index: []

[0 rows x 21 columns]


In [570]:
print(f'New dataset shape: {df.shape}')

New dataset shape: (23842, 21)


In [571]:
def get_ratings(df):
    ratings = []
    df = df[['user_id', 'hike_id', 'rating']]
    
    for x in df.values:
        ratings.append(x)
    
    return ratings

In [572]:
def get_features(df):
    df = df.drop(['user_id', 'rating'], 1)
 
    features = []
    for x in df.values:
        features.append(x)
    
    return features

In [573]:
#df_pivot = df.pivot_table(index='variable',columns='hike_id',values='value')

In [574]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x[0] for x in get_ratings(df)),
            (x[1] for x in get_ratings(df)))

In [575]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 11891, num_items 1487.


## Supply additional id's with fit_partial()
Ff we don’t have all user and items ids at once, we can repeatedly call `fit_partial` to supply additional ids. In this case, we will use this capability to add some item feature mappings

In [576]:
dataset.fit_partial(items=(x[0] for x in get_features(df)),
                    item_features=(x[1] for x in get_features(df)))

## Building the interactions matrix

In [577]:
(interactions, weights) = dataset.build_interactions(((x[0], x[1])
                                                      for x in get_ratings(df)))

print(repr(interactions))

<11891x1487 sparse matrix of type '<class 'numpy.int32'>'
	with 23842 stored elements in COOrdinate format>


In [578]:
item_features = dataset.build_item_features(((x[0], [x[1]])
                                              for x in get_features(df)))
print(repr(item_features))

<1487x2950 sparse matrix of type '<class 'numpy.float32'>'
	with 2974 stored elements in Compressed Sparse Row format>


## Building a model

In [579]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM(loss='warp')
model.fit(interactions, item_features=item_features, epochs=10)

<lightfm.lightfm.LightFM at 0x1e0a865bec8>

In [580]:
# AUC Score
train_auc = auc_score(model,
                      interactions,
                      item_features=item_features
                     ).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 0.9527566


## Train/Test

In [581]:
# Dataframe with 80% of values
df_train = df.sample(frac = 0.8) 
  
# Dataframe for test with 20% of values
df_test = df.drop(df_train.index) 

In [582]:
print(f'Train: {df_train.shape}\nTest: {df_test.shape}')

Train: (19074, 21)
Test: (4768, 21)


In [602]:
df_train.tail()

Unnamed: 0,user_id,hike_id,rating,hike_name,hike_region,total_distance,elevation_gain,hike_difficulty,stars,loop,...,point_to_point,dog_friendly,kid_friendly,camping,waterfall,river,lake,wildflowers,wildlife,views
11733,11544,927,4,Saxon Mountain,Atapaho National Forest,14.2,3577,3,3.7,0,...,0,1,0,0,0,0,0,1,1,1
20687,7271,248,4,Zapata Falls Trail,Rio Grande National Forest,0.9,233,2,4.4,0,...,0,1,1,0,1,1,0,1,0,1
6167,1706,9,5,Mount Bierstadt Trail,Arapaho National Forest,7.3,2767,3,4.3,0,...,0,1,0,0,0,0,1,1,1,1
4614,11584,28,3,Crater Lakes Trail,Roosevelt National Forest,5.4,1444,3,4.5,0,...,0,1,0,1,0,1,1,1,1,1
4088,2864,3,5,The Incline Trail,Manitou Park Recreation Area,3.6,1976,3,4.7,1,...,0,0,0,0,0,0,0,1,1,1


## Interactions matrix

In [584]:
(train, weights_train) = dataset.build_interactions(((x[0], x[1])
                                                      for x in get_ratings(df_train)))

print(repr(interactions))

<11891x1487 sparse matrix of type '<class 'numpy.float32'>'
	with 23842 stored elements in COOrdinate format>


In [585]:
(test, weights_test) = dataset.build_interactions(((x[0], x[1])
                                                      for x in get_ratings(df_test)))

print(repr(test))

<11891x1487 sparse matrix of type '<class 'numpy.int32'>'
	with 4768 stored elements in COOrdinate format>


In [586]:
item_features_train = dataset.build_item_features(((x[0], [x[1]])
                                              for x in get_features(df_train)))
print(repr(item_features_train))

<1487x2950 sparse matrix of type '<class 'numpy.float32'>'
	with 2917 stored elements in Compressed Sparse Row format>


In [587]:
item_features_test = dataset.build_item_features(((x[0], [x[1]])
                                              for x in get_features(df_test)))
print(repr(item_features_train))

<1487x2950 sparse matrix of type '<class 'numpy.float32'>'
	with 2917 stored elements in Compressed Sparse Row format>


In [596]:
train_auc = auc_score(model, train, item_features=item_features_train, num_threads=2).mean()
print('AUC: %s' % train_auc)

AUC: 0.95252883


In [595]:
test_auc = auc_score(model, test, item_features=item_features_test, num_threads=2).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.9491917


In [607]:
model = LightFM(loss='warp')
%time model.fit(train, epochs=50, num_threads=2)

Wall time: 1.58 s


<lightfm.lightfm.LightFM at 0x1e0aa1de9c8>

In [610]:
def sample_recommendation(model, data, user_ids):

    n_users, n_items = interactions.shape

    for user_id in user_ids:
        known_positives = df['hike_name'][interactions.tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['hike_name'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:1]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:5]:
            print("        %s" % x)

sample_recommendation(model, df, [11544, 7000, 2000])

User 11544
     Known positives:
        Maxwell Falls Lower Trail
     Recommended:
        Mount Falcon Castle Trail
        Mount Falcon Castle Trail
        Mount Falcon Castle Trail
        Spruce Mountain Trail
        Spruce Mountain Trail
User 7000
     Known positives:
        Mount Falcon Castle Trail
     Recommended:
        Mount Falcon Castle Trail
        Mount Falcon Castle Trail
        Spruce Mountain Trail
        Spruce Mountain Trail
        Mount Sanitas Trail
User 2000
     Known positives:
        Spruce Mountain Trail
     Recommended:
        Spruce Mountain Trail
        Spruce Mountain Trail
        Spruce Mountain Trail
        Mount Falcon Castle Trail
        Mount Falcon Castle Trail
