In [None]:
pip install lightfm

In [None]:
import itertools
import pandas as pd
import numpy as np

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

import json
import random

print("LightFM version: {}".format(lightfm.__version__))

# Hike data
Dataset with hike trails from the repo [Take A Hike - A Colorado Trail Recommender](https://github.com/oschow/take-a-hike)

In [30]:
# Data
data = 'https://raw.githubusercontent.com/oschow/take-a-hike/master/AllTrails/data/all_ratings_matrix.csv'
df = pd.read_csv(data)
df.columns = ['hike_id', 'user', 'rating']
df = df[['user', 'hike_id', 'rating']]

# Remove string
df['hike_id'] = df.hike_id.str.strip('hike')
df['user'] = df.user.str.strip('user')

# Convert to int
df['hike_id'] = pd.to_numeric(df.hike_id, errors='coerce')
df['user'] = pd.to_numeric(df.user, errors='coerce')
df.set_index('user')

# Convert ratings to 1 and 0
df.rating = df['rating'].apply(lambda x: 1 if x > 3 else 0)
df.head()

Unnamed: 0,user,hike_id,rating
0,1,43,1
1,1,6,1
2,10,137,1
3,100,7,1
4,1000,33,1


# Hike features

In [31]:
# Features
data_features = 'https://raw.githubusercontent.com/oschow/take-a-hike/master/AllTrails/data/all_hikes_with_hike_name.csv'
df_features = pd.read_csv(data_features)
df_features = df_features[['hike_id', 'hike_name', 'hike_region', 'total_distance', 'elevation_gain', 'hike_difficulty']]

# Remove string
df_features['hike_id'] = df_features.hike_id.str.strip('hike')
# Convert to int
df_features['hike_id'] = pd.to_numeric(df_features.hike_id, errors='coerce')

df_features.tail()

Unnamed: 0,hike_id,hike_name,hike_region,total_distance,elevation_gain,hike_difficulty
1482,1482,Foothills Nature Trail to CCC Shelter,Roosevelt National Forest,2.0,498,1
1483,1483,Sinton Trail,"Roswell, Colorado",5.6,374,1
1484,1484,Uncompahgre River Walk,"Montrose, Colorado",14.4,411,1
1485,1485,Mayhoffer Trail Loop,"Superior, Colorado",10.8,872,1
1486,1486,River Ponds Trail Loop trail,"Fort Collins, Colorado",2.0,14,1


In [32]:
df_features.iloc[:, :3]

Unnamed: 0,hike_id,hike_name,hike_region
0,0,Maxwell Falls Lower Trail,Arapaho National Forest
1,1,Royal Arch Trail,Chautauqua Park
2,2,Beaver Brook /Chavez Trail Loop,Genesee Park
3,3,The Incline Trail,Manitou Park Recreation Area
4,4,Emerald Lake Trail,Rocky Mountain National Park
...,...,...,...
1482,1482,Foothills Nature Trail to CCC Shelter,Roosevelt National Forest
1483,1483,Sinton Trail,"Roswell, Colorado"
1484,1484,Uncompahgre River Walk,"Montrose, Colorado"
1485,1485,Mayhoffer Trail Loop,"Superior, Colorado"


In [33]:
features = []
col = ['hike_name']*len(df_features['hike_name'].unique()) + ['hike_region']*len(df_features['hike_region'].unique()) + ['total_distance']*len(df_features['total_distance'].unique()) + ['hike_difficulty']*len(df_features['hike_difficulty'].unique())

unique_f1 = list(df_features['hike_name'].unique()) + list(df_features['hike_region'].unique()) + list(df_features['total_distance'].unique()) + list(df_features['hike_difficulty'].unique())

for x,y in zip(col, unique_f1):
    res = str(x)+ ":" +str(y)
    features.append(res)

## Merge dataframes

In [34]:
# Merge dataframes
df_features = pd.merge(df, df_features, on='hike_id', how='left')
df_features = df_features.drop_duplicates()
df_features.head()


Unnamed: 0,user,hike_id,rating,hike_name,hike_region,total_distance,elevation_gain,hike_difficulty
0,1,43,1,Spruce Mountain Trail,Pike National Forest,5.5,732,2
1,1,6,1,Mount Falcon Castle Trail,Mount Falcon Park,7.9,1836,2
2,10,137,1,Rainbow Gulch,Pike National Forest,2.6,267,1
3,100,7,1,Mount Sanitas Trail,Boulder Mountain Park,3.3,1354,2
4,1000,33,1,Barr Trail to Pikes Peak,Pike National Forest,23.0,7508,3


In [35]:
df_features = df_features.drop(['user', 'rating', 'elevation_gain'], axis=1)
df_features = df_features.sort_values(by='hike_id')
df_features = df_features.reset_index(drop=True)
df_features = df_features.drop_duplicates()
df_features = df_features.reset_index(drop=True)
df_features = df_features.sort_values(by='hike_id')
df_features.head()

Unnamed: 0,hike_id,hike_name,hike_region,total_distance,hike_difficulty
0,0,Maxwell Falls Lower Trail,Arapaho National Forest,4.2,2
1,1,Royal Arch Trail,Chautauqua Park,3.4,3
2,2,Beaver Brook /Chavez Trail Loop,Genesee Park,3.9,2
3,3,The Incline Trail,Manitou Park Recreation Area,3.6,3
4,4,Emerald Lake Trail,Rocky Mountain National Park,3.3,1


# Add external dataset

In [36]:
from lightfm.data import Dataset
dataset1 = Dataset()

# we call fit to supply userid, item id and user/item features
dataset1.fit(
        df['user'].unique(), 
        df['hike_id'].unique(),
        item_features = features )

num_users, num_items = dataset1.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 11891, num_items 1487.


# Interactions matrix
- **Interactions matrix** tell us if at all the user interacted with an item
- **Weights matrix** quantifies that particular interaction

In [37]:
# Plugging in the interactions and their weights
(interactions, weights) = dataset1.build_interactions([(x[0], x[1], x[2]) for x in df.values ])

- Users = Rows
- Items = Columns

In [38]:
interactions.todense()

matrix([[1, 1, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [39]:
weights.todense()

matrix([[1., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Build item features

In [40]:
# Helper function that takes the item features and converts them into the proper "feature:value" format
def feature_colon_value(my_list):
    result = []
    ll = ['hike_name:','hike_region:', 'total_distance:', 'hike_difficulty:']
    aa = my_list
    for x,y in zip(ll,aa):
        res = str(x) +""+ str(y)
        result.append(res)
    return result

# Using the helper function to generate user features in proper format for ALL users

ad_subset = df_features[['hike_name','hike_region', 'total_distance', 'hike_difficulty']] 
ad_list = [list(x) for x in ad_subset.values]
feature_list = []
for item in ad_list:
    feature_list.append(feature_colon_value(item))
#print(f'Final output: {feature_list}')

In [41]:
item_tuple = list(zip(df_features['hike_id'], feature_list))

item_tuple[:3]

[(0,
  ['hike_name:Maxwell Falls Lower Trail',
   'hike_region:Arapaho National Forest',
   'total_distance:4.2',
   'hike_difficulty:2']),
 (1,
  ['hike_name:Royal Arch Trail',
   'hike_region:Chautauqua Park',
   'total_distance:3.4',
   'hike_difficulty:3']),
 (2,
  ['hike_name:Beaver Brook /Chavez Trail Loop',
   'hike_region:Genesee Park',
   'total_distance:3.9',
   'hike_difficulty:2'])]

In [42]:
item_features = dataset1.build_item_features(item_tuple)

print(repr(item_features))

<1487x3530 sparse matrix of type '<class 'numpy.float32'>'
	with 7435 stored elements in Compressed Sparse Row format>


# Cross validation

In [43]:
from lightfm.cross_validation import random_train_test_split
(train, test) = random_train_test_split(interactions=interactions, test_percentage=0.2)

# Build model

In [44]:
model = LightFM(loss='warp')
model.fit(train,
      item_features= item_features,
      epochs=10)

<lightfm.lightfm.LightFM at 0x1f629d3e370>

# Loss functions
A loss function is a measure of how good a prediction model does in terms of being able to predict the expected outcome.

### WARP  (Weighted Approximate-Rank Pairwise)
Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found

### BPR (Bayesian Personalised Ranking)
BPR: Bayesian Personalised Ranking pairwise loss: It maximizes the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present.

# Model evaluation
Documentation [here](https://making.lyst.com/lightfm/docs/lightfm.evaluation.html)

### Precision at k metric
Measure the precision at k metric for a model: the fraction of known positives in the first k positions of the ranked list of results. 

In [45]:
from lightfm.evaluation import precision_at_k
train_precision = precision_at_k(model, train,item_features=item_features, k=10).mean()
test_precision = precision_at_k(model, test, item_features=item_features,k=10).mean()
print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

Precision: train 0.07, test 0.02.


### AUC
Measure the ROC AUC metric for a model: the probability that a randomly chosen positive example has a higher score than a randomly chosen negative example. A perfect score is 1.0.

In [46]:
from lightfm.evaluation import auc_score

# Train
train_auc = auc_score(model,
                      train,
                      item_features= item_features
                     ).mean()
print('Hybrid training set AUC: %s' % train_auc)

# Test
test_auc = auc_score(model,
                      test,
                      item_features= item_features
                     ).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid training set AUC: 0.9583463
Hybrid test set AUC: 0.81658816


# Generate recommendations

In [47]:
def sample_recommendation(model, data, user_ids):

    n_users, n_items = interactions.shape

    for user_id in user_ids:
        known_positives = df['hike_id'][df.user ==user_id]
        list_known = []

        for i in known_positives:
          list_known.append(i)

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['hike_id'][np.argsort(-scores)]

        #print("User %s" % user_id)
        #print("     Known positives:")

        #for x in known_positives[:]:
            #print("        %s" % x)

        recommendations = []
        #print("     Recommended:")
        for x in top_items[:10]:
          if x not in list_known:
            recommendations.append(x)
            #print("        %s" % x)
        return user_id, recommendations
        
        #print('\n')

sample_recommendation(model, df, [10])

(10, [4, 0, 654, 8, 462, 35, 3, 374, 976, 6])

# Match recommendations with hike trail names

In [48]:
users = [1, 5, 12, 5, 66]
dataframe = pd.DataFrame([])
#users = df['user'].values.tolist()
count = 1
for i in users:
  user = sample_recommendation(model, df, [i])
  recommendations = user[1]
  recommendations = recommendations[:3]
  print(f'Recommendations for user {user[0]}:')
  for r in recommendations:
    hike_name = df_features['hike_name'][df_features['hike_id'] == r].values[0]
    dataframe = dataframe.append(pd.DataFrame({'User': user[0], 'Recommended items': hike_name}, index=[i]), ignore_index=True)
    print(count, hike_name)
    count+=1
  count = 1
  print('\n')

Recommendations for user 1:
1 Lake Loop and Nature Trails
2 Well Gulch Natural Trail
3 Mohawk Lakes Trail


Recommendations for user 5:
1 Meadowlark Plymouth Creek Loop Trail
2 Red Hill/Mushroom Rock trail
3 Lost Lake Trail


Recommendations for user 12:
1 Emerald Lake Trail
2 Maxwell Falls Lower Trail
3 Red Hill/Mushroom Rock trail


Recommendations for user 5:
1 Meadowlark Plymouth Creek Loop Trail
2 Red Hill/Mushroom Rock trail
3 Lost Lake Trail


Recommendations for user 66:
1 Maxwell Falls Lower Trail
2 Lake Loop and Nature Trails
3 Pass Creek Trail


