In [1]:
# importing needed packages
import logging
import os
import time

import pandas as pd

from graph_recommender_system import GraphRecommendationSystem, load_data

In [2]:
# Logging settings: INFO-level (and above) records are written to info.log
logging.basicConfig(filename='info.log', level=logging.INFO)
# Initialize and connect to database.
# The connection settings can be overridden through environment variables so
# that credentials never have to be written into the notebook; the fallbacks
# are the original local-development values.
db = GraphRecommendationSystem(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", ""),
)
# NOTE: the "~8 hours" warning refers to the one-off db.import_data(...) step
# in the next cell, not to the statements in this cell.

In [3]:
# Reference timestamp for the (optional) import step at the bottom of this cell
total_time = time.time()
# Ask the database wrapper for the train/test file paths of the "active1000" data set
# (0.7 is presumably the train fraction -- confirm in get_file_paths)
train, test = db.get_file_paths("active1000", 0.7)
# One-off load of the train files into the database; uncomment both lines to run it.
# db.import_data("active1000", train)  # NB this will run for ~8 hours
# logging.info(f"Import data took: {((time.time() - total_time)/60.0)} minutes")

In [4]:
# Build the dataframes used when evaluating the recommendations:
# the test files are loaded first, then the train files.
df_test, df_train = (
    load_data("active1000", test),
    load_data("active1000", train),
)

In [5]:
# Fetching users to be used in recommendations:
# drop the rows whose url is exactly "http://adressa.no", then list every
# remaining userId once (user_df also carries the per-user row count).
df_test = df_test.loc[df_test["url"].ne("http://adressa.no")]
user_df = (
    df_test
    .groupby("userId")
    .size()
    .reset_index(name="counts")
)
users = user_df["userId"].tolist()
len(users)

988

# Evaluation basics  
[Source](https://bond-kirill-alexandrovich.medium.com/precision-and-recall-in-recommender-systems-and-some-metrics-stuff-ca2ad385c5f8)

TP = True positives  
TN = True negatives  
FP = False positives  
FN = False negatives  

Accuracy = (TP + TN) / (TP+TN+FP+FN)  
Precision = TP / (TP + FP)  
Recall = TP / (TP+FN)  

In [6]:
def print_precision(df_predictions, df_test_data):
    """Print the overall precision of ``df_predictions`` against ``df_test_data``.

    Args:
        df_predictions: DataFrame with at least the columns "userId" and "url",
            one row per recommended article.
        df_test_data: DataFrame with at least the columns "userId" and "url",
            one row per article actually read.

    The printed ratio is (matching test rows) / (number of predictions). If
    there are no predictions at all the ratio is reported as ``nan`` instead
    of raising ZeroDivisionError.
    """
    test_data = df_test_data[["userId", "url"]]
    # Only consider test rows of users that received at least one prediction.
    test_data_users_in_predictions = test_data[test_data["userId"].isin(df_predictions["userId"])]
    # Use the function argument here, not a notebook-level `predictions`
    # variable, so the result does not depend on hidden kernel state.
    # NOTE(review): the match is on url only, so a url predicted for one user
    # also counts as a hit for any other user who read it -- confirm intended.
    true_positives = test_data_users_in_predictions[
        test_data_users_in_predictions["url"].isin(df_predictions["url"])
    ]
    n_hits = len(true_positives)
    n_predictions = len(df_predictions)
    ratio = n_hits / n_predictions if n_predictions else float("nan")
    # The label text is kept unchanged so earlier saved outputs stay comparable;
    # the denominator is the total number of predictions.
    print(f"Evaluation - Precision: \n"\
        "Prediction matched training data (TP) / all predictions (TP+TN): \n"\
            f"{n_hits}/{n_predictions}="\
                        f"{ratio}")



In [7]:
def precision(test_data_user, predictions):
    """Log (INFO level) the precision of one user's predictions.

    Args:
        test_data_user: DataFrame with the "userId" and "url" columns holding
            the test rows of a single user; the id is taken from the first row.
        predictions: DataFrame with a "url" column holding that user's
            predictions.

    Nothing is logged when ``predictions`` is empty. Always returns None.
    """
    n_predictions = len(predictions)
    if not n_predictions:
        # No predictions for this user: nothing to evaluate.
        return
    user_id = test_data_user["userId"].iloc[0]
    # Test rows whose url also appears among the predicted urls.
    hit_mask = test_data_user["url"].isin(predictions["url"])
    n_hits = len(test_data_user.loc[hit_mask])
    message = (
        f"Evaluation - Precision: {user_id}, len: {len(test_data_user)}: "
        "Prediction matched training data (TP) / all predictions (TP+TN): "
        f"{n_hits}/{n_predictions}="
        f"{n_hits/n_predictions}"
    )
    logging.info(message)

def print_precision_per_user(df_predictions, df_test_data):
    """Log the precision separately for every user present in the test data.

    Args:
        df_predictions: DataFrame with at least the columns "userId" and "url".
        df_test_data: DataFrame with at least the columns "userId" and "url".

    For each distinct "userId" in ``df_test_data`` the user's test rows and the
    predictions made for that same user are handed to ``precision``.
    Returns None.
    """
    test_data = df_test_data[["userId", "url"]]
    # Iterate over the groups explicitly instead of using groupby(...).apply:
    # apply() no longer hands the grouping column ("userId") to the callback in
    # recent pandas versions, which the previous lambda relied on.
    for user_id, user_rows in test_data.groupby("userId"):
        user_predictions = df_predictions[df_predictions["userId"] == user_id]
        precision(user_rows, user_predictions)

In [8]:
def print_recall(df_predictions, df_test_data):
    """Print the overall recall of ``df_predictions`` against ``df_test_data``.

    Args:
        df_predictions: DataFrame with at least the columns "userId" and "url",
            one row per recommended article.
        df_test_data: DataFrame with at least the columns "userId" and "url",
            one row per article actually read.

    The printed ratio is (matching test rows) / (all test rows); the
    denominator includes rows of users that received no predictions. If the
    test data is empty the ratio is reported as ``nan`` instead of raising
    ZeroDivisionError.
    """
    test_data = df_test_data[["userId", "url"]]
    # Only consider test rows of users that received at least one prediction.
    test_data_users_in_predictions = test_data[test_data["userId"].isin(df_predictions["userId"])]
    # Use the function argument here, not a notebook-level `predictions`
    # variable, so the result does not depend on hidden kernel state.
    # NOTE(review): the match is on url only, not on the (userId, url) pair.
    true_positives = test_data_users_in_predictions[
        test_data_users_in_predictions["url"].isin(df_predictions["url"])
    ]
    n_hits = len(true_positives)
    n_test_rows = len(test_data)
    ratio = n_hits / n_test_rows if n_test_rows else float("nan")
    # The label text is kept unchanged so earlier saved outputs stay comparable.
    print(f"Evaluation - Recall: \n"\
        "Prediction matched training data (TP) / training data: \n"\
            f"{n_hits}/{n_test_rows}="\
                        f"{ratio}")

# Prediction: Predict articles by nearest neighbor and popularity

In [9]:
# Predict all users on popularity (uncomment the two lines below to run)
start_time = time.time()
# predictions_raw_df = db.predict_on_popularity(users, None) # NB: runs for ~12 hours
# predictions_raw_df.to_feather("predictions_popularity_all_users_v2.feather")
# While the two lines above stay commented out, the logged duration only
# covers the time between the clock reads in this cell.
elapsed_minutes = (time.time() - start_time) / 60.0
elapsed_hours = (time.time() - start_time) / 3600.0
logging.info(f"Prediction took: {elapsed_minutes} minutes, or hours: {elapsed_hours}")


In [10]:
# Evaluating: 10 predictions per user, for all users, on popularity.
# The stored predictions are read back from disk and passed through each
# report in turn: overall precision, overall recall, per-user precision.
predictions = pd.read_feather("predictions_popularity_all_users.feather")
for report in (print_precision, print_recall, print_precision_per_user):
    report(predictions, df_test)

Evaluation - Precision: 
Prediction matched training data (TP) / all predictions (TP+TN): 
65/9880=0.006578947368421052
Evaluation - Recall: 
Prediction matched training data (TP) / training data: 
65/359377=0.00018086855864454319


# Prediction: Finding similar users and recommending their most recently read articles

In [11]:
# Predict for all users: find similar users and recommend their newest read
# articles (uncomment the two lines below to run)
start_time = time.time()
# predictions = db.predict_on_bestfriends_newest(users)
# predictions.to_feather("predictions_on_best_friend_all_users_v2.feather")
# While the two lines above stay commented out, the logged duration only
# covers the time between the clock reads in this cell.
elapsed_minutes = (time.time() - start_time) / 60.0
elapsed_hours = (time.time() - start_time) / 3600.0
logging.info(f"Prediction on bestfriend took: {elapsed_minutes} minutes, or hours: {elapsed_hours}")
# Show full column contents (no truncation) when dataframes are displayed
pd.options.display.max_colwidth = None


In [12]:
# Evaluating the similar-user recommender (newest read articles of similar users).
# The stored predictions are read back from disk and passed through each
# report in turn: overall precision, overall recall, per-user precision.
predictions = pd.read_feather("predictions_on_best_friend_all_users.feather")
for report in (print_precision, print_recall, print_precision_per_user):
    report(predictions, df_test)

Evaluation - Precision: 
Prediction matched training data (TP) / all predictions (TP+TN): 
3208/9660=0.3320910973084886
Evaluation - Recall: 
Prediction matched training data (TP) / training data: 
3208/359377=0.008926559017410686


In [13]:
# Cold start: popularity recommendation for a user id that is not in the database
user = "newUser"  # This ID does not exist in the database
predictions = db.predict_on_popularity([user])
predictions


Unnamed: 0,userId,url
0,newUser,http://adressa.no/pluss/magasin/2017/01/14/fives-vei-til-banken-14051471.ece
1,newUser,http://adressa.no/pluss/magasin/2017/02/25/det-%c3%a5ret-vi-sluttet-%c3%a5-se-i-bakken-14308932.ece
2,newUser,http://adressa.no/pluss/magasin/2016/02/01/trondheim-f%c3%b8r-og-n%c3%a5-del-2-12100990.ece
3,newUser,http://adressa.no/bolig/boligguiden/trondheim/trondheim-%c3%b8st/bolig1196938.html
4,newUser,http://adressa.no/pluss/magasin/2017/01/14/fives-vei-til-banken-14051471.ece
5,newUser,http://adressa.no/nyheter/utenriks/2017/02/12/trump-skryter-av-aksjon-mot-ulovlige-innvandrere-14211432.ece
6,newUser,http://adressa.no/100sport/sprek/her-ligger-deltagerne-strodd-etter-treningsokten-studie-hevder-tre-minutter-i-uken-kan-vare-nok-228600b.html
7,newUser,http://adressa.no/pluss/nyheter/2017/02/06/kun-den-som-har-mistet-et-barn-selv-kan-vite-hvor-hjerteskj%c3%a6rende-det-er-14175719.ece
8,newUser,http://adressa.no/pluss/okonomi/2017/02/11/hun-lokker-nordmenn-til-%c3%a5-satse-store-penger-p%c3%a5-omstridt-nettvaluta-14205262.ece
9,newUser,http://adressa.no/pluss/magasin/2017/01/16/hvor-i-all-verden...-14066667.ece


In [14]:
# Cold start: popularity recommendation restricted to preferred categories
user = "newUser"  # This ID does not exist in the database
categories = ["sport", "okonomi", "nyheter"]  # Simulates the categories a new user would pick
predictions = db.predict_on_popularity([user], categories)
predictions

Unnamed: 0,userId,url
0,newUser,http://adressa.no/nyheter/utenriks/2017/02/12/trump-skryter-av-aksjon-mot-ulovlige-innvandrere-14211432.ece
1,newUser,http://adressa.no/pluss/okonomi/2017/02/11/hun-lokker-nordmenn-til-%c3%a5-satse-store-penger-p%c3%a5-omstridt-nettvaluta-14205262.ece
2,newUser,http://adressa.no/pluss/nyheter/2017/02/06/kun-den-som-har-mistet-et-barn-selv-kan-vite-hvor-hjerteskj%c3%a6rende-det-er-14175719.ece
3,newUser,http://adressa.no/pluss/nyheter/2017/02/06/kun-den-som-har-mistet-et-barn-selv-kan-vite-hvor-hjerteskj%c3%a6rende-det-er-14175719.ece
4,newUser,http://adressa.no/nyheter/trondheim/2017/01/04/vurderer-%c3%a5-flytte-fra-midtbyen-14005344.ece
5,newUser,http://adressa.no/pluss/nyheter/2017/02/06/kun-den-som-har-mistet-et-barn-selv-kan-vite-hvor-hjerteskj%c3%a6rende-det-er-14175719.ece
6,newUser,http://adressa.no/nyheter/nordtrondelag/2017/01/29/n%c3%a5r-det-ufattelige-skjer-deg-14132649.ece
7,newUser,http://adressa.no/pluss/nyheter/2017/02/06/kun-den-som-har-mistet-et-barn-selv-kan-vite-hvor-hjerteskj%c3%a6rende-det-er-14175719.ece
8,newUser,http://adressa.no/nyheter/okonomi/2017/02/12/%c3%98lsalget-til-rema-stuper-med-ny-markedsstrategi-14210907.ece
9,newUser,http://adressa.no/pluss/nyheter/2017/02/27/de-unge-blir-fattigere-og-eldre-rikere-i-norge-14292336.ece
