# Github Repository Recommender Systems

## 1. Import Libraries
We first import all the libraries needed for the analysis

In [1]:
#! pip install implicit
#! pip install "scipy>=1.0"
#! pip install scikit-surprise

import pyspark
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark import SparkContext
# Reuse the SparkContext that is already running in this kernel, or start one.
sc = SparkContext.getOrCreate()
# SparkSession is the supported entry point for DataFrame work; constructing
# SQLContext directly is deprecated since Spark 3.0. getOrCreate() attaches to
# the context obtained above, and the session still offers createDataFrame().
spark = pyspark.sql.SparkSession.builder.getOrCreate()

import implicit
from scipy import sparse

import pandas as pd
from datetime import datetime
import matplotlib
%matplotlib inline

## 1. Data Pre-processing
We retrieve the GitHub data on "watchers" (users can watch repositories) from the public dataset "ghtorrent-bq" on Google BigQuery. It is not necessary to use all the data to generate recommendations, and some of the data may not add much information, for example users who rarely watch repositories or users who watch almost "every" repository (i.e. a very large number of them). Therefore, we first filter the data in BigQuery to retrieve the list of the most popular repositories (the top 500 by number of watchers) and the users who watch these repositories (10,000 random users who have watched 20 - 100 of these repositories).

The SQL Query for generating the raw data is shown as follows:

SELECT repo_id, user_id FROM [ghtorrent-bq:ght.watchers] 
WHERE user_id IN 

(SELECT user_id FROM

(SELECT user_id, COUNT(*) AS num_watching_repo

FROM [ghtorrent-bq:ght.watchers] 

GROUP BY user_id

HAVING num_watching_repo > 10 AND num_watching_repo < 100

LIMIT 10000))

AND repo_id IN

(SELECT repo_id FROM

(SELECT repo_id, COUNT(*) AS num_watcher

FROM [ghtorrent-bq:ght.watchers] 

GROUP BY repo_id

ORDER BY num_watcher DESC

LIMIT 1000))

In [2]:
# Load the exported (user, repo) watch pairs into a dataframe.
raw_data = pd.read_csv("data/top500repo.csv")
# Every row is an observed "watch", so each one gets an implicit rating of 1.
ratings = [1 for _ in range(len(raw_data))]
raw_data = raw_data.assign(rating=ratings)
# Keep exactly these three columns, in (user, item, rating) order.
columns_order = ['user_id', 'repo_id', 'rating']
raw_data = raw_data.reindex(columns=columns_order)
raw_data.head()

Unnamed: 0,user_id,repo_id,rating
0,634204,20078281,1
1,4605019,20078281,1
2,5107395,20078281,1
3,8748144,20078281,1
4,341184,22042207,1


Since the raw data only contains the repositories that these users are watching (i.e. every rating is 1), we also need to include the repositories that each user is not watching and assign them a rating (we use 0 in this case).

In [3]:
# Build the user x repo matrix: one row per user_id, one column per repo_id.
# Pairs that never occurred come out of the pivot as NaN and are set to 0.
data_matrix = (
    raw_data
    .pivot(index='user_id', columns='repo_id', values='rating')
    .fillna(0)
)
# Unstack the matrix back into long (user_id, repo_id, rating) rows so that
# every user/repo combination is present, watched or not.
use_data = data_matrix.stack().rename("rating").reset_index()
use_data.shape

(4354000, 3)

## 2.1 SVD in scikit-surprise package 
The first method we use to generate recommendations is the SVD method available in scikit-surprise package.

In [4]:
#import the libraries needed for SVD
from surprise import Reader
from surprise import SVDpp
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
# Ratings here are binary (0 = not watching, 1 = watching), so state the scale
# explicitly instead of relying on the Reader default.
rating_bounds = (0.0, 1.0)
reader = Reader(rating_scale=rating_bounds)
# Wrap the long-format dataframe (user_id, repo_id, rating) as a Surprise dataset.
data = Dataset.load_from_df(use_data, reader)

In [5]:
#split data into training (85%) and testing (15%) sets;
#random_state pins the shuffle so the same split is produced on every run
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

### K-fold cross validation

In [6]:
# define a cross-validation iterator; the seed makes the fold assignment repeatable
kf = KFold(n_splits=3, random_state=42)

# seed the SVD initialisation too, so the RMSE values printed below are reproducible
algo = SVD(random_state=42)

# Fold-specific loop names keep the hold-out `trainset` / `testset` split from
# the previous cell intact instead of silently replacing it with the last fold.
for fold_trainset, fold_testset in kf.split(data):

    # train and test algorithm.
    algo.fit(fold_trainset)
    predictions = algo.test(fold_testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.1055
RMSE: 0.1053
RMSE: 0.1052


### Tune algorithm parameters with GridSearchCV

In [None]:
# Candidate hyper-parameters; the search tries every combination (2 x 2 x 2 = 8).
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
# 3-fold cross-validated grid search over SVD, scored on both RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Print the best RMSE score first, then the parameter combination that produced it.
for report in (gs.best_score, gs.best_params):
    print(report['rmse'])

### Use SVD with best parameters

In [None]:
# Pick the estimator that the grid search ranked best on RMSE ...
algo = gs.best_estimator['rmse']
# ... and train it on the complete dataset (no rows held out).
full_trainset = data.build_full_trainset()
algo.fit(full_trainset)

## 2.2 ALS in Spark
The second method we use is the ALS algorithm available in Spark

In [7]:
#import libraries needed
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [8]:
#Convert the pandas dataframe "use_data" into spark dataframe
ratings = spark.createDataFrame(use_data)
#Split data into training (80%) and testing (20%) sets;
#a fixed seed keeps the split, and therefore the reported RMSE, reproducible
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [9]:
# Build the recommendation model using ALS on the training data.
# implicitPrefs=True because the 0/1 "watching" flags are implicit feedback
# rather than explicit scores.
# coldStartStrategy="drop" removes rows whose prediction would be NaN (a user
# or repo that never appeared in the training split), so the evaluation metric
# cannot turn into NaN.
als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
          userCol="user_id", itemCol="repo_id", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [10]:
# Score the held-out rows with the fitted model; transform() adds the
# "prediction" column that the evaluator reads.
predictions = model.transform(test)
# Compare the predictions against the 0/1 "rating" labels using RMSE.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.10617955065788996


## 2.3 ALS in implicit package
The third method we use is the ALS algorithm available in the implicit package. The implicit package does not include any evaluation method for the model; however, it can generate recommendations for specific users and return related repositories.

In [11]:
# data_matrix has one row per user and one column per repo. Convert it to a
# sparse matrix, then flip it so that rows are repos and columns are users.
user_repo_csr = sparse.csr_matrix(data_matrix)
csr_repo_user = user_repo_csr.transpose().tocsr()

In [12]:
# Train the implicit-feedback ALS model on the repo x user matrix,
# using 50 latent factors.
als_factors = 50
implicit_model = implicit.als.AlternatingLeastSquares(factors=als_factors)
implicit_model.fit(csr_repo_user)

100%|██████████| 15.0/15 [00:40<00:00,  2.31s/it]


In [13]:
# Flip the repo x user matrix back to user x repo before asking for recommendations.
user_repos = csr_repo_user.T.tocsr()
# NOTE: 227 looks like a row position in user_repos rather than a GitHub
# user_id; the ids in the result are likewise small integers, presumably
# column positions rather than repo_ids. Confirm before mapping to real ids.
target_user = 227
recommendations = implicit_model.recommend(target_user, user_repos)
recommendations

[(18, 0.42213416),
 (23, 0.038269203),
 (17, 0.033236597),
 (77, 0.031356655),
 (211, 0.027273616),
 (221, 0.025331859),
 (8, 0.025092404),
 (161, 0.021297604),
 (159, 0.021291038),
 (344, 0.02116368)]

In [14]:
# Ask the model which repos it considers most similar to the repo at index 15.
# The first entry in the result is the query repo itself.
seed_repo = 15
related = implicit_model.similar_items(seed_repo)
related

[(15, 0.6067302),
 (254, 0.28099835),
 (174, 0.2669173),
 (347, 0.25447154),
 (223, 0.25257245),
 (156, 0.2522952),
 (430, 0.21773125),
 (198, 0.21363513),
 (333, 0.20966932),
 (25, 0.19306782)]