In [1]:
import json
import ast
from pprint import pprint

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, types, functions

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

In [2]:
# !pip install spark-nlp==3.3.0

In [3]:
%matplotlib inline

sns.set(style="whitegrid")
sns.set_color_codes("pastel")
sns.set(font="SimSun")

In [4]:
# Connect to the MondoDB database

spark = (
    SparkSession
    .builder 
    .appName("reviews") 
    .master("local[4]")
    .config("spark.driver.memory", "10g")
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/linked_code.repos") 
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/linked_code.users") 
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0,com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.0")
    .getOrCreate()
)

21/10/18 14:50:41 WARN Utils: Your hostname, Kenneths-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.1 instead (on interface en0)
21/10/18 14:50:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/kenneth/Documents/mcomp/CS5344/linked-code/linked_code_venv/lib/python3.7/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/kenneth/.ivy2/cache
The jars for the packages stored in: /Users/kenneth/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bf105ebf-9757-4678-b210-edd2e7abaaef;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.0 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.3.0 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code

In [5]:
# Connect to the repos and user collections
repos = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
# repos = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri","mongodb://localhost:27017/cs5344.repos").load()
users = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri","mongodb://localhost:27017/linked_code.users").load()

21/10/18 14:51:01 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
21/10/18 14:51:07 WARN MongoInferSchema: Array Field 'starred' contains conflicting types converting to StringType


In [6]:
# Count the rows for both table
num_repos = repos.count()
num_user = users.count()

                                                                                

In [7]:
# Keep only a few columns
cols_to_keep = ['id','name', 'description', 'language']
df = repos.select(cols_to_keep)

In [8]:
# Display a few rows to review. Can use .head() also
df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+---+--------------------+--------------------+------------+
| id|                name|         description|    language|
+---+--------------------+--------------------+------------+
|  1|                grit|**Grit is no long...|        Ruby|
| 26|           merb-core|Merb Core: All yo...|        Ruby|
| 27|            rubinius|The Rubinius Lang...|           C|
| 28|                 god|Ruby process monitor|        Ruby|
| 35|    exception_logger|Unmaintained. Sorry.|        Ruby|
| 36|            ambition|include Enumerabl...|        Ruby|
| 42|restful-authentic...|    inactive project|        Ruby|
| 43|       attachment_fu|Treat an ActiveRe...|        Ruby|
| 52|                  s3|psuedo s3 protoco...|  JavaScript|
| 53|               taboo|The solution for ...|  JavaScript|
| 54|            foxtracs|firefox trac inte...|  JavaScript|
| 56|           fotomatic|Flash photo widge...|ActionScript|
| 63|            starling|                null|        Ruby|
| 65|           merb-mor

                                                                                

In [9]:
# check the types for these columns
print(df.dtypes)

[('id', 'int'), ('name', 'string'), ('description', 'string'), ('language', 'string')]


In [10]:
# Keep only these columns for users
df_users = users.select(['id','starred'])
print(df_users.dtypes)                         

[('id', 'int'), ('starred', 'array<string>')]


In [17]:
# Display some values
# Starred seems to be a mix of repo names and id
df_users.show()

+---+--------------------+
| id|             starred|
+---+--------------------+
|  1|[redwoodjs/repeat...|
|  2|                null|
|  3|[80374620, 490832...|
|  4|[135159521, 22664...|
|  5|                null|
|  6|[98691746, 996823...|
|  7|[4542716, 1260477...|
| 17|[352160885, 10972...|
| 18|[51980455, 137828...|
| 19|                null|
| 20|                null|
| 21|[161109499, 37724...|
| 22|[cloudflare/lol-h...|
| 23|[319142957, 90042...|
| 25|[rowanwins/mapbox...|
| 26|                null|
| 27|[292396404, 28333...|
| 29|[activeloopai/Hub...|
| 31|                null|
| 34|                null|
+---+--------------------+
only showing top 20 rows



In [18]:
# pyspark cannot check if whole doc is null, have to test on each column
# can use SQL syntax in the string to make easy to write
where_conditions = ''
for i, col in enumerate(cols_to_keep):
    
    where_conditions += col + ' is NULL'
    if i < len(cols_to_keep)-1:
        where_conditions += ' AND '

print(where_conditions)

id is NULL AND name is NULL AND description is NULL AND language is NULL


In [19]:
# Get the count
df.filter(where_conditions).count()

                                                                                

0

In [20]:
total_repos_count = df.count()

                                                                                

In [21]:
# Description has null values. Count the values
no_desc_count = df.filter("description is NULL OR description == 'nan' OR description == 'NA' OR description == 'null' ").count()
print(f"Num of repos with no description = {no_desc_count} ({100 * no_desc_count/total_repos_count:.2f} %)")



Num of repos with no description = 95338 (9.88 %)


                                                                                

In [22]:
# Description has null values. Count the values
no_name_count = df.filter("name is NULL OR name == 'nan' OR name == 'NA' OR name == 'null' ").count()
print(f"Num of repos with no name = {no_name_count} ({100 * no_name_count/total_repos_count:.2f} %)")

[Stage 14:>                                                         (0 + 1) / 1]

Num of repos with no name = 6 (0.00 %)


                                                                                

In [23]:
# display these repos since the qty is small
df.filter("name is NULL OR name == 'nan' OR name == 'NA' OR name == 'null' ").show()

[Stage 16:>                                                         (0 + 1) / 1]

+---------+----+-------------------------------------+--------+
|       id|name|                          description|language|
+---------+----+-------------------------------------+--------+
| 11545928| nan|                 Native Abstractio...|     C++|
| 23432446|null|                 reasonable handli...|      Go|
|164454053|  NA|不定期推送数值计算的相关算法，矩阵...|     C++|
|109581962| nan|      nan是一款轻量级的分布式服务框架|    Java|
|  6580430|null|                 ❌ WordPress paren...|     PHP|
|261574716| nan|                 Zero allocation N...|      Go|
+---------+----+-------------------------------------+--------+



                                                                                

In [24]:
# filter out repos with no name or description
null_values = ['nan','NA','null']

df_filtered = ( 
    df
    .filter(~df['description'].isNull())
    .filter(~df['description'].isin(null_values))
    .filter(~df['name'].isNull())
    .filter(~df['name'].isin(null_values))
)

In [25]:
df_filtered.show()

+---+--------------------+--------------------+------------+
| id|                name|         description|    language|
+---+--------------------+--------------------+------------+
|  1|                grit|**Grit is no long...|        Ruby|
| 26|           merb-core|Merb Core: All yo...|        Ruby|
| 27|            rubinius|The Rubinius Lang...|           C|
| 28|                 god|Ruby process monitor|        Ruby|
| 35|    exception_logger|Unmaintained. Sorry.|        Ruby|
| 36|            ambition|include Enumerabl...|        Ruby|
| 42|restful-authentic...|    inactive project|        Ruby|
| 43|       attachment_fu|Treat an ActiveRe...|        Ruby|
| 52|                  s3|psuedo s3 protoco...|  JavaScript|
| 53|               taboo|The solution for ...|  JavaScript|
| 54|            foxtracs|firefox trac inte...|  JavaScript|
| 56|           fotomatic|Flash photo widge...|ActionScript|
| 65|           merb-more|Merb More: The Fu...|        null|
| 68|                thi

In [26]:
df_filtered.count()

                                                                                

869693

In [27]:
# There's no repo with both no name and description
# So the final count after the filter should be straight forward
total_repos_count - no_desc_count - no_name_count

869693

## Compute similarity scores for repo descriptions

#### Set up the SparkNLP pipeline

In [11]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

In [13]:
MODEL_NAME = "tfhub_use"

In [14]:
class RepoQueryPipeline():
    def __init__(self, model_name):
        # Transforms the input text into a document usable by the SparkNLP pipeline.
        self.document_assembler = DocumentAssembler()
        self.document_assembler.setInputCol('text')
        self.document_assembler.setOutputCol('document')

        # Separates the text into individual tokens (words and punctuation).
        self.tokenizer = Tokenizer()
        self.tokenizer.setInputCols(['document'])
        self.tokenizer.setOutputCol('token')
        
        # Encodes the text as a single vector representing semantic features.
        self.sentence_encoder = UniversalSentenceEncoder.pretrained(name=model_name)
        self.sentence_encoder.setInputCols(['document', 'token'])
        self.sentence_encoder.setOutputCol('sentence_embeddings')
        
    def init_pipeline(self):
        self.nlp_pipeline = Pipeline(stages=[
            self.document_assembler, 
            self.tokenizer,
            self.sentence_encoder
        ])
        
        # Fit the model to an empty data frame so it can be used on inputs.
        empty_df = spark.createDataFrame([['']]).toDF('text')
        pipeline_model = self.nlp_pipeline.fit(empty_df)
        self.light_pipeline = LightPipeline(pipeline_model)
        
    def get_similarity(self, emb_matrix):
        return np.matmul(emb_matrix, emb_matrix.transpose())
    
    def _encode_df(self, df):
        encoded_df = self.light_pipeline.transform(df)
            
        return encoded_df

    def convert_query_repo_desc_to_df(self, query_repo_desc):
        query_formatted = [(1, query_repo_desc)]

        columns = ["query_num", "text"]
        query_df = spark.createDataFrame(data=query_formatted, schema=columns).select("text")
        
        return query_df
    
    def _extract_emb_matrix(self, encoded_df):
        embs = []
        for r in encoded_df.collect():
            embs.append(r.sentence_embeddings[0].embeddings)
        emb_matrix = np.array(embs)
        
        return emb_matrix
    
    def get_emb_matrix(self, df):
        # Rename the column to match the pipeline model
        df = df.withColumnRenamed(df.columns[0],'text')

        encoded_df = self._encode_df(df)
        emb_matrix = self._extract_emb_matrix(encoded_df)
        return emb_matrix

#### Initialize the SparkNLP pipeline

In [31]:
repo_q_pl = RepoQueryPipeline(MODEL_NAME)
repo_q_pl.init_pipeline()

tfhub_use download started this may take some time.


21/10/18 15:42:01 WARN BasicProfileConfigLoader: Your profile name includes a 'profile ' prefix. This is considered part of the profile name in the Java SDK, so you will need to include this prefix in your profile name when you reference this profile from your Java code.
21/10/18 15:42:01 WARN BasicProfileConfigLoader: Your profile name includes a 'profile ' prefix. This is considered part of the profile name in the Java SDK, so you will need to include this prefix in your profile name when you reference this profile from your Java code.


Approximate size to download 923.7 MB
[ / ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
[ / ]

2021-10-18 15:42:42.774518: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


#### Select the repository size that you want to query against

In [56]:
REPOSITORY_SIZE = 1000

#### Encode the repositories' description text to generate embeddings

In [57]:
df_subset = df_filtered.limit(REPOSITORY_SIZE).select('description')

In [58]:
embs_matrix = repo_q_pl.get_emb_matrix(df_subset)

                                                                                

#### Insert query repo description

In [74]:
repo_q_desc = "Python machine learning"

#### Encode the query repo description

In [75]:
repo_q_desc_df = repo_q_pl.convert_query_repo_desc_to_df(repo_q_desc)
repo_q_desc_emb_matrix = repo_q_pl.get_emb_matrix(repo_q_desc_df)



#### Initialize the Annoy nearest neighbours pipeline

In [77]:
from annoy import AnnoyIndex

class AnnoyIdx():
    def __init__(self, embedding_size, dist_measure):
        self.t = AnnoyIndex(embedding_size, dist_measure)

    def build(self, embs_matrix):
        for i, emb in enumerate(embs_matrix):
            self.t.add_item(i, emb)
        self.t.build(10)

    def query(self, query_embedding, num_nbrs, inc_dist=False):
        return self.t.get_nns_by_vector(query_embedding, num_nbrs, include_distances=inc_dist)

In [78]:
annoy_idx = AnnoyIdx(512, "angular")
annoy_idx.build(embs_matrix)
result = annoy_idx.query(repo_q_desc_emb_matrix[0], 3, inc_dist=True)

#### Retrieve the most relevant repositories given the input query repository description 

In [65]:
descs = [desc[0] for desc in df_subset.collect()]

                                                                                

In [66]:
id_to_desc = {i:desc for i, desc in enumerate(descs)}

In [67]:
id_to_desc

{0: '**Grit is no longer maintained. Check out libgit2/rugged.** Grit gives you object oriented read/write access to Git repositories via Ruby.',
 1: "Merb Core: All you need. None you don't.",
 2: 'The Rubinius Language Platform',
 3: 'Ruby process monitor',
 4: 'Unmaintained. Sorry.',
 5: 'include Enumerable — Unmaintained',
 6: 'inactive project',
 7: 'Treat an ActiveRecord model as a file attachment, storing its patch, size, content type, etc.',
 8: 'psuedo s3 protocol for mozilla browsers',
 9: 'The solution for tabitus of the browser ',
 10: 'firefox trac integration',
 11: 'Flash photo widget prototype - hacked at last SHDH of 2007',
 12: "Merb More: The Full Stack. Take what you need; leave what you don't.",
 13: 'A very fast & simple Ruby web server',
 14: 'Rails RESTful controller abstraction plugin.',
 15: 'Ghost from Christmas past. Unmaintained.',
 16: '(offically at github.com/sinatra/sinatra) Classy web-development dressed in a DSL',
 17: 'Syncs one directory to another 

In [79]:
print("Relevant repos are:")
for i, idx in enumerate(result[0]):
    print(f"{i}: {id_to_desc[idx]}")
    print(f"dist: {result[1][i]}")
    print("\n")

Relevant repos are:
0: Python module that allows one to easily write and run Hadoop programs.
dist: 0.614687979221344


1: Collection of clustering algorithms written in Ocaml
dist: 0.6442485451698303


2: A python implementation of Grapevine
dist: 0.6676327586174011




Using simpleneighbors

In [242]:
from simpleneighbors import SimpleNeighbors

class SimpleNNIndex(SimpleNeighbors):
    def __init__(self, emb_dim_size, metric='angular'):
        super().__init__(emb_dim_size, metric)

    def build(self, sentences, sentence_embeddings):
        sentence_emb_tup = list(zip(sentences, sentence_embeddings))
        super().feed(sentence_emb_tup)
        super().build()
        print('index built')

    def query(self, query_embedding, num_nbrs):
        for r in super().nearest(query_embedding, num_nbrs):
            super().dist(query_embedding, )
        return super().nearest(query_embedding, num_nbrs)

In [162]:
# Subset the dataframe for comparison
df_subset = df_filtered.limit(5).select('description')
descs = [desc[0] for desc in df_subset.collect()]

In [247]:
embs_matrix = repo_q_pl.get_emb_matrix(df_subset)

repo_q_desc = "python"
repo_q_desc_df = repo_q_pl.convert_query_repo_desc_to_df(repo_q_desc)
repo_q_desc_emb_matrix = repo_q_pl.get_emb_matrix(repo_q_desc_df)

simple_nn_index = SimpleNNIndex(512)
simple_nn_index.build(descs, embs_matrix)
relevant_repos = simple_nn_index.query(repo_q_desc_emb_matrix[0], 3)

print("Relevant repos are:")
for i, repo in enumerate(relevant_repos):
    print(f"{i}: {repo}")
    print("\n")

index built
Relevant repos are:
0: Learn how to design large-scale systems. Prep for the system design interview.  Includes Anki flashcards.


1: A curated list of awesome Python frameworks, libraries, software and resources


2: 12306智能刷票，订票




## Finding similar users using implicit recommendation

We assume that if the user has starred the repository, he/she likes the repository.

If the user does not star the repository, we assume that he/she either dislikes it or does not know that about the existence of the repository.

https://databricks.com/session/building-an-implicit-recommendation-engine-with-spark
With reference to the link above, and in the context of a song recommendation system, the video mentioned that the preference of the user is defined as follows:

| Number of times that the user played | Preference |
| ------------------------------------ | ---------- |
| > 0                                  | 1          |
| 0                                    | 0          |

Confidence = 1 + alpha* preference

Alpha is a tuning parameter that separates the confidence between a preference of 1 and 0. A higher alpha would mean that a user with preference of 1 should be placed with higher confidence.


In [42]:
df_users.show()



+---+--------------------+
| id|             starred|
+---+--------------------+
|  1|[redwoodjs/repeat...|
|  2|                null|
|  3|[80374620, 490832...|
|  4|[135159521, 22664...|
|  5|                null|
|  6|[98691746, 996823...|
|  7|[4542716, 1260477...|
| 17|[352160885, 10972...|
| 18|[51980455, 137828...|
| 19|                null|
| 20|                null|
| 21|[161109499, 37724...|
| 22|[cloudflare/lol-h...|
| 23|[319142957, 90042...|
| 25|[rowanwins/mapbox...|
| 26|                null|
| 27|[292396404, 28333...|
| 29|[activeloopai/Hub...|
| 31|                null|
| 34|                null|
+---+--------------------+
only showing top 20 rows



Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)


In [144]:
df_users_subset = df_users.limit(5)

In [145]:
from pyspark.sql.functions import explode, col

df_users_exp = df_users_subset.withColumn("starred", explode(df_users_subset.starred))

In [146]:
df_users_exp_repo_id = df_users_exp.where(df_users_exp.starred.rlike(r'^([\s\d]+)$'))

In [154]:
df_users_exp_repo_id.collect()

[Row(id=3, starred='80374620'),
 Row(id=3, starred='49083258'),
 Row(id=3, starred='324134'),
 Row(id=3, starred='301742'),
 Row(id=3, starred='1200050'),
 Row(id=3, starred='6887813'),
 Row(id=3, starred='7530802'),
 Row(id=3, starred='4489166'),
 Row(id=3, starred='3910314'),
 Row(id=3, starred='1663468'),
 Row(id=3, starred='1262785'),
 Row(id=3, starred='43412'),
 Row(id=3, starred='18294'),
 Row(id=3, starred='130'),
 Row(id=4, starred='135159521'),
 Row(id=4, starred='22664562'),
 Row(id=4, starred='96410740'),
 Row(id=4, starred='117137486'),
 Row(id=4, starred='2034023'),
 Row(id=4, starred='44634245'),
 Row(id=4, starred='32201257'),
 Row(id=4, starred='31674354'),
 Row(id=4, starred='12446895'),
 Row(id=4, starred='724712'),
 Row(id=4, starred='14089735'),
 Row(id=4, starred='17468095'),
 Row(id=4, starred='12364583'),
 Row(id=4, starred='6736646'),
 Row(id=4, starred='5008968'),
 Row(id=4, starred='4897488'),
 Row(id=4, starred='4694255'),
 Row(id=4, starred='4256530'),
 Row

In [149]:
unique_user_ids = df_users_exp_repo_id.select('id').distinct().collect()

                                                                                

In [150]:
unique_user_ids

[Row(id=3), Row(id=4)]

In [151]:
from pyspark.sql import Row

ratings = df_users_exp_repo_id.rdd.map(lambda p: Row(user_id=int(p[0]), repo_id=int(p[1]), rating=1))

In [152]:
from pyspark.mllib.recommendation import ALS

# Build the recommendation model using Alternating Least Squares
rank = 10
num_iterations = 10
model = ALS.trainImplicit(ratings, rank, num_iterations, alpha=0.01)

                                                                                

In [157]:
ratings_query = [(3,2150), (4,1200050)]
columns = ["user_id","repo_id"]
ratings_query_df = spark.createDataFrame(data=ratings_query, schema=columns)

In [158]:
ratings_query_rdd = ratings_query_df.rdd.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(ratings_query_rdd).map(lambda r: ((r[0], r[1]), r[2]))

PythonRDD[1686] at RDD at PythonRDD.scala:53

In [160]:
predictions.collect()

                                                                                

[((4, 1200050), -0.013423285457077938), ((3, 2150), -5.190747893180081e-05)]