In [1]:
from pprint import pprint

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, types, functions

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

In [2]:
# Start (or reuse) a local Spark session that pulls in the MongoDB Spark
# connector and Spark NLP packages and sets the default MongoDB read/write URIs.
spark = (
    SparkSession.builder
    .appName("reviews")
    .master("local[4]")
    .config("spark.driver.memory", "10g")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    # Default collection for reads (repos) and for writes (users).
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/linked_code.repos")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/linked_code.users")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0,com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.0",
    )
    .getOrCreate()
)

21/10/31 22:11:43 WARN Utils: Your hostname, Kenneths-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.1 instead (on interface en0)
21/10/31 22:11:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/kenneth/Documents/mcomp/CS5344/linked-code/linked_code_venv/lib/python3.7/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/kenneth/.ivy2/cache
The jars for the packages stored in: /Users/kenneth/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-baae80fb-02de-4335-8bb9-716405cba66c;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.0 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.3.0 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code

In [3]:
# Load the repos and users MongoDB collections as Spark DataFrames.
# `repos` is read without an explicit URI, so it uses the reader's default
# configuration; `users` names its collection explicitly.
repos = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .load()
)
users = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://localhost:27017/linked_code.users")
    .load()
)

21/10/31 22:13:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
21/10/31 22:13:34 WARN MongoInferSchema: Array Field 'starred' contains conflicting types converting to StringType


In [4]:
# Project the repos DataFrame down to a handful of columns.
cols_to_keep = ['id', 'name', 'description', 'language']
df = repos.select(*cols_to_keep)

In [5]:
# Project the users DataFrame down to the user id and the starred field.
df_users = users.select('id', 'starred')

In [6]:
# Drop repos whose name or description is missing, or is one of the
# placeholder strings listed in null_values.
null_values = ['nan', 'NA', 'null']

df_filtered = df.filter(
    df['description'].isNotNull()
    & ~df['description'].isin(null_values)
    & df['name'].isNotNull()
    & ~df['name'].isin(null_values)
)

## Compute similarity scores for repo descriptions

#### Set up the SparkNLP pipeline

In [7]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

In [8]:
# Name of the pretrained Spark NLP model handed to RepoQueryPipeline below.
MODEL_NAME = "tfhub_use"

In [9]:
class RepoQueryPipeline():
    """Encode text columns into sentence-embedding matrices with Spark NLP.

    The pipeline chains a DocumentAssembler, a Tokenizer and a pretrained
    UniversalSentenceEncoder. Call ``get_emb_matrix`` with a DataFrame whose
    first column holds the text to encode.
    """

    def __init__(self, model_name, spark_session=None):
        """Create the pipeline stages.

        Args:
            model_name: name passed to ``UniversalSentenceEncoder.pretrained``.
            spark_session: optional session used to create helper DataFrames.
                When omitted, the module-level ``spark`` variable is used, as
                before.
        """
        self._spark_session = spark_session
        # Populated by init_pipeline(); kept as None until then so that
        # _encode_df can detect an unfitted pipeline.
        self.nlp_pipeline = None
        self.light_pipeline = None

        # Transforms the input text into a document usable by the SparkNLP pipeline.
        self.document_assembler = DocumentAssembler()
        self.document_assembler.setInputCol('text')
        self.document_assembler.setOutputCol('document')

        # Separates the text into individual tokens (words and punctuation).
        self.tokenizer = Tokenizer()
        self.tokenizer.setInputCols(['document'])
        self.tokenizer.setOutputCol('token')

        # Encodes the text as a single vector representing semantic features.
        # NOTE(review): the encoder is given both 'document' and 'token' as
        # inputs; confirm against the Spark NLP docs whether it needs 'token'.
        self.sentence_encoder = UniversalSentenceEncoder.pretrained(name=model_name)
        self.sentence_encoder.setInputCols(['document', 'token'])
        self.sentence_encoder.setOutputCol('sentence_embeddings')

    def _get_spark(self):
        """Return the injected session, or the module-level ``spark`` if none was given."""
        session = getattr(self, '_spark_session', None)
        return session if session is not None else spark

    def init_pipeline(self):
        """Assemble the stages into a Pipeline, fit it and wrap it in a LightPipeline."""
        self.nlp_pipeline = Pipeline(stages=[
            self.document_assembler,
            self.tokenizer,
            self.sentence_encoder
        ])

        # Fit the model to an empty data frame so it can be used on inputs.
        empty_df = self._get_spark().createDataFrame([['']]).toDF('text')
        pipeline_model = self.nlp_pipeline.fit(empty_df)
        self.light_pipeline = LightPipeline(pipeline_model)

    def get_similarity(self, emb_matrix):
        """Return ``emb_matrix @ emb_matrix.T``: the dot product of every pair of rows."""
        return np.matmul(emb_matrix, emb_matrix.transpose())

    def _encode_df(self, df):
        """Run ``df`` through the fitted pipeline and return the transformed DataFrame.

        The pipeline is fitted on first use if ``init_pipeline`` has not been
        called yet, instead of failing on a missing attribute.
        """
        if getattr(self, 'light_pipeline', None) is None:
            self.init_pipeline()

        return self.light_pipeline.transform(df)

    def convert_query_repo_desc_to_df(self, query_repo_desc):
        """Wrap a single query string in a one-row DataFrame with a ``text`` column."""
        query_formatted = [(1, query_repo_desc)]
        columns = ["query_num", "text"]

        return (
            self._get_spark()
            .createDataFrame(data=query_formatted, schema=columns)
            .select("text")
        )

    def _extract_emb_matrix(self, encoded_df):
        """Collect ``encoded_df`` and stack each row's first sentence embedding.

        Returns:
            A numpy array built from one embedding per collected row, in
            collection order. Every row is pulled to the driver.
        """
        return np.array([
            row.sentence_embeddings[0].embeddings
            for row in encoded_df.collect()
        ])

    def get_emb_matrix(self, df):
        """Encode the first column of ``df`` and return the embeddings as a numpy array."""
        # Rename the column to match the pipeline model
        df = df.withColumnRenamed(df.columns[0], 'text')

        encoded_df = self._encode_df(df)
        return self._extract_emb_matrix(encoded_df)

#### Initialize the SparkNLP pipeline

In [10]:
# Create the pipeline stages (this loads the pretrained encoder named by
# MODEL_NAME), then fit the pipeline so it can encode DataFrames.
repo_q_pl = RepoQueryPipeline(MODEL_NAME)
repo_q_pl.init_pipeline()

tfhub_use download started this may take some time.


21/10/31 22:13:54 WARN BasicProfileConfigLoader: Your profile name includes a 'profile ' prefix. This is considered part of the profile name in the Java SDK, so you will need to include this prefix in your profile name when you reference this profile from your Java code.
21/10/31 22:13:54 WARN BasicProfileConfigLoader: Your profile name includes a 'profile ' prefix. This is considered part of the profile name in the Java SDK, so you will need to include this prefix in your profile name when you reference this profile from your Java code.


Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ — ]Download done! Loading the resource.
[ | ]

2021-10-31 22:14:18.679690: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


#### Select the number of repositories to query against

In [11]:
# Maximum number of rows taken from df_filtered for embedding and lookup.
REPOSITORY_SIZE = 1000

#### Encode the repositories' description text to generate embeddings

In [12]:
# Take up to REPOSITORY_SIZE filtered repos and keep only their description text.
df_subset = df_filtered.limit(REPOSITORY_SIZE).select('description')

In [13]:
# Encode every description in df_subset; the result holds one embedding per row
# and is collected to the driver as a numpy array.
embs_matrix = repo_q_pl.get_emb_matrix(df_subset)

                                                                                

#### Enter the query repository description

In [14]:
# Free-text description to search for; it is encoded below and matched
# against the repository embeddings.
repo_q_desc = "Python machine learning"

#### Encode the query repo description

In [15]:
# Wrap the query string in a one-row DataFrame and encode it with the same
# pipeline used for the repository descriptions.
repo_q_desc_df = repo_q_pl.convert_query_repo_desc_to_df(repo_q_desc)
repo_q_desc_emb_matrix = repo_q_pl.get_emb_matrix(repo_q_desc_df)

                                                                                

#### Initialize the Annoy nearest-neighbours index

In [16]:
from annoy import AnnoyIndex

class AnnoyIdx():
    """Thin wrapper around an ``AnnoyIndex`` for nearest-neighbour lookups."""

    def __init__(self, embedding_size, dist_measure):
        """Create an empty index.

        Args:
            embedding_size: length of every vector that will be added.
            dist_measure: metric name passed to ``AnnoyIndex`` (e.g. "angular").
        """
        self.t = AnnoyIndex(embedding_size, dist_measure)

    def build(self, embs_matrix, n_trees=10):
        """Add every row of ``embs_matrix`` and build the index.

        Args:
            embs_matrix: iterable of vectors; the item id of each vector is its
                position in the iteration order.
            n_trees: number of trees passed to ``AnnoyIndex.build``. Defaults
                to 10, the value previously hard-coded here.
        """
        for item_id, emb in enumerate(embs_matrix):
            self.t.add_item(item_id, emb)
        self.t.build(n_trees)

    def query(self, query_embedding, num_nbrs, inc_dist=False):
        """Return the ``num_nbrs`` nearest item ids to ``query_embedding``.

        ``inc_dist`` is forwarded as ``include_distances`` to
        ``AnnoyIndex.get_nns_by_vector``.
        """
        return self.t.get_nns_by_vector(query_embedding, num_nbrs, include_distances=inc_dist)

In [17]:
# Build the nearest-neighbour index over the repository embeddings and look up
# the 3 closest repositories to the query.
# The vector length is taken from the query embedding rather than hard-coded,
# so the cell keeps working if the encoder's output size changes.
annoy_idx = AnnoyIdx(len(repo_q_desc_emb_matrix[0]), "angular")
annoy_idx.build(embs_matrix)
# With inc_dist=True the result is indexed below as result[0] (item ids) and
# result[1] (distances).
result = annoy_idx.query(repo_q_desc_emb_matrix[0], 3, inc_dist=True)

#### Retrieve the most relevant repositories given the input query repository description 

In [28]:
# Map each row position to [description, name] so that Annoy item ids can be
# translated back to repositories.
# NOTE(review): this evaluates df_filtered.limit(...) again, separately from
# df_subset; it assumes both evaluations return the same rows in the same
# order -- confirm, or materialise the slice once and reuse it.
id_to_desc_name = dict(
    enumerate(
        [row["description"], row["name"]]
        for row in df_filtered.limit(REPOSITORY_SIZE).collect()
    )
)

In [38]:
# Report each neighbour: its rank, repository name, description and distance.
print("Relevant repos are:")
for i, idx in enumerate(result[0]):
    # One print per hit; sep="\n" reproduces the line-per-field layout and the
    # trailing "\n" item yields the blank lines between hits.
    print(
        i,
        f"name: {id_to_desc_name[idx][1]}",
        f"description: {id_to_desc_name[idx][0]}",
        f"distance: {result[1][i]}",
        "\n",
        sep="\n",
    )

Relevant repos are:
0
name: dumbo
description: Python module that allows one to easily write and run Hadoop programs.
distance: 0.614687979221344


1
name: ocaml-clustering
description: Collection of clustering algorithms written in Ocaml
distance: 0.6442485451698303


2
name: crapvine
description: A python implementation of Grapevine
distance: 0.6676327586174011


