* [User -> Users (by language)](#User--%3E-Users-(by-language))

In [1]:
import time
from typing import List

from pyspark.ml.linalg import SparseVector
from pyspark.sql.functions import collect_list, collect_set
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
from pyspark.rdd import RDD
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SparkSession

In [2]:
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/gh.users")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

21/10/31 15:59:03 WARN Utils: Your hostname, 15in.local resolves to a loopback address: 127.0.0.1; using 192.168.1.7 instead (on interface en0)
21/10/31 15:59:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/raimibinkarim/.local/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/raimibinkarim/.ivy2/cache
The jars for the packages stored in: /Users/raimibinkarim/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a4f4c7b3-b709-48f3-8886-ce340678dde9;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.0 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 202ms :: artifacts dl 11ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules          

## User -> Users (by language)

Content-based filtering.

Represent every user as a vector of language counts.

Use cases:
* Given a user, find similar user IDs.
* Given a list of languages, find similar users.

In [31]:
repos = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri","mongodb://localhost:27017/gh.repos")
    .load()
)

In [82]:
class SimilarUsersEngine:
    
    def __init__(self, model: CountVectorizerModel, schema, index: RDD):
        self.model = model
        self.schema = schema
        self.index = index
        
    @classmethod
    def start(cls, binarize: bool, limit: int = None):
        """
        1. Load the repos data.
        2. Count the languages for every user.
        3. Fit to CountVectorizer
        """
        repos = (
            spark.read
            .format("com.mongodb.spark.sql.DefaultSource")
            .option("uri","mongodb://localhost:27017/gh.repos")
            .load()
        )
        if limit:
            repos = repos.limit(limit)

        language_counts = (
            repos
            .select(["owner.login","language"])
            .groupBy("login")
            .agg(collect_set("language"))
        )

        cv = CountVectorizer(
            inputCol="collect_set(language)",
            outputCol="features",
            binary=binarize,
            minDF=0,
        )
        model = cv.fit(language_counts)
    
        index = (
            model
            .transform(language_counts)
            .select(["login", "features"])
            .rdd
            .map(lambda row: (row[0],row[1]))
            .persist(StorageLevel.MEMORY_AND_DISK)
        )
        
        return cls(
            model=model,
            schema=language_counts.schema,
            index=index,
        )

    def get_user_vector(self, login: str):
        try:
            query_user = self.index.filter(lambda user: user[0] == login).first()
        except ValueError:
            raise KeyError

        return query_user
        

    def find_most_similar_users_by_languages(
        self, 
        languages: List[str], 
        k: int = 5,
        similarity=jaccard,
    ) -> list:
        
        assert isinstance(languages, list)
        
        query: SparseVector = (
            self.model
            .transform(spark.createDataFrame([(0, languages)], self.schema))
            .first()
            .features
        )
        
        ids = (
            self.index
            .mapValues(similarity)
            .top(k, key=lambda x: x[1])
        )

        return ids
    
    def find_most_similar_users_by_login(
        self, 
        login: str, 
        k: int = 5, 
        similarity=jaccard,
    ) -> list:
        
        _, query = self.get_user_vector(login)
        
        ids = (
            self.index
            .mapValues(similarity)
            .top(k, key=lambda x: x[1])
        )

        return ids

In [83]:
def jaccard(user, query):
    return ((user.values > 0) * (query.values > 0)).sum()

def euclidean(user, query):
    return -user.squared_distance(query)

def cosine(user, query):
    dist = user.squared_distance(query)
    return dist / (user.norm(2)*query.norm(2))

No. of users with repositories

In [30]:
engine.index.count()

                                                                                

445962

In [27]:
a = time.time()
engine = SimilarUsersEngine.start(binarize=True)
b = time.time()
print(b-a)



98.58094573020935


In [61]:
x = SparseVector(3, {0:10,1:10,2:8})
u = SparseVector(3, {0: 0,1:10,2:8})
v = SparseVector(3, {0:10,1:10,2:8})
w = SparseVector(3, {0:10,1: 9,2:0})

In [81]:
engine.find_most_similar_users_by_languages(
    ["Python", "Kotlin", "Rust", "JavaScript"]
)

                                                                                

[('getsentry', 4.0),
 ('microsoft', 4.0),
 ('TheAlgorithms', 4.0),
 ('discord', 4.0),
 ('pact-foundation', 4.0)]

In [26]:
a = time.time()
engine.find_most_similar_users_by_login(login="remykarem")
b = time.time()
print(b-a)



7.480223894119263


