* [User -> Users (by language)](#User--%3E-Users-(by-language))
* [Basket analysis](#Basket-analysis:-Starred-repos)

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [None]:
# Notebook-wide Spark session. The default MongoDB input is the `gh.users`
# collection, and the MongoDB Spark connector is requested by its Maven
# coordinates through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/gh.users")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

## User -> Users (by language)

Content-based filtering.

Represent every user as a binary vector with one entry per language, set when the user owns at least one repo in that language.

Use cases:
* Given a user, find similar user IDs.
* Given a list of languages, find similar users.

In [None]:
from pyspark.ml.linalg import SparseVector
from pyspark.sql.functions import collect_list, collect_set
from pyspark.ml.feature import CountVectorizer
from typing import List

In [None]:
# Readability alias: user ids are plain integers.
UserId = int

In [None]:
class SimilarUsersEngine:
    """Content-based lookup of users who share programming languages.

    Every repo owner is described by the set of languages of their repos,
    encoded by a binary ``CountVectorizer`` (one dimension per language).
    Two users are considered similar when the dot product of their vectors
    is positive, i.e. when they have at least one language in common.
    """

    def __init__(self, model, schema, language_counts):
        """Store the fitted vectorizer and the data it was fitted on.

        Args:
            model: Fitted ``CountVectorizer`` model.
            schema: Schema of ``language_counts``.
            language_counts: DataFrame with one row per owner: the owner
                ``id`` and the ``collect_set(language)`` array.
        """
        self.model = model
        self.schema = schema
        self.language_counts = language_counts

    @classmethod
    def start(cls):
        """Load repos from MongoDB and fit the language vectorizer.

        Relies on the module-level ``spark`` session.

        Returns:
            A ready-to-query ``SimilarUsersEngine``.
        """
        repos = (
            spark.read
            .format("com.mongodb.spark.sql.DefaultSource")
            .option("uri", "mongodb://localhost:27017/gh.repos")
            .load()
            .limit(1000)  # only the first 1000 repo documents are used
        )

        # One row per owner holding the *distinct* languages of their repos,
        # e.g. owner 1 -> ["py", "js"] (collect_set drops duplicates).
        language_counts = (
            repos
            .select(["owner.id", "language"])
            .groupBy("id")
            .agg(collect_set("language"))
        )

        cv = CountVectorizer(
            inputCol="collect_set(language)",
            outputCol="features",
            binary=True,
            minDF=0,
        )

        return cls(
            model=cv.fit(language_counts),
            schema=language_counts.schema,
            language_counts=language_counts,
        )

    def find_most_similar_users_by_languages(self, languages: List[str]) -> List[int]:
        """Return the ids of users who share at least one of ``languages``.

        The result is not ranked by similarity: every user whose language
        vector has a positive dot product with the query is returned.

        Args:
            languages: Language names to match against.

        Returns:
            List of matching user ids.

        Raises:
            TypeError: If ``languages`` is not a list.
        """
        # Explicit check instead of `assert`, which is stripped under -O.
        if not isinstance(languages, list):
            raise TypeError(
                f"languages must be a list, got {type(languages).__name__}"
            )

        # Vectorize the query as a single-row DataFrame shaped like the
        # training data; the id 0 is a placeholder and is never returned.
        query = (
            self.model
            .transform(spark.createDataFrame([(0, languages)], self.language_counts.schema))
            .first()
            .features)

        result = self.model.transform(self.language_counts)

        ids = (
            result
            .rdd
            .map(lambda row: (row.id, row.features.dot(query)))
            .filter(lambda scored: scored[1] > 0)
            .map(lambda scored: scored[0])
            .collect()
        )

        return ids

    def find_most_similar_users_by_id(self, user_id: UserId) -> List[int]:
        """Return the ids of users sharing a language with ``user_id``.

        The looked-up user can appear in the result, since their own vector
        matches itself.

        Args:
            user_id: Id of the user whose languages form the query.

        Returns:
            List of matching user ids.

        Raises:
            ValueError: If ``user_id`` has no row in ``language_counts``.
        """
        # Use this instance's data rather than a notebook-global `engine`.
        matches = (
            self.language_counts.rdd
            .filter(lambda row: row.id == user_id)
            .take(1)
        )
        if not matches:
            raise ValueError(f"no languages recorded for user id {user_id!r}")

        # Column 1 of the row is the collected language array.
        languages = matches[0][1]
        return self.find_most_similar_users_by_languages(languages)


In [None]:
# Build the engine: reads repos from MongoDB and fits the language vectorizer.
engine = SimilarUsersEngine.start()

In [None]:
# Ids of users whose language set includes Python.
engine.find_most_similar_users_by_languages(["Python"])

In [None]:
# Ids of users sharing at least one language with user 21.
engine.find_most_similar_users_by_id(21)

In [None]:
def get_language_counts_per_owner(repos_ownership: list) -> dict:
    """Tally, for every owner, how many of their repos use each language.

    Args:
        repos_ownership: Iterable of ``(repo_id, owner_id, language)``
            triples; the repo id is ignored.

    Returns:
        ``{owner_id: {language: number_of_repos}}``.
    """
    tallies = {}

    for _repo_id, owner, lang in repos_ownership:
        # Create the owner's tally on first sight, then bump the language.
        owner_tally = tallies.setdefault(owner, {})
        owner_tally[lang] = owner_tally.get(lang, 0) + 1

    return tallies

def get_repos_per_owner(repos_ownership: list) -> dict:
    """Group repo ids by the owner they belong to.

    Args:
        repos_ownership: Iterable of ``(repo_id, owner_id, language)``
            triples; the language is ignored.

    Returns:
        ``{owner_id: set_of_repo_ids}``.
    """
    owned = {}

    for repo, owner, _lang in repos_ownership:
        # A set per owner, so a repo listed twice is stored once.
        owned.setdefault(owner, set()).add(repo)

    return owned

# Play

In [None]:
# Exploration sample: the first 100 documents of the `gh.repos` collection.
repos = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri","mongodb://localhost:27017/gh.repos")
    .load()
    .limit(100)
)

In [None]:
# The `gh.users` collection, read without any row limit.
users = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri","mongodb://localhost:27017/gh.users")
    .load()
)

Keep only the relevant columns.

In [None]:
# Narrow the repos DataFrame to the columns used in the exploration below.
repos = repos.select([
    "_id",
    "full_name", 
    "description",
    "language",
    "owner",
    "updated_at",       # scope last 5 years
    "fork",             # scope = false
    "stargazers_count", # popularity
    "created_at",       # recency
    "size",             # maturity, complexity
])

# Drop forks. `== False` is intentional: it builds a Spark Column expression,
# which Python's `not` cannot do.
repos = repos.filter(repos.fork == False)

In [None]:
# Rust repos with fewer than 10 stars, shown newest first.
underdogs = (
    repos
    .filter(repos.language == "Rust")
    .filter(repos.stargazers_count < 10)
)

underdogs.sort(underdogs.created_at.desc()).show()

# Basket analysis: Starred repos

In [None]:
# Illustrative shape of a single transaction: the repo ids starred by one
# user. Placeholder strings stand in for real ids so the cell runs on its
# own instead of raising NameError on undefined names.
transaction = ["repoid1", "repoid6", "repoid10"]

In [None]:
import json

# Aggregation stage run on the MongoDB side: keep only users whose first
# `starred` entry is a number.
pipeline = {
    '$match': {'starred.0': {'$type': 'number'}}
}

valid_users = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://localhost:27017/gh.users")
    # Serialize with json.dumps rather than str(): str() emits a Python repr
    # (single quotes, True/False/None), which is not JSON.
    .option("pipeline", json.dumps(pipeline))
    .load()
    .limit(10)
)

# One transaction per user: the set of repo ids that user starred.
transactions = (valid_users
    .select(["_id", "starred"])
    .rdd.map(lambda a: set(a.starred))
    .collect()
)

transactions

In [None]:
# Same transactions with every repo id converted to a string
# (presumably the item type the local `apriori` module expects; confirm).
h = [{str(item) for item in t} for t in transactions]
h

In [None]:
from apriori import generate_frequent_itemsets_id, apriori

In [None]:
%time
rules, _ = apriori(h, 0.1, 0.1, 3)

In [None]:
# Display the rules returned by the local `apriori` run.
rules

In [None]:
from apyori import TransactionManager, gen_support_records, apriori

In [None]:
# Number of distinct starred repos in each user's transaction.
[len(t) for t in transactions]

In [None]:
!pip install apyori

In [None]:
# Display the raw transactions (sets of starred repo ids, one per user).
transactions

In [None]:
%time
transaction_manager = TransactionManager.create(transactions)
support_records = list(gen_support_records(transaction_manager, 0.1, max_length=3))

In [None]:
# `apriori` here is the one imported from `apyori`, which rebinds the name
# imported earlier from the local `apriori` module.
# The result is wrapped in list() to materialize it for display.
list(apriori(transactions, min_support=0.1, min_confidence=0.1))

In [None]:
gene