# CHAOS for GitHub


The following block is just a common boilerplate to ensure the modules are loaded properly:

In [None]:
import os
import sys

# Notebook bootstrap: when the parent of the current working directory is the
# project checkout (a directory named 'chaos'), make the project importable and
# switch into the example folder so the relative 'res/...' paths used by the
# later cells resolve.
root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
_launched_inside_project = os.path.basename(root) == 'chaos'

if _launched_inside_project and root not in sys.path:
    print("Add project root to syspath")
    sys.path.append(root)

if _launched_inside_project:
    print("Change working directory to project gitchaos example")
    os.chdir(os.path.join(root, 'examples/gitchaos'))

    
import chaos

In [None]:
import json
import logging.config
from pathlib import Path

import yaml

from chaos.process.clean.df import DFCleaner
from chaos.process.clean.humanize import TextConverter, ColumnFormatType
from chaos.process.extract.graph import GraphEdgeMapper, GraphPopularityExtractor
from chaos.process.extract.name import NameToGenderExtractor
from chaos.process.extract.nlp import NLPEntityExtractor, NLPTokenExtractor
from chaos.process.extract.reduce import MostUsedExtractor
from chaos.process.pipeline import SequentialDataPipeline
from chaos.recommend.candidates import InteractionCG, DMCandidateRepo, StrategicCG
from chaos.recommend.evaluate.evaluator import LFMEvaluator, PredictionGraphEvaluator, Evaluator
from chaos.recommend.predict.predictor import LFMPredictor
from chaos.recommend.predict.reciprocal import ReciprocalWrapper
from chaos.recommend.translator import LFMTranslator
from chaos.shared.model import DataModel
from examples.gitchaos.extract import GitHubPreprocessor
from examples.gitchaos.fetch import GitHubSource

# Configure logging for the notebook from the YAML file shipped with the
# example; the path is relative to the current working directory.
with open('res/logging.yml') as cfg_stream:
    logging_cfg = yaml.safe_load(cfg_stream)
logging.config.dictConfig(logging_cfg)

# Logger used by the cells below.
logger = logging.getLogger(__name__)


## Data Generation

The ultimate goal of this scenario is to create your own GitHub universe for reciprocal recommendations.
Let's have a look at some GitHub interactions first and how they are weighted approximately within this scenario:

![GitHub interactions](img/interactions-gh.png)

As we have a symmetric interaction (collaboration), GitHub is a good choice for testing our RRS framework.

For this to work, you first need a personal access token, which authorizes you to use the new **GitHub GraphQL API** with up to 5000 requests per hour.
Follow [these instructions](https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token) to get your personal access token **with the scope `user:email`** for fetching user-related info. 

Insert the token below in `YOUR_TOKEN` with your username as `START_NODE`.
If you want a big universe, choose 5000 `NODES`. Beware that this would take a long time.
If you don't want to wait, leave the `NODES` as is.

In [None]:
# --- Crawl parameters --------------------------------------------------------
# Personal access token for the GitHub GraphQL API. A non-empty GITHUB_TOKEN
# environment variable takes precedence, so the secret never has to be saved
# inside the notebook; otherwise replace the "ghp-..." placeholder below.
YOUR_TOKEN = os.environ.get("GITHUB_TOKEN") or "ghp-..."
START_NODE = 'torvalds'  # Change this to your GitHub username
NODES = 500  # Handed to the crawler as max_nodes

BREADTH = 7  # Smaller values will make the algorithm go farther away from your home planet, the start node
VISUALIZE_WITH_AVATARS = True  # See avatars in your universe
FILENAME = f"gh-{START_NODE}@{NODES}"  # For persisting the file for later usage, no need to change

Ready to start **Reciprocal BFS**? Pushing play will start the algorithm and generate a network based on the parameters above. Then, it will persist the model so that you don't lose it in subsequent runs.

This might take some time (about 3-5 minutes with `50 MBit` for `500` nodes, depending on GitHub's response times). The requests are not parallelized, as we do not want to start a DoS or go against GitHub's ToS.

In [None]:
logger.info(f"Start username: {START_NODE} / Breadth: {BREADTH} / Nodes: {NODES}")

# Read the GraphQL query specification through a context manager so the file
# handle is closed right away instead of being left to the garbage collector.
with open('./res/gql-spec.yml') as gql_spec_file:
    gql_spec = yaml.safe_load(gql_spec_file)

src = GitHubSource(gql_spec=gql_spec,
                   token=YOUR_TOKEN,
                   start_user=START_NODE,
                   breadth=BREADTH, max_nodes=NODES)

# Reuse a previously persisted model for this start node / node count if one
# exists; otherwise crawl GitHub and persist the result for later runs.
checkpoint_path = Path(f'res/{FILENAME}')
if checkpoint_path.exists():
    data = DataModel.load(checkpoint_path)
    # Attach the loaded model to the source as well; the avatar helpers are
    # called on `src` later (presumably they read `src.data` — confirm).
    src.data = data
else:
    data = src.source_data()
    data.save(checkpoint_path)
print("Done!")

# The following is recommended to remove nodes that are not fully processed from the graph:
print("Synchronizing user_df with interaction_graph...")
data.sync_graph(inplace=True)
# Uncomment only if you are bored, drawing many nodes will take a long time:
# data.interaction_graph.draw()

### Let's have a look at the acquired data!

In [None]:
# Summarize the data model (presumably prints or returns overview statistics —
# see DataModel.describe to confirm).
data.describe()
# Last expression of the cell: the notebook renders the user table as output.
data.user_df

## Feature Engineering

Next, we define and run a data pipeline to process user profile features, as illustrated in the following image:

![GitHub Feature Engineering Pipeline](img/feat-eng-gh.png)


In [None]:
# Stage 1: prepare the raw profile columns ('bio', 'company', 'location') and
# route entity labels found in the bio to the tag columns named in the mapping.
bio_entity_targets = {'GPE': 'location_tags', 'LOC': 'location_tags', 'LANGUAGE': 'location_tags',
                      'ORG': 'org_tags', 'PRODUCT': 'org_tags', 'NORP': 'org_tags'}
metadata_preparation = SequentialDataPipeline(name='Metadata Preparation Pipeline', processors=[
    GitHubPreprocessor(skills_per_user=25, programming_languages_per_user=6),
    DFCleaner(['bio'], fill_na_val=''),
    DFCleaner(['company', 'location'], str_clean_regex=r'[-.,;+]', fill_na_val=''),
    TextConverter('bio', ColumnFormatType.MARKDOWN),
    NLPEntityExtractor('bio', bio_entity_targets),
])

# Stage 2: one sub-pipeline per tag column (bio, organizations, location, skills).
bio_tagging = SequentialDataPipeline(name='Bio', processors=[
    NLPTokenExtractor('bio', 'bio_tags'),
    MostUsedExtractor('bio_tags', 'bio_tags', usage_threshold=2),
])
organization_tagging = SequentialDataPipeline(name='Organizations', processors=[
    NLPTokenExtractor('company', 'org_tags'),
    MostUsedExtractor('org_tags', 'org_tags', usage_threshold=2),
])
location_tagging = SequentialDataPipeline(name='Location', processors=[
    NLPEntityExtractor('company', {'GPE': 'location_tags'}),
    NLPTokenExtractor('location', 'location_tags'),
    MostUsedExtractor('location_tags', 'location_tags', usage_threshold=2)
])
skill_tagging = SequentialDataPipeline(name='Process skills', processors=[
    NLPEntityExtractor('descriptions', {'%': 'skill_tags'}),
    MostUsedExtractor('skills', 'skill_tags', usage_threshold=2),
    MostUsedExtractor('programmingLanguages', 'skill_tags', top=40, usage_threshold=2),
    MostUsedExtractor('skill_tags', 'skill_tags', top=1000, usage_threshold=2)
])
user_metadata = SequentialDataPipeline(name='User Metadata', processors=[
    bio_tagging,
    organization_tagging,
    location_tagging,
    skill_tagging,
])

# The complete feature-engineering pipeline runs the two stages in order.
pipeline = SequentialDataPipeline([
    metadata_preparation,
    user_metadata,
])

# We can also simply alter the user_df "manually", e.g. to add profile URLs
data.user_df['url'] = data.user_df.index.map(lambda u: f'https://github.com/{u}')
data = pipeline.execute(data)
# Last expression of the cell: the notebook renders the processed user table.
data.user_df

### Processing the Interaction Graph

We do not normalize the edges this time and use the strengths as is. If you are curious, you can also adapt this, e.g. compare with the scenario "Learning Group".

Feel free to experiment!


In [None]:
# Derive edge attributes from the raw interaction strength: cost is its
# inverse, capacity is the strength itself.
# NOTE(review): the cost lambda divides by e.strength — assumes strengths are
# never zero; confirm against the data source.
edge_mapper = GraphEdgeMapper(cost=lambda e: 1 / e.strength, capacity=lambda e: e.strength)

# Write a labelled popularity value into the 'popularity' column, based on the
# listed graph metrics, six labels and seven quantile boundaries.
popularity_extractor = GraphPopularityExtractor(
    target_col='popularity',
    metrics=('eigenvector', 'degree'),
    labels=['unknown', 'less-known', 'normal', 'well-known', 'popular', 'prominent'],
    quantiles=[0.0, 0.1, 0.4, 0.6, 0.8, 0.99, 1.0],
    add_as_node_attrib=True,
)

interaction_pipeline = SequentialDataPipeline(
    name='Graph Manipulations',
    processors=[edge_mapper, popularity_extractor],
)

data = interaction_pipeline.execute(data)
# Last expression of the cell: the notebook renders the new popularity column.
data.user_df['popularity']

## Evaluation

Now that we have processed the original data, we can test a variety of `LFMPredictor` configurations.

You might need to adapt the `epochs` and/or `hp`s.

In [None]:
# Translator without profile features; it provides the interaction matrix for
# the evaluator and backs the pure collaborative-filtering predictor.
translator = LFMTranslator(data)
# Exclude follower re-recommendations with the following CG:
cg = StrategicCG(
    InteractionCG(DMCandidateRepo(data), interaction_pattern='follow', include=False),
    on_unknown_user=DMCandidateRepo(data)
)


hp = {'no_components': 48, 'learning_rate': 0.04}
# Found by using LFMHyperparameterOptimizer with 250 trials on github-xs, f1 metric, typically beats the above hps with less components!
hp_opt = {'no_components': 42, 'learning_rate': 0.0418, 'user_alpha': 1.7007e-05, 'item_alpha': 1.5008e-05}

# Compare several predictor configurations on the same interaction matrix.
evaluator = LFMEvaluator(
    interactions=translator.interaction_matrix,
    predictors={
        'Hybrid all': LFMPredictor(
            LFMTranslator(
                data, features={'bio_tags': 0.4, 'location_tags': 0.2, 'skill_tags': 0.2, 'org_tags': 0.2}
            ), cg, **hp
        ),
        'Hybrid all tuned': LFMPredictor(
            LFMTranslator(
                data, features={'bio_tags': 0.4, 'location_tags': 0.2, 'skill_tags': 0.2, 'org_tags': 0.2}
            ), cg, **hp_opt
        ),
        'Hybrid orgs + bio': LFMPredictor(
            LFMTranslator(
                data, features=['org_tags', 'bio_tags']
            ), cg, **hp
        ),
        # NOTE(review): unlike the hybrid predictors above, no candidate
        # generator (cg) is passed here — confirm this is intended.
        'Collaborative Filtering only': LFMPredictor(translator, **hp_opt)
    }
)
evaluator.run_all(epochs=range(0, 72, 2), metrics=(Evaluator.PRECISION, Evaluator.RECALL, Evaluator.F1))

res = evaluator.best_of_all(Evaluator.PRECISION)
print(f"Best predictor for precision: {res.predictor} @ epoch {res.epoch} with {res.value}")
# Reuse the result computed above instead of querying the evaluator a second
# time with a differently spelled metric identifier; this keeps the printed
# winner and the selected predictor in sync. `hybrid` is an alias used by the
# visualization cell.
best_predictor = hybrid = evaluator[res.predictor]

evaluator.create_report()

## Recommendation

Let's put the best one inside a `ReciprocalWrapper` to make it return reciprocal recommendations for your own user profile.

The following is just a draft. Feel free to change and test anything you want.

In [None]:
# Wrap the selected predictor so that it yields reciprocal recommendations,
# then ask for the top 10 matches of the start user.
print(f"Generating recommendations for you, {START_NODE}...")

reciprocity_factors = dict(ku_factor=8, kv_factor=8)
rrs = ReciprocalWrapper(best_predictor, **reciprocity_factors)
recs = rrs.predict(START_NODE, k=10)

print("Predictions:", recs)
# print(rrs.stats['rank_violations'])

# Look up the profile rows of the recommended users; as the last expression of
# the cell, the resulting table is rendered by the notebook.
recommended_users = recs.keys()
data.user_df.loc[recommended_users]

# Query most similar tags if you want to:
# best_predictor.similar_features(['skill_tags:machine-learning'])

## Visualization

**Finally, let's create your own GitHub universe made out of latent dimensions!**

If launching the *TensorBoard Projector* does not work within your JupyterLab, try to follow the instructions written in the output in a separate terminal.

Otherwise, enjoy exploring! (select `PROJECTOR` in the drop-down after TensorBoard has started)

In [None]:
# Export the user embeddings of the selected predictor for the TensorBoard
# projector, optionally decorated with an avatar sprite image.
if not VISUALIZE_WITH_AVATARS:
    hybrid.visualize('users')
else:
    avatar_dir = Path('temp/avatars/')
    sprite_path = Path(f'temp/avatars/#{FILENAME}.jpeg')

    logger.info("Download avatar images...")
    src.dl_avatars(base_dir=avatar_dir)

    logger.info("Create sprite with all avatar images...")
    # The value returned by the sprite builder is forwarded as the size of a
    # single image inside the sprite.
    dim = src.create_avatar_sprite(avatar_dir, sprite_path)
    hybrid.visualize('users', sprite_path, sprite_single_img_dim=dim,
                     extra_cols={'url'})
    
%load_ext tensorboard
%tensorboard --logdir ./temp