In [None]:
import os

import transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import transformers

from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator 

import sentence_transformers
from sklearn import preprocessing

from sklearn import model_selection


plt.style.use("dark_background")
%matplotlib inline

In [None]:
%cd ..

In [None]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
pd.set_option("display.max_colwidth", 150)

In [None]:
doc_id_t5_path = "output/doc_id_generation_model/best_checkpoint/"

In [None]:
model = transformers.T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base-multi-sum")#("output/doc_id_generation_model/best_checkpoint/")

In [None]:
#tokenizer = transformers.AutoTokenizer.from_pretrained("output/doc_id_generation_model/best_checkpoint/")

In [None]:
#model = model.cuda().half()

In [None]:
imports_df = pd.read_feather("output/selected_python_files_imports.feather")
files_df = pd.read_feather("output/selected_python_files.feather")

In [None]:
def get_predicted_path_summary(texts, n_beams=16, max_length=256, max_label_length=64, min_length=32):
    """Generate one decoded string per input text with beam search.

    Uses the notebook-level `tokenizer` and `model` objects.

    Args:
        texts: text (or list of texts) passed to the tokenizer.
        n_beams: number of beams used by `model.generate`.
        max_length: inputs are truncated and padded to exactly this many tokens.
        max_label_length: upper bound on the generated sequence length.
        min_length: lower bound on the generated sequence length.

    Returns:
        The generated sequences decoded to strings, special tokens removed.
    """
    encoded = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model weights.
    device = model.device
    generated_ids = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        length_penalty=0.8,
        num_beams=n_beams,
        max_length=max_label_length,
        min_length=min_length,
    )
    return tokenizer.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

In [None]:
import datasets
from functools import partial
import ast
from github_search import seq2seq_utils

In [None]:
paperswithcode_df = pd.read_csv("output/papers_with_readmes.csv")

In [None]:
imports_df = pd.read_feather("output/selected_python_files_imports.feather")
files_df = pd.read_feather("output/selected_python_files.feather")

In [None]:
repo_tasks = paperswithcode_df['tasks'].apply(lambda ts: ", ".join(ast.literal_eval(ts)))
repo_tasks = pd.DataFrame({"repo": paperswithcode_df['repo'], "tasks": repo_tasks})

In [None]:
files_with_tasks_df = repo_tasks.merge(files_df, on='repo')

In [None]:
seq2seq_dataset = datasets.load_from_disk("output/seq2seq_hf_dataset/")

In [None]:
paperswithcode_area_ds= datasets.Dataset.from_pandas(pd.read_csv("data/paperswithcode_tasks.csv").dropna()[['area', 'task_description']])

In [None]:
files_with_tasks_df.head()

In [None]:
plbart_str = "uclanlp/plbart-single_task-en_python"#uclanlp/plbart-python-en_XX"

In [None]:
cross_encoder = sentence_transformers.cross_encoder.CrossEncoder("microsoft/codebert-base", max_length=512, num_labels=1)

In [None]:
sample_files_with_tasks_df = files_with_tasks_df.iloc[::25].reset_index()

In [None]:
sample_files_with_tasks_df.head()

In [None]:
sample_files_with_tasks_df['tasks'].str.split(", ").explode().value_counts()

In [None]:
sample_files_with_tasks_df['repo'].value_counts()

In [None]:
class TextPairDataPreprocessor:
    """Turns DataFrame rows into text pairs wrapped in `InputExample` objects.

    Each side of the pair is built from a list of columns. The values of those
    columns are joined with newlines, optionally prefixed with a
    ``# <column name>: `` prompt per column.
    """

    def __init__(self, first_text_columns, second_text_columns, add_first_text_columns_prompts=True, add_second_text_columns_prompts=False):
        """Store the column lists and precompute the format template of each side.

        Args:
            first_text_columns: columns whose values form the first text.
            second_text_columns: columns whose values form the second text.
            add_first_text_columns_prompts: prefix first-text values with ``# <col>: ``.
            add_second_text_columns_prompts: prefix second-text values with ``# <col>: ``.
        """
        self.first_text_columns = first_text_columns
        self.second_text_columns = second_text_columns
        self.first_interpolation_str = self.get_interpolation_str(first_text_columns, add_first_text_columns_prompts)
        self.second_interpolation_str = self.get_interpolation_str(second_text_columns, add_second_text_columns_prompts)

    @staticmethod
    def _escape_braces(text):
        """Double literal braces so `str.format` emits them instead of parsing them."""
        return str(text).replace("{", "{{").replace("}", "}}")

    def get_interpolation_str(self, columns, add_prompt):
        """Return a `str.format` template with one ``{}`` slot per column.

        Args:
            columns: column names, one output line per column.
            add_prompt: when true each line is ``# <column>: {}``, otherwise just ``{}``.
        """
        if add_prompt:
            # Column names are literal text in the template; without escaping, a name
            # containing "{" or "}" would be parsed as a replacement field by format().
            return "\n".join(f"# {self._escape_braces(col)}: " + "{}" for col in columns)
        return "\n".join("{}" for __ in columns)

    def __repr__(self):
        # Indent every template line with a tab so the pattern stands out in the output.
        first_pretty_interpolation_str = "\t" + self.first_interpolation_str.replace("\n", "\n\t")
        second_pretty_interpolation_str = "\t" + self.second_interpolation_str.replace("\n", "\n\t")
        return (f"{self.__class__.__name__}\n" +
            f"first text columns: {self.first_text_columns}\n" +
            f"second text columns: {self.second_text_columns}\n" +
            f"pattern:\n {first_pretty_interpolation_str}\n{second_pretty_interpolation_str}"
        )

    def prepare_input_examples(self, df, label):
        """Build one `InputExample` per row of `df`, all carrying the same `label`.

        Args:
            df: frame containing every column named in the two column lists.
            label: label attached to every produced example.

        Returns:
            A list of `sentence_transformers.InputExample`, in row order.
        """
        # Row-wise tuples of the values feeding each side of the pair.
        first_rows = zip(*[df[col] for col in self.first_text_columns])
        second_rows = zip(*[df[col] for col in self.second_text_columns])
        return [
            sentence_transformers.InputExample(
                texts=[
                    self.first_interpolation_str.format(*first_values),
                    self.second_interpolation_str.format(*second_values),
                ],
                label=label,
            )
            for first_values, second_values in zip(first_rows, second_rows)
        ]

In [None]:
repo_path_task_content_preprocessor = TextPairDataPreprocessor(first_text_columns=["repo", "path", "tasks"], second_text_columns=["content"])

In [None]:
repo_path_task_content_preprocessor

In [None]:
repo_path_task_content_preprocessor.prepare_input_examples(sample_files_with_tasks_df, 1)[0].texts

In [None]:
positive_input_examples = repo_path_task_content_preprocessor.prepare_input_examples(sample_files_with_tasks_df, 1)

In [None]:
sample_files_with_permuted_tasks_df = sample_files_with_tasks_df.copy()

In [None]:
# Shuffle the task strings across rows to create mismatched (negative) pairs.
# A fixed seed keeps the negatives identical between runs, matching the seeded split below.
# NOTE(review): a shuffled row can receive its own (or an identical) task string and would
# then be labelled negative although it is a true pair — TODO confirm whether to filter these.
sample_files_with_permuted_tasks_df['tasks'] = sample_files_with_tasks_df['tasks'].sample(len(sample_files_with_tasks_df), random_state=0).reset_index(drop=True)

In [None]:
negative_input_examples = repo_path_task_content_preprocessor.prepare_input_examples(sample_files_with_permuted_tasks_df, 0)

In [None]:
input_examples = positive_input_examples + negative_input_examples
input_example_labels = np.ones(len(input_examples))
input_example_labels[len(positive_input_examples):] = 0

In [None]:
train_input_examples, test_input_examples = model_selection.train_test_split(input_examples, stratify=input_example_labels, test_size=0.2, random_state=0)

In [None]:
evaluator = CEBinaryClassificationEvaluator.from_input_examples(test_input_examples)

In [None]:
full_texts = pd.Series([" ".join(ie.texts) for ie in input_examples])

In [None]:
full_texts = files_with_tasks_df['content']

In [None]:
train_dataloader = torch.utils.data.DataLoader(train_input_examples, shuffle=True, batch_size=8)

In [None]:
transformers.logging.set_verbosity_error()

In [None]:
evaluator(cross_encoder)

In [None]:
cross_encoder.fit(train_dataloader,
  epochs=5,
  evaluator=evaluator,
  use_amp=True,
  callback=lambda score, epoch, steps: print("epoch {} score: {}".format(epoch, round(score, 3)))
)

In [None]:
cross_encoder.save("output/sbert/cross_encoder_repo_path_task_10k")

In [None]:
cross_encoder

In [None]:
len(positive_input_examples)

In [None]:
test_input_examples

In [None]:
test_predicted_scores = cross_encoder.predict([ex.texts for ex in test_input_examples])

In [None]:
test_true_scores = np.array([ex.label for ex in test_input_examples])

In [None]:
import seaborn as sns

In [None]:
sns.distplot(pd.DataFrame({"test_score_residual": test_true_scores - test_predicted_scores}))

In [None]:
test_texts = [ex.texts for ex in test_input_examples]

In [None]:
test_texts[4]

In [None]:
test_predicted_scores

In [None]:
test_tasks = [ex[0].split("tasks: ")[-1] for ex in test_texts]

In [None]:
test_results_df = pd.concat([
    pd.DataFrame.from_records(test_texts, columns=["text1", "text2"]),
    pd.Series(test_tasks, name="tasks").str.split(", "),
    pd.Series(test_predicted_scores, name="model_score"),
    pd.Series(test_true_scores, name="is_positive"),
],
axis=1)

In [None]:
# Keep only the positive pairs. `is_positive` holds numeric 0/1 labels, so it has to be
# compared explicitly to obtain a boolean row mask; indexing with the raw numeric Series
# would be interpreted as a list of column labels.
test_results_df[test_results_df['is_positive'] == 1]

In [None]:
# Mean model score and number of examples per task, for positive pairs only.
# Aggregate `model_score` alone: the frame also holds the text columns, for which "mean" is undefined.
test_results_df[test_results_df['is_positive'] == 1].explode("tasks").groupby("tasks")[["model_score"]].agg(["mean", "count"]).sort_values(("model_score", "mean"), ascending=False)

In [None]:
# Histogram of the per-task mean model score over positive pairs.
# Aggregate `model_score` alone: the frame also holds the text columns, for which "mean" is undefined.
test_results_df[test_results_df['is_positive'] == 1].explode("tasks").groupby("tasks")[["model_score"]].agg(["mean", "count"])[("model_score", "mean")].plot.hist()

In [None]:
repo_path_task_preprocessor = TextPairDataPreprocessor(first_text_columns=["repo", "path"], second_text_columns=["tasks"])
repo_path_content_task_preprocessor = TextPairDataPreprocessor(first_text_columns=["repo", "path", "content"], second_text_columns=["tasks"])

In [None]:
repo_path_content_task_predictions = cross_encoder.predict([ex.texts for ex in repo_path_content_task_preprocessor.prepare_input_examples(files_with_tasks_df[1::25], 1)])

In [None]:
repo_path_task_predictions = cross_encoder.predict([ex.texts for ex in repo_path_task_preprocessor.prepare_input_examples(files_with_tasks_df[1::25], 1)])

In [None]:
[ex.texts for ex in repo_path_task_preprocessor.prepare_input_examples(files_with_tasks_df[:1:25], 1)]

# repo + path  ~ tasks

In [None]:
pd.Series(repo_path_task_predictions).plot.hist()
pd.Series(repo_path_task_predictions).describe()

# repo + path + content ~ tasks

In [None]:
pd.Series(repo_path_content_task_predictions).plot.hist()
pd.Series(repo_path_content_task_predictions).describe()

In [None]:
sentence_transformers.SentenceTransformer("output/sbert/cross_encoder_repo_path_task_10k/")

In [None]:
files_with_tasks_df.head()

In [None]:
# Score a hand-written task prompt against the second text (file content) of the first
# positive example. The examples are InputExample objects, so the pair is read via `.texts`.
cross_encoder.predict([
    ["# tasks: neural networks \n", positive_input_examples[0].texts[1]]
])

In [None]:
pairs_without_content =[[f"#repo: {repo}\n #path: {path}", f"# tasks: {tasks}"]
    for (repo, tasks, path) in zip(
        sample_files_with_tasks_df['repo'],
        sample_files_with_tasks_df['tasks'],
        sample_files_with_tasks_df['path'])]

In [None]:
pairs_without_content[7]

In [None]:
repo_path_task_scores = cross_encoder.predict([
    [f"#repo: {repo}\n #path: {path}", f"#tasks: {tasks}"]
    for (repo, tasks, path) in zip(
        sample_files_with_tasks_df['repo'],
        sample_files_with_tasks_df['tasks'],
        sample_files_with_tasks_df['path'])]
)


In [None]:
repo_path_content_scores = cross_encoder.predict([
    [f"#repo: {repo}\n #path: {path}", f"#content: {content}"]
    for (repo, content, path) in zip(
        sample_files_with_tasks_df['repo'],
        sample_files_with_tasks_df['content'],
        sample_files_with_tasks_df['path'])]
)


In [None]:
pd.Series(repo_path_content_scores).describe()

In [None]:
pd.Series(repo_path_task_scores).describe()

In [None]:
# Show the 20 sampled files with the highest "score".
# NOTE(review): no visible cell adds a "score" column to sample_files_with_tasks_df; unless the
# loaded feather file already provides one, this raises KeyError. The scores computed in this
# notebook live in repo_path_task_scores / repo_path_content_scores — TODO confirm which was meant.
sample_files_with_tasks_df.sort_values("score", ascending=False).head(20)

In [None]:

__, tokenizer = seq2seq_utils.get_seq2seq_model_with_tokenizer("Salesforce/codet5-base-multi-sum")

In [None]:
#tokenizer = transformers.RobertaTokenizerFast.from_pretrained("Salesforce/codet5-base-multi-sum") 

In [None]:
decoder_start_token_id = tokenizer("<PATH_TASK_SEP>", add_special_tokens=False)['input_ids'][0]

In [None]:
max_length = 64
inputs = tokenizer(example_contents, max_length=max_length,  truncation=True,
                        padding="max_length", return_tensors="pt")

In [None]:
outputs = model.generate(input_ids=inputs["input_ids"].to(model.device),
             attention_mask=inputs["attention_mask"].to(model.device),
             length_penalty=0.8, num_beams=10, max_length=64, min_length=8)#, decoder_start_token_id=decoder_start_token_id)

In [None]:
tokenizer.decode(outputs[7].tolist())

In [None]:
output_paths = tokenizer.batch_decode(outputs.cpu(), skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [None]:
predicted_doc_ids_df = pd.DataFrame.from_records([p.split("<PATH_TASK_SEP>") for p in output_paths], columns=["path", "tasks"])

In [None]:
example_doc_ids_df = pd.DataFrame.from_records([p.split("<PATH_TASK_SEP>") for p in example_doc_idxs], columns=["path", "tasks"]) 

In [None]:
predicted_doc_ids_df['tasks']

In [None]:
example_doc_ids_df['tasks']

In [None]:
model_inputs = [content + " predict docID: " for (content, doc_id) in zip(example_contents, example_doc_idxs)]

In [None]:
model_inputs[1]

In [None]:
sru_contents = '''

import torch
from torch import nn
from typing import List
import os
import json
import sru


rnn_class_type_mapping = {"lstm": nn.LSTM, "sru": sru.SRU}


class SentenceRNN(nn.Module):
    """
    sentence_transformers RNN wrapper
    """
'''

In [None]:
dimension_reduction_contents = '''

import numpy as np
import tqdm
from sklearn import decomposition


class IncrementalHyperbolicMDS:
    def __init__(self, n_components, dtype="float16"):
        self.ipca = decomposition.IncrementalPCA(n_components=n_components)
        self.dtype = dtype

    def partial_fit(self, D):
        Y = -np.cosh(D)
        self.ipca.partial_fit(Y)
'''

In [None]:
recommender_contents = '''

import scipy
import pandas as pd
import numpy as np


from sklearn import compose, feature_extraction, metrics
from functools import reduce, partial
import attr
from typing import Union
import umap
import altair


from game_recommender import steam_data
'''# + " predict docID: lambdaofgod <REPO_NAME_SEP> mlutil <REPO_PATH_SEP> mlutil/recommendation.py"

In [None]:
text_mining_content = '''
def get_wordnet_similarity(
    word, another_word, similarity_method="resnik", pos=None, ic=None
):
    if ic is None:
        ic = wordnet_ic.ic("ic-semcor.dat")
    assert similarity_method in [
        "lin",
        "jcn",
        "resnik",
    ], "Unsupported similarity method: " + str(similarity_method)
    word_synset = wn.synsets(word, pos)[0]
    another_word_synset = wn.synsets(another_word, pos)[0]
    if similarity_method == "lin":
        return word_synset.lin_similarity(another_word_synset, ic)
    elif similarity_method == "jcn":
        return word_synset.jcn_similarity(another_word_synset, ic)
    else:
        return word_synset.res_similarity(another_word_synset, ic)
'''

In [None]:
zsl_content = '''
import numpy as np
import attr
from toolz import partial
from scarce_learn.zero_shot import zsl_base

from sklearn import preprocessing
import torch
from torch import nn, optim
from scarce_learn.zero_shot import torch_util


class DEVISELayer(nn.Module):

    def __init__(self, n_features, n_class_features, margin, init_weights_std=0.1):
        super(DEVISELayer, self).__init__()
        init_weights = init_weights_std * torch.randn(n_features, n_class_features) 
        self.weights = nn.Parameter(data=init_weights.cuda())
        self.margin = margin

    def forward(self, X, y, label_embeddings):
        loss = torch.Tensor([0]).cuda()
        for i in range(X.shape[0]):
            loss += self._devise_loss(X[i], y[i], label_embeddings)
        return loss

    def _devise_loss(self, embedding, label, label_embeddings):
        indicator = torch.ones(label_embeddings.shape[0], dtype=bool)
        indicator[label] = 0
        per_class_loss = torch_util.similarity_based_hinge_loss(self.weights, embedding, label, label_embeddings)
        return nn.ReLU()(self.margin + per_class_loss).sum()

    def predict(self, X, label_embeddings):
        class_similarities = torch_util.bilinear_feature_similarity(self.weights, X, label_embeddings)
        return torch.argmax(class_similarities, axis=1)

'''

In [None]:
evolutionary_content = '''
import attr
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter
import pandas as pd
import tqdm
import logging


try:
    import numba
except ImportError as e:
    logging.warning(
        "numba not found, you'll not be able to use mlutil.evolutionary_algorithms.multiobjective"
    )


def bounded_gaussian_noise_mutation(x, n_mutants, lo=0, hi=1, sigma=1e-2):
    noise = sigma * np.random.randn(n_mutants, x.shape[-1])
    return np.clip(x + noise, lo, hi)


@attr.s
class NSGAII:

    optimized_function = attr.ib()
    chromosome_size: int = attr.ib()
    mutation_function = attr.ib(default=bounded_gaussian_noise_mutation)
    random_initializer = attr.ib(default=np.random.rand)
    population_bounds = attr.ib(default=(0, 1))
    objective_names = attr.ib(default=("1st objective", "2nd objective"))
'''

In [None]:
optimal_transport_content = '''
import numpy as np
import ot
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_distances


def get_stem_vectors(filtered_stems, keyed_vectors):
    return np.vstack(
        [
            np.mean([keyed_vectors[w] for w in stem_list], axis=0)
            for stem_list in filtered_stems
            if len(stem_list) > 0
        ]
    )


def get_word_vector_optimal_transport(
    word_vectors1, word_vectors2, ot_method=ot.sinkhorn, reg=0.01, normalize_dists=True
):
    cost = cosine_distances(word_vectors1, word_vectors2)
    height, width = cost.shape
    a = np.ones(height)
    b = np.ones(width)
    if normalize_dists:
        a = a / a.sum()
        b = b / b.sum()
    ot_matrix = ot_method(a, b, cost, reg=reg)
    return ot_matrix, (ot_matrix * cost).sum()

'''

In [None]:
#model = model.cuda().half()

In [None]:
optimal_transport_content

In [None]:
model.device

In [None]:
path_content = " predict docID: lambdaofgod <REPO_NAME_SEP> mlutil <REPO_PATH_SEP> mlutil/recommendation.py"

In [None]:
# Beam-search a continuation for the docID prompt and decode the best beam.
# NOTE: top_k / top_p are sampling options of model.generate, not of tokenizer.decode, so they
# are not passed to decode here. To sample instead of beam-searching, pass them to
# model.generate together with do_sample=True.
tokenizer.decode(
    model.generate(**(tokenizer(path_content, return_tensors="pt").to(model.device)),
    num_beams=5, max_length=128, min_length=32)[0].tolist()
)

In [None]:
paths = ["_nbdev.py", "haystack_search.py", "rss_feeds.py", "zero_shot_learning.py"]

In [None]:
paths_series = "lambdaofgod <REPO_NAME_SEP> pytorch_hackathon <REPO_PATH_SEP> pytorch_hackathon <PATH_TASK_SEP> " + pd.Series(paths)

In [None]:
tokenizer.decode(
    model.generate(**(tokenizer(paths_series[3], return_tensors="pt").to(model.device)),
    num_beams=10, max_length=128, min_length=16)[0].tolist(), #, top_p=0.8
)

In [None]:
example = seq2seq_dataset[:32]

In [None]:
import torch

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over axis 1, counting only unmasked positions.

    Args:
        model_output: indexable whose first item holds the token embeddings.
        attention_mask: mask with one dimension fewer than the embeddings; it is
            broadcast over the trailing embedding axis. Positions with 0 are ignored.

    Returns:
        The masked sum of embeddings divided by the masked token count.
    """
    hidden_states = model_output[0]
    # Broadcast the mask to the embedding shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # Clamp the count so a fully masked row divides by a tiny number instead of zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

In [None]:
model = model.cuda()

In [None]:
def embed_text(examples, tokenize=False):
    """Embed a batch with the model's encoder followed by mean pooling.

    Uses the notebook-level `tokenizer`, `model` and `mean_pooling`.

    Args:
        examples: batch mapping. With `tokenize=True` it must provide "text";
            otherwise it must provide pre-tokenized "input_ids" and "attention_mask".
        tokenize: whether to tokenize `examples["text"]` here (truncated to 128 tokens).

    Returns:
        Dict with key "embedding" holding the pooled embeddings as a NumPy array.
    """
    if tokenize:
        # Move the freshly tokenized batch to the model's device; otherwise the encoder
        # receives CPU tensors once the model has been moved to the GPU.
        inputs = tokenizer(examples["text"], padding=True, truncation=True,
                           max_length=128, return_tensors="pt").to(model.device)
    else:
        inputs = {
            "input_ids": torch.tensor(examples["input_ids"]).to(model.device),
            "attention_mask": torch.tensor(examples["attention_mask"]).to(model.device),
        }
    # Inference only: no gradients are needed for the encoder pass or the pooling.
    with torch.no_grad():
        model_output = model.encoder(**inputs)
        pooled_embeds = mean_pooling(model_output, inputs["attention_mask"])
    return {"embedding": pooled_embeds.cpu().numpy()}

In [None]:
# Embed a 10,000-example subset of the dataset in batches of 128.
# The split is seeded so the same subset (and therefore the same index) is built on every run.
embs_dataset = seq2seq_dataset.train_test_split(test_size=10000, seed=0)['test'].map(embed_text,
    batched=True,
    batch_size=128
)

In [None]:
embs_dataset[0].keys()#['labels'][0]

In [None]:
embs_dataset.add_faiss_index("embedding")

In [None]:
tokenizer.tokenize("metric learning", return_tensors="pt")

In [None]:
type(embs_dataset[0]['embedding'])

In [None]:
query_text = "implements zero-shot learning"

In [None]:
query_emb = model.encoder(**tokenizer(query_text, return_tensors="pt").to(model.device))

In [None]:
query_emb = query_emb.last_hidden_state.mean(axis=1)[0].to("cpu").detach().numpy()

In [None]:
import numpy as np

In [None]:
#query_emb = np.array(embs_dataset[0]['embedding'])

In [None]:
files_df.iloc[0]['content']

In [None]:
files_sample = files_with_tasks_df.iloc[::500]

In [None]:
files_sample.head()

In [None]:
[embs_dataset["contents"][i] for i in list(embs_dataset.search("embedding", query=query_emb.astype("float32")).indices)]

In [None]:
embs_dataset

In [None]:
import torch

In [None]:
example_imports

In [None]:
model.generate(**(tokenizer(example_imports, return_tensors="pt").to(model.device)),
    num_beams=20, max_length=128, min_length=32)

In [None]:
encoder_output = model.encoder(**inputs.to(model.device)) #decoder_input_ids=torch.tensor([[decoder_start_token_id]]).to(model.device))

In [None]:
unique_tasks = files_with_tasks_df['tasks'].str.split(',').explode().str.strip().unique()

In [None]:
unique_tasks 

In [None]:
encoder_output.last_hidden_state.shape

In [None]:
tokenizer.decode((-lm_output.logits).argsort()[:,:,:5][0,0].tolist())

# TODO: extract relevant information from config files

In [None]:
%%time
generated_doc_ids = get_predicted_path_summary(example_contents, max_length=64, min_length=16, max_label_length=32)

In [None]:
predicted_tasks = [d for d in generated_doc_ids]

In [None]:
true_tasks = [d for d in example_doc_idxs]

In [None]:
for p in zip(predicted_tasks, true_tasks, example_contents):
    print("#")
    print(p[2])
    print(p[0])
    print(p[1])