### Adding fasttext probs and sentiments

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import warnings
from sklearn.model_selection import train_test_split
import json
import fasttext
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 200)

In [None]:
# Load the three input tables: the processed issues, their sentiment scores
# and their fastText class probabilities.
df, senti, ftprobs = map(
    pd.read_csv,
    (
        "data/allrepos_processed.csv",
        "data/allrepos_sentiments.csv",
        "data/allrepos_ftprobs.csv",
    ),
)

In [None]:
# Copy the fastText per-class probabilities onto the issue frame
# (pandas aligns the assigned Series with df on the index).
ft_column_sources = {
    "ft_bug": "__label__bug",
    "ft_feature": "__label__feature",
    "ft_other": "__label__other",
}
for target_column, source_column in ft_column_sources.items():
    df[target_column] = ftprobs[source_column]

In [None]:
# Each sentiment cell is a "first,second" string; split it into two columns.
# The resulting values are still strings at this point.
sentiment_layout = [
    ("title_sentistrenght", "title_sentistrenght_p", "title_sentistrenght_n"),
    ("body_sentistrenght", "body_sentistrenght_p", "body_sentistrenght_n"),
    ("title_textblob", "title_polarity", "title_subjectivity"),
    ("body_textblob", "body_polarity", "body_subjectivity"),
]
for source_column, first_column, second_column in sentiment_layout:
    pieces = senti[source_column].apply(lambda raw: raw.split(','))
    df[first_column] = pieces.apply(lambda parts: parts[0])
    df[second_column] = pieces.apply(lambda parts: parts[1])

In [None]:
# Derive shifted/absolute variants of the sentiment columns:
# absolute value of the second sentistrength component, and polarity + 1.
title_strength_n = df["title_sentistrenght_n"].astype(int)
body_strength_n = df["body_sentistrenght_n"].astype(int)
df["positive_title_sentistrenght_n"] = title_strength_n.abs()
df["positive_body_sentistrenght_n"] = body_strength_n.abs()

title_polarity = df["title_polarity"].astype(float)
body_polarity = df["body_polarity"].astype(float)
df["positive_title_polarity"] = title_polarity + 1
df["positive_body_polarity"] = body_polarity + 1

### Applying some other preprocessing steps and adding some columns

In [None]:
# Keep only the rows that have a closer login.
df = df[df["closer_login"].notna()]

In [None]:
def compute_time_interval(t1, t2):
    """Return the ``days`` component of the interval from ``t1`` to ``t2``.

    Both arguments are timestamp strings in ``%Y-%m-%dT%H:%M:%SZ`` format.
    The result is ``timedelta.days`` of ``t2 - t1``, so it is negative when
    ``t2`` precedes ``t1``.

    Raises:
        ValueError: if either string does not match the expected format.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    start, end = (
        datetime.strptime(stamp, timestamp_format) for stamp in (t1, t2)
    )
    return (end - start).days

In [None]:
# Largest created_at value in the data; used below as the reference point
# for computing account ages.
base_time = df["created_at"].max()

In [None]:
def _account_age_years(created_at):
    """Days between created_at and base_time, divided by 365 and rounded."""
    return round(compute_time_interval(created_at, base_time) / 365)


df["author_account_age"] = df["author_created_at"].apply(_account_age_years)
df["closer_account_age"] = df["closer_created_at"].apply(_account_age_years)

In [None]:
# Assignee / milestone indicators.
# has_assignee is boolean; has_milestone is 0/1; num_of_assignees counts the
# "|"-separated entries of the assignees string (0 when it is missing).
df["has_assignee"] = df["assignee"].notna()
df["num_of_assignees"] = df["assignees"].apply(
    lambda raw: len(raw.split("|")) if not pd.isna(raw) else 0
)
df['has_milestone'] = df['milestone'].apply(
    lambda value: 1 if not pd.isna(value) else 0
)

In [None]:
def _parse_count(value):
    """Convert a count that may contain thousands separators to int."""
    return int(str(value).replace(',', ''))


for count_column in ("author_github_cntrb", "closer_github_cntrb"):
    df[count_column] = df[count_column].apply(_parse_count)

In [None]:
# True where the issue was closed by the same login that opened it.
df["same_author_closer"] = df["author_login"] == df["closer_login"]

In [None]:
# Encode author_association as an integer; any value not listed here maps to 4.
association_codes = {"NONE": 0, "CONTRIBUTOR": 1, "MEMBER": 2, "OWNER": 3}
df['numeric_association'] = df['author_association'].apply(
    lambda association: association_codes.get(association, 4)
)

# Alternative encodings, kept for reference (currently disabled):
# df['numeric_association'] = df['numeric_association'].apply(lambda col:pd.Categorical(col).codes)

# dummies = pd.get_dummies(df["author_association"],prefix="association")
# df = pd.concat([df,dummies], axis =1)

In [None]:
def _max_prob(row):
    """Largest of the three fastText class probabilities in this row."""
    return max(row.ft_bug, row.ft_feature, row.ft_other)


def _issue_type(row):
    """2 if bug has the top probability, else 1 if feature does, else 0."""
    if row.ft_bug == row.max_prob:
        return 2
    if row.ft_feature == row.max_prob:
        return 1
    return 0


df["max_prob"] = df.apply(_max_prob, axis=1)
df["ft_issue_type"] = df.apply(_issue_type, axis=1)

In [None]:
# Normalise the label strings and derive the label-based helper columns.
# astype(str) also turns missing labels into the literal string "nan".
df["labels"] = df["labels"].astype(str)
# NOTE(review): this counts "|" separators, so a row with a single label and
# no separator gets 0 — confirm that this is the intended definition.
df["num_labels"] = df["labels"].apply(lambda x: x.count("|"))
df["lower_labels"] = df["labels"].str.lower()

# Drop issues whose labels mention "duplicate".
# The mask is kept in a local variable: assigning it as `df.isduplicate = ...`
# does not create a column, it only sets a Python attribute on the DataFrame
# object (pandas warns about this, but warnings are silenced in this notebook).
is_duplicate = df["lower_labels"].str.contains("duplicate", regex=False)
df = df[~is_duplicate]

In [None]:
# Each column of labels_clusters.csv is one label category; its non-empty
# cells are the label strings belonging to that category.
label_cats_df = pd.read_csv("labels_clusters.csv")
label_cats = label_cats_df.columns.tolist()

In [None]:
def has_cat(x):
    """Return 1 if any "|"-separated label in ``x`` is in ``cat_labels``, else 0.

    ``cat_labels`` is read from the enclosing module scope at call time; it
    must be set to the labels of the category being tested before calling.
    """
    issue_labels = set(x.split("|"))
    return 0 if issue_labels.isdisjoint(cat_labels) else 1

In [None]:
# Add one 0/1 column per label category and print its distribution.
for label_cat in label_cats:
    # has_cat() reads this module-level name, so it must be rebound
    # before each apply.
    cat_labels = label_cats_df[label_cat].dropna().tolist()
    df[label_cat] = df["lower_labels"].apply(has_cat)
    print(label_cat)
    print(df[label_cat].value_counts())

In [None]:
# Run this section once to generate the JSON files; afterwards it can be
# commented out.

repos_df = pd.read_excel("p0to5_labels_alllangs.xlsx")
repos_class_map = {}

for _, row in repos_df.iterrows():
    # Strip the first 29 characters of the repo string.
    # NOTE(review): presumably a fixed URL prefix — confirm against the sheet.
    repo = row["repo"][29:]
    # Every character after the first one in a class cell becomes a "p<char>"
    # key mapped to that class name.
    repos_class_map[repo] = {
        "p" + char: cat
        for cat in ["class1", "class2"]
        for char in row[cat][1:]
    }

with open("repos_class_map.json", "w") as f:
    json.dump(repos_class_map, f, indent=4)

repos = list(repos_class_map)

with open("2class_repo_addresses.json", "w") as f:
    json.dump(repos, f, indent=4)

with open("2class_repo_names.json", "w") as f:
    json.dump([repo.split('/')[1] for repo in repos], f, indent=4)

In [None]:
with open("repos_class_map.json") as f:
    repos_class_map = json.load(f)


def _two_class_label(row):
    """Look up the 2-class label for this row; NaN if its repo is not mapped."""
    if row.repo not in repos_class_map:
        return np.nan
    return repos_class_map[row.repo][row.actual_label_cat]


df["repo_label_2class"] = df.apply(_two_class_label, axis=1)

In [None]:
# Count rows with (False) and without (True) a two-class repo label.
df["repo_label_2class"].isna().value_counts()

### Removing non english issues

In [None]:
# Predict a language label for every title and body with the fastText
# language-identification model.
model = fasttext.load_model('../classification/data/lid.176.bin')

df["title"] = df["title"].astype(str)
df["body"] = df["body"].astype(str)
# Newlines are replaced with spaces before the text is handed to predict().
df["title_ft"] = df["title"].str.replace('\n', ' ', regex=False)
df["body_ft"] = df["body"].str.replace('\n', ' ', regex=False)

# predict() returns a tuple; only its first element (the labels) is used.
title_langs = model.predict(df["title_ft"].tolist())[0]
body_langs = model.predict(df["body_ft"].tolist())[0]

# Keep the first label of each prediction.
df["title_lang"] = [labels[0] for labels in title_langs]
df["body_lang"] = [labels[0] for labels in body_langs]
# Empty the raw prediction lists now that the columns have been built.
title_langs.clear()
body_langs.clear()

In [None]:
# Report how many rows have an English title, an English body, and both.
title_is_english = df["title_lang"] == "__label__en"
body_is_english = df["body_lang"] == "__label__en"
print(df[title_is_english].shape)
print(df[body_is_english].shape)
print(df[title_is_english & body_is_english].shape)

In [None]:
# Show how many issue titles were assigned each predicted language label.
df["title_lang"].value_counts()

In [None]:
# Keep only issues whose title and body were both labelled English, then
# discard the temporary language-detection columns.
english_only = (df["title_lang"] == "__label__en") & (df["body_lang"] == "__label__en")
df = df[english_only].drop(columns=["title_lang", "body_lang", "title_ft", "body_ft"])

### saving the dataframes

In [None]:
def _split_and_tag(frame):
    """Split ``frame`` 80/20 and return it with a ``test_tag`` column.

    The returned frame holds the train rows (test_tag=0) followed by the test
    rows (test_tag=1), with a fresh 0..n-1 index. ``assign`` builds new frames,
    so nothing is written into the objects returned by train_test_split.
    """
    train_part, test_part = train_test_split(
        frame, test_size=0.2, random_state=42, shuffle=True
    )
    return pd.concat(
        [train_part.assign(test_tag=0), test_part.assign(test_tag=1)],
        ignore_index=True,
    )


# Per-repository tagged frames and median reaction times, keyed by repo name.
dataframes = {}
reaction_time_med = {}

with open("p_repo_addresses.json") as f:
    repo_addresses = json.load(f)

with open("p_repo_names.json") as f:
    repos = json.load(f)

# repos[i] is the short name that belongs to repo_addresses[i].
for i, repo in enumerate(repos):
    repo_df = _split_and_tag(df[df.repo == repo_addresses[i]])
    dataframes[repo] = repo_df
    reaction_time_med[repo] = repo_df.reaction_time.median()
    print(repo, reaction_time_med[repo])
    repo_df.to_csv(f"data/{repo}.csv", index=False)

# The same treatment for all repositories pooled together.
repo = 'cross_repo'
reaction_time_med[repo] = df.reaction_time.median()
repo_df = _split_and_tag(df)
repo_df.to_csv(f"data/{repo}.csv", index=False)
dataframes[repo] = repo_df
repos.append(repo)

### Normalization

In [None]:
# Names of the non-text feature columns that the normalization loop below
# rescales with MinMaxScaler. Some are created earlier in this file (e.g.
# num_labels, the ft_* probabilities, the sentiment and account-age columns);
# the others are presumably already present in the processed input CSV.
nontext_columns = [    
    'num_labels',
    'is_pull_request',
    'title_len',
    'body_len',
    'num_comments',
    'num_events',
    'author_followers',
    'closer_followers',
    'author_following',
    'closer_following',
    'author_public_repos',
    'closer_public_repos',
    'author_public_gists',
    'closer_public_gists',
    'author_core_team',
    'author_has_association',
    'author_issue_counts',
    'commits_count',
    'has_commit',
    'cm_developers_number',
    'cm_developers_ratio',
    'cm_developers_unique',
    'cm_authors_unique',
    'cm_developers_ratio_unique',
    'cm_mean_len',
    'time_to_discuss',
    'author_github_cntrb',
    'closer_github_cntrb',
    'author_repo_cntrb',
    'closer_repo_cntrb',
    'title_words_num',
    'body_words_num',   
    'title_alpha_len',
    'title_alphabet_ratio',
    'body_alpha_len',
    'body_alphabet_ratio',
    'body_processed_len',
    'title_processed_len',
    'title_processed_words_num',
    'body_processed_words_num',
    'num_of_sharps',
    'num_of_at',
    'num_of_qmark',
    'num_of_codesnippets',
    'num_of_functions',
    'num_of_issues',
    'num_of_paths',
    'num_of_dates',
    'num_of_times',
    'num_of_urls',
    'num_of_emails',
    'num_of_obligations',
    'has_email',
    'has_code',    
    # fastText-derived features (added earlier in this file)
    'ft_bug',
    'ft_feature',
    'ft_other',
    'max_prob',
    'ft_issue_type',
    # sentiment-derived features (added earlier in this file)
    'title_sentistrenght_p',
    'body_sentistrenght_p',
    'title_subjectivity',
    'body_subjectivity',
    'positive_body_sentistrenght_n',
    'positive_title_sentistrenght_n',
    'positive_title_polarity',
    'positive_body_polarity',
    # account, assignee, milestone and association features (added earlier in this file)
    'author_account_age',
    'closer_account_age',
    'has_assignee',
    'num_of_assignees',
    'has_milestone',
    'numeric_association'
]

In [None]:
# For every repository frame: min-max scale the non-text features, add the
# median-based priority label, and write the result to data/<repo>_norm.csv.
# NOTE: this loop rebinds the notebook-level names df, train and test.
for repo in repos:
    print(repo)
    df = dataframes[repo]
    # Explicit copies: the scaled values are written into independent frames
    # instead of into boolean-filtered slices of df (chained assignment).
    train = df[df.test_tag == 0].copy()
    test = df[df.test_tag == 1].copy()

    # The scaler is fitted on the train rows only and then applied to the
    # test rows.
    min_max_scaler = MinMaxScaler()
    train[nontext_columns] = min_max_scaler.fit_transform(train[nontext_columns])
    test[nontext_columns] = min_max_scaler.transform(test[nontext_columns])
    df = pd.concat([train, test], ignore_index=True)

    # priority_med: 2 when reaction_time <= the repo median, 1 when above it,
    # 0 when neither comparison holds (e.g. a NaN reaction_time).
    median_reaction_time = reaction_time_med[repo]
    df["priority_med"] = df.reaction_time.apply(
        lambda x: 2 if x <= median_reaction_time else 1 if x > median_reaction_time else 0
    )

    df.to_csv(f"data/{repo}_norm.csv", index=False)