### 1- Adding fasttext probs and sentiments

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [2]:
# Load the processed issues plus the sentiment and fastText-probability tables.
# NOTE(review): later cells copy columns across these frames by index, so the
# three files are presumably row-aligned — confirm.
df, senti, ftprobs = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/repos/3/allrepos_processed.csv",
        "../data/repos/3/allrepos_sentiments.csv",
        "../data/repos/3/allrepos_ftprobs.csv",
    )
)

In [3]:
# Copy the three fastText label columns into df under shorter names
# (target column -> source column in ftprobs).
ft_columns = {
    "ft_bug": "__label__bug",
    "ft_feature": "__label__feature",
    "ft_other": "__label__other",
}
for target_col, label_col in ft_columns.items():
    df[target_col] = ftprobs[label_col]

In [4]:
# Every sentiment column in `senti` stores a "first,second" string. Split each
# one into two df columns (values stay strings; they are cast where needed later).
# Rows are (source column, name for first part, name for second part).
senti_parts = [
    ("title_sentistrenght", "title_sentistrenght_p", "title_sentistrenght_n"),
    ("body_sentistrenght", "body_sentistrenght_p", "body_sentistrenght_n"),
    ("title_textblob", "title_polarity", "title_subjectivity"),
    ("body_textblob", "body_polarity", "body_subjectivity"),
]
for source_col, first_col, second_col in senti_parts:
    df[first_col] = senti[source_col].apply(lambda pair: pair.split(',')[0])
    df[second_col] = senti[source_col].apply(lambda pair: pair.split(',')[1])

In [5]:
# Build non-negative variants of the sentiment scores.
# Negative strength: keep only the magnitude of the integer score.
negative_strength_cols = {
    "positive_title_sentistrenght_n": "title_sentistrenght_n",
    "positive_body_sentistrenght_n": "body_sentistrenght_n",
}
for shifted_col, raw_col in negative_strength_cols.items():
    df[shifted_col] = df[raw_col].astype(int).abs()

# Polarity: shift by +1 (presumably maps a [-1, 1] polarity onto [0, 2] — verify).
polarity_cols = {
    "positive_title_polarity": "title_polarity",
    "positive_body_polarity": "body_polarity",
}
for shifted_col, raw_col in polarity_cols.items():
    df[shifted_col] = df[raw_col].astype(float) + 1

### 2- Applying some other preprocessing and adding some columns

In [6]:
# Keep only the issues that have a recorded closer.
# `.copy()` turns the filtered result into an independent frame: the following
# cells add new columns to `df`, which on a bare boolean-mask slice raises
# pandas' SettingWithCopyWarning (currently hidden by warnings.filterwarnings).
df = df[df.closer_login.notna()].copy()

In [7]:
def compute_time_interval(t1, t2):
    """Return the number of whole days from ``t1`` to ``t2``.

    Both arguments are timestamp strings in the ``%Y-%m-%dT%H:%M:%SZ`` layout
    (e.g. ``"2020-01-31T12:00:00Z"``). The result is the ``days`` component of
    ``t2 - t1``, so it is negative when ``t2`` is earlier than ``t1``.

    Raises:
        ValueError: if either string does not match the expected layout.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    start, end = (datetime.strptime(stamp, timestamp_format) for stamp in (t1, t2))
    return (end - start).days

In [8]:
# Reference point for the account-age features: the largest `created_at` value in df.
base_time = df["created_at"].max()

In [9]:
# Account age in (rounded) years, measured from account creation up to `base_time`.
# Pairs are (new column, column holding the account creation timestamp).
account_age_cols = (
    ("author_account_age", "author_created_at"),
    ("closer_account_age", "closer_created_at"),
)
for age_col, created_col in account_age_cols:
    df[age_col] = df[created_col].apply(
        lambda created: round(compute_time_interval(created, base_time) / 365)
    )

In [10]:
# Indicators derived from the assignee / milestone columns.
# has_assignee: boolean, True where `assignee` is present.
df["has_assignee"] = df["assignee"].notna()
# num_of_assignees: count of "|"-separated entries in `assignees`, 0 when missing.
df["num_of_assignees"] = df["assignees"].apply(
    lambda cell: len(cell.split("|")) if not pd.isna(cell) else 0
)
# has_milestone: 1 where `milestone` is present, otherwise 0.
df['has_milestone'] = df['milestone'].apply(lambda cell: int(not pd.isna(cell)))

In [11]:
# Contribution counts may carry thousands separators (e.g. "1,234");
# strip the commas and store plain integers.
for cntrb_col in ("author_github_cntrb", "closer_github_cntrb"):
    df[cntrb_col] = df[cntrb_col].apply(lambda raw: int(str(raw).replace(',', '')))

In [12]:
# Encode the author's association with the repository as an integer code.
# Any value not listed below (including missing values) is encoded as 4.
association_codes = {"NONE": 0, "CONTRIBUTOR": 1, "MEMBER": 2, "OWNER": 3}
df['numeric_association'] = df['author_association'].apply(
    lambda association: association_codes.get(association, 4)
)

In [None]:
# print(df[(df.repository == "spring-framework") & (df.author_login == "spring-issuemaster")].shape)
# print(df[(df.repository == "spring-framework") & (df.author_login != "spring-issuemaster")].shape)

In [None]:
# print(df[(df.repository == "spring-framework") & (df.closer_login == "spring-issuemaster")].shape)
# print(df[(df.repository == "spring-framework") & (df.closer_login != "spring-issuemaster")].shape)

In [None]:
# print(df[(df.repository == "spring-framework") & (df.author_login == "spring-issuemaster") & (df.closer_login == "spring-issuemaster")].shape)

In [None]:
# df["author_creation_day_before_issue"] = df.apply(lambda x: np.nan if x.author_login == "spring-issuemaster" else compute_time_interval(x.author_created_at, x.created_at) ,axis=1)
# df["tyro_author"] = df["author_creation_day_before_issue"].apply(lambda x: 1 if x<5 else 0)

In [None]:
'''
spring-framework has a spring-issuemaster author that owns 17005 of its 19719 issues in total!!
This user also closed 16900 issues of spring-framework and 230 issues from spring-boot.
spring-issuemaster doesn't have any repos or stars, so considering author info for the spring-framework repo is meaningless.
'''

In [13]:
# How much of guava was opened and/or closed by the "gissuebot" account?
# Printed shapes, in order: all guava rows, bot-authored, bot-closed, both.
is_guava = df.repository == "guava"
gissuebot_authored = df.author_login == "gissuebot"
gissuebot_closed = df.closer_login == "gissuebot"
for selection in (
    is_guava,
    is_guava & gissuebot_authored,
    is_guava & gissuebot_closed,
    is_guava & gissuebot_authored & gissuebot_closed,
):
    print(df[selection].shape)

(3132, 112)
(1609, 112)
(1344, 112)
(1344, 112)


In [14]:
# How much of RxJava was opened and/or closed by "dependabot-preview[bot]"?
# Printed shapes, in order: all RxJava rows, bot-authored, bot-closed, both.
is_rxjava = df.repository == "RxJava"
preview_bot_authored = df.author_login == "dependabot-preview[bot]"
preview_bot_closed = df.closer_login == "dependabot-preview[bot]"
for selection in (
    is_rxjava,
    is_rxjava & preview_bot_authored,
    is_rxjava & preview_bot_closed,
    is_rxjava & preview_bot_authored & preview_bot_closed,
):
    print(df[selection].shape)

(6084, 112)
(50, 112)
(1, 112)
(1, 112)


In [15]:
# How much of retrofit was opened and/or closed by "dependabot[bot]"?
# Printed shapes, in order: all retrofit rows, bot-authored, bot-closed, both.
is_retrofit = df.repository == "retrofit"
dependabot_authored = df.author_login == "dependabot[bot]"
dependabot_closed = df.closer_login == "dependabot[bot]"
for selection in (
    is_retrofit,
    is_retrofit & dependabot_authored,
    is_retrofit & dependabot_closed,
    is_retrofit & dependabot_authored & dependabot_closed,
):
    print(df[selection].shape)

(2960, 112)
(2, 112)
(0, 112)
(0, 112)


In [None]:
# df.columns[df.isna().any()].tolist()

In [16]:
# Shape of the subset of rows whose reaction_time is missing.
df.loc[df["reaction_time"].isna()].shape

(33582, 112)

In [17]:
# Overall (rows, columns) shape of df at this point in the notebook.
df.shape

(101552, 112)

In [18]:
# Number of rows per repository.
df.repository.value_counts()

elasticsearch       44099
spring-framework    19717
spring-boot         15254
okhttp              10306
RxJava               6084
guava                3132
retrofit             2960
Name: repository, dtype: int64

In [19]:
# Single repositories and the cross-repository combinations built later.
# NOTE: the "corss" spelling is kept on purpose — these strings are used as
# dictionary keys and as output file names elsewhere in the notebook.
base_repos = [
    'elasticsearch',
    'spring-framework',
    'spring-boot',
    'okhttp',
    'RxJava',
    'guava',
    'retrofit',
]
corss_repos = [
    "corss_7",
    "cross_without_sf",
    "cross_without_bot",
    "cross_without_sf_el",
    "cross_without_bot_el",
]
repos = [*base_repos, *corss_repos]

# Filled in by the following cells, keyed by the names listed above.
dataframes = {}          # name -> DataFrame subset
reaction_time_med = {}   # name -> median reaction_time of that subset

In [None]:
# dataframes["elasticsearch"] =  df[df.repository == "elasticsearch"]
# dataframes["spring-boot"] =  df[(df.repository == "spring-boot") & (df.closer_login != "spring-issuemaster")]
# dataframes["spring-framework"] =  df[(df.repository == "spring-framework") & (df.author_login == "spring-issuemaster") & (df.closer_login == "spring-issuemaster")]

In [20]:
# For each single repository: keep its rows, record and print the median
# reaction_time, and write the subset to its own CSV file.
for repo in base_repos:
    repo_frame = df[df.repository == repo]
    repo_median = repo_frame.reaction_time.median()
    dataframes[repo] = repo_frame
    reaction_time_med[repo] = repo_median
    print(repo, repo_median)
    repo_frame.to_csv(f"../data/repos/final/{repo}.csv", index=False)

elasticsearch 1.9916666666666663
spring-framework 1346.7
spring-boot 46.25
okhttp 126.29166666666667
RxJava 13.316666666666665
guava 454.3333333333333
retrofit 168.63333333333333


In [22]:
# Cross-repository datasets: all rows, plus variants that leave out
# spring-framework, guava and/or elasticsearch.
not_spring_framework = df.repository != "spring-framework"
not_guava = df.repository != "guava"
not_elasticsearch = df.repository != "elasticsearch"

dataframes.update({
    "corss_7": df,
    "cross_without_sf": df[not_spring_framework],
    "cross_without_bot": df[not_spring_framework & not_guava],
    "cross_without_sf_el": df[not_spring_framework & not_elasticsearch],
    "cross_without_bot_el": df[not_spring_framework & not_guava & not_elasticsearch],
})

# Record and print each combination's median reaction_time, then save it.
for repo in corss_repos:
    cross_frame = dataframes[repo]
    cross_median = cross_frame.reaction_time.median()
    reaction_time_med[repo] = cross_median
    print(repo, cross_median)
    cross_frame.to_csv(f"../data/repos/final/{repo}.csv", index=False)

corss_7 54.05833333333334
cross_without_sf 14.0
cross_without_bot 12.15
cross_without_sf_el 66.61666666666666
cross_without_bot_el 55.15


### 3- Normalization

In [23]:
# Non-text feature columns that the normalization cell min-max scales.
# Every entry must be numeric (or castable to float) for the scaler to accept it.
nontext_columns = [    
    # Columns not created in this notebook — presumably already present in the
    # processed CSV; verify against the upstream preprocessing step.
    'is_pull_request',
    'title_len',
    'body_len',
    'num_comments',
    'num_events',
    'author_followers',
    'closer_followers',
    'author_following',
    'closer_following',
    'author_public_repos',
    'closer_public_repos',
    'author_public_gists',
    'closer_public_gists',
    'author_core_team',
    'author_has_association',
    'author_issue_counts',
    'commits_count',
    'has_commit',
    'cm_developers_number',
    'cm_developers_ratio',
    'cm_developers_unique',
    'cm_authors_unique',
    'cm_developers_ratio_unique',
    'cm_mean_len',
    'time_to_discuss',
    # The two *_github_cntrb columns are converted to plain integers in section 2.
    'author_github_cntrb',
    'closer_github_cntrb',
    'author_repo_cntrb',
    'closer_repo_cntrb',
    'title_words_num',
    'body_words_num',   
    'title_alpha_len',
    'title_alphabet_ratio',
    'body_alpha_len',
    'body_alphabet_ratio',
    'body_processed_len',
    'title_processed_len',
    'title_processed_words_num',
    'body_processed_words_num',
    'num_of_sharps',
    'num_of_at',
    'num_of_qmark',
    'num_of_codesnippets',
    'num_of_functions',
    'num_of_issues',
    'num_of_paths',
    'num_of_dates',
    'num_of_times',
    'num_of_urls',
    'num_of_emails',
    'num_of_obligations',
    'has_email',
    'has_code',
    # fastText label columns copied from ftprobs in section 1.
    'ft_bug',
    'ft_feature',
    'ft_other',    
    # Sentiment columns split out of the senti frame in section 1; the
    # `positive_*` entries are the non-negative variants built there.
    'title_sentistrenght_p',
    'body_sentistrenght_p',
    'title_subjectivity',
    'body_subjectivity',
    'positive_body_sentistrenght_n',
    'positive_title_sentistrenght_n',
    'positive_title_polarity',
    'positive_body_polarity',
    # Columns derived in section 2 of this notebook.
    'author_account_age',
    'closer_account_age',
    'has_assignee',
    'num_of_assignees',
    'has_milestone',
    'numeric_association'
]

In [24]:
def _priority_label(reaction_time, median):
    """Label a reaction time relative to a median.

    Returns 2 when ``reaction_time <= median``, 1 when ``reaction_time > median``
    and 0 when neither comparison holds (i.e. a NaN on either side).
    """
    if reaction_time <= median:
        return 2
    if reaction_time > median:
        return 1
    return 0


# For every dataset: split 80/20, min-max scale the non-text columns (scaler
# fitted on the training part only), label priorities and write `<name>_norm.csv`.
for repo in repos:
    print(repo)
    # Loop-local name: rebinding the notebook-level `df` here would make earlier
    # cells (e.g. the one storing `df` under "corss_7") pick up a normalized
    # subset if they are re-run after this cell.
    repo_df = dataframes[repo]
    repo_median = reaction_time_med[repo]

    train, test = train_test_split(repo_df, test_size=0.2, random_state=42, shuffle=True)
    # Explicit copies so the column assignments below act on independent frames
    # instead of slices (avoids SettingWithCopyWarning / chained assignment).
    train = train.copy()
    test = test.copy()
    train["test_tag"] = 0
    test["test_tag"] = 1

    # Scaling parameters come from the training rows only and are reused for test.
    min_max_scaler = MinMaxScaler()
    train[nontext_columns] = min_max_scaler.fit_transform(train[nontext_columns])
    test[nontext_columns] = min_max_scaler.transform(test[nontext_columns])
    normalized = pd.concat([train, test], ignore_index=True)

    # priority: relative to the median of this whole dataset.
    normalized["priority"] = normalized.reaction_time.apply(
        lambda value: _priority_label(value, repo_median)
    )
    if repo in corss_repos:
        # priority_per_repo: relative to the median of each row's own repository.
        normalized["priority_per_repo"] = normalized.apply(
            lambda row: _priority_label(row.reaction_time, reaction_time_med[row.repository]),
            axis=1,
        )

    normalized.to_csv(f"../data/repos/final/{repo}_norm.csv", index=False)

elasticsearch
spring-framework
spring-boot
okhttp
RxJava
guava
retrofit
corss_7
cross_without_sf
cross_without_bot
cross_without_sf_el
cross_without_bot_el
