In [11]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

from utils import helpers as hpr
from utils import constants
from utils import classifier_util as clas_util


### Dependent changes

In [2]:
# Load the dependency table; later cells read its Source/Target and
# Source_status/Target_status columns.
df_dependencies = pd.read_csv(
    osp.join(".", "Files", "Preliminary", "deps_ident.csv")
)

In [3]:
# Keep only the rows where neither endpoint has status "NEW".
df_dependencies = df_dependencies[
    df_dependencies["Source_status"].ne("NEW")
    & df_dependencies["Target_status"].ne("NEW")
]

In [4]:
# Build a set from the Source and Target columns.
# NOTE(review): hpr.flatten_list is a project helper; presumably it flattens the
# 2-D array into a flat list of change numbers - confirm.
dependent_changes = set(
    hpr.flatten_list(df_dependencies[["Source", "Target"]].to_numpy())
)

### Load changes

In [5]:
# Load the OpenStack changes through the project helper; per the cell output it
# reports progress on stdout while reading.
df_changes = hpr.combine_openstack_data()

Reading OpenStack changes...
OpenStack changes loaded successfully...


In [7]:
# Drop changes whose status is "NEW". The explicit .copy() yields an independent
# frame, so the column assignment below does not write into a slice of the
# loaded data (this is what triggers pandas' SettingWithCopyWarning).
df_changes = df_changes[df_changes["status"] != "NEW"].copy()
# Flag each change with 1 if its number is in dependent_changes, else 0.
# isin() performs the membership test vectorised instead of one Python call per row.
df_changes["is_dependent"] = df_changes["number"].isin(dependent_changes).astype(int)

In [8]:
# Read combined_output.csv and discard the owner_account_id and status columns
# in one chained expression (no in-place mutation).
combined_output = pd.read_csv(osp.join(".", "combined_output.csv")).drop(
    columns=["owner_account_id", "status"]
)

In [9]:
def calc_mod_file_dep_cha(row):
    """Return ``num_mod_file_dep_cha`` as a percentage of the row's changed files.

    ``row`` is indexed by key and must provide ``changed_files``; it must also
    provide ``num_mod_file_dep_cha`` whenever ``changed_files`` is a non-empty
    list. Any ``changed_files`` value that is not exactly a ``list`` (e.g. NaN
    or a string) is treated as "no files".

    Returns the percentage rounded to two decimals, or ``0`` when there are
    no files to divide by.
    """
    files = row["changed_files"]
    # Exact type check on purpose: only a plain list counts as a file list.
    n_files = len(files) if type(files) is list else 0
    if n_files == 0:
        return 0
    return round(100 * row['num_mod_file_dep_cha'] / n_files, 2)

In [10]:
# Build the feature table and left-join the combined_output columns on the
# change number; overlapping column names get the _source/_target suffixes.
df = clas_util.combine_features().merge(
    combined_output,
    on="number",
    how="left",
    suffixes=("_source", "_target"),
)

# Derived columns: share of modified files (row-wise helper) and the 1/0
# dependency flag based on membership in dependent_changes.
df["pctg_mod_file_dep_cha"] = df.apply(calc_mod_file_dep_cha, axis=1)
df["is_dependent"] = df["number"].map(lambda nbr: int(nbr in dependent_changes))

# Only these three columns are dropped. The following candidates are currently
# NOT dropped (they were commented out of the list):
#   cross_project_changes, cross_project_changes_owner, pctg_cross_project_changes_owner,
#   min_num_mod_file_dep_cha, max_num_mod_file_dep_cha, mean_num_mod_file_dep_cha,
#   median_num_mod_file_dep_cha, pctg_cross_project_changes, last_mth_cro_proj_nbr
df = df.drop(columns=["changed_files", "num_mod_file_dep_cha", "num_build_failures"])

# Replace every remaining missing value with 0.
df = df.fillna(0)
# df_changes is deliberately not deleted here: later cells still read it.

In [None]:
def mann_u_test(sample1, sample2, alpha=0.05):
    """Run a Mann-Whitney U test on two samples and report the outcome.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two samples, passed unchanged to ``scipy.stats.mannwhitneyu``
        with its default options.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    float
        The p-value of the test, so callers can store, compare or display it.
    """
    _, p_value = mannwhitneyu(sample1, sample2)
    if p_value <= alpha:
        print("Reject the null hypothesis: There is a statistically significant difference between the two samples.")
    else:
        print("Fail to reject the null hypothesis: There is no statistically significant difference between the two samples.")
    # Report the p-value in both cases so a non-significant result is documented too.
    print(p_value)
    return p_value

### Age of the project

In [None]:
# Compare project_age between rows flagged as dependent (1) and the rest (0).
sample1 = df[df["is_dependent"].eq(1)]["project_age"].tolist()
sample2 = df[df["is_dependent"].eq(0)]["project_age"].tolist()
mann_u_test(sample1, sample2)

### Number of changed files: dependent vs. non-dependent changes

In [None]:
# Compare files_count between dependent (1) and non-dependent (0) changes.
# No sorting is needed: the Mann-Whitney U test ranks the pooled values itself,
# so the order of the inputs does not affect the result.
sample1 = df_changes.loc[df_changes["is_dependent"] == 1, "files_count"].tolist()
sample2 = df_changes.loc[df_changes["is_dependent"] == 0, "files_count"].tolist()
# Perform the Mann-Whitney U test
_, p_value = mannwhitneyu(sample1, sample2)
# End the cell with the p-value so the result is actually displayed.
p_value

### The experience of developers with dependent changes

In [None]:
# Owner account ids that have at least one change flagged as dependent.
dev_with_dep = df_changes[df_changes["is_dependent"].eq(1)]["owner_account_id"].unique()

# Per-owner count of non-null change ids, for owners inside dev_with_dep ...
sample1 = (
    df_changes[df_changes["owner_account_id"].isin(dev_with_dep)]
    .groupby("owner_account_id")["id"]
    .count()
    .tolist()
)
# ... and for all remaining owners.
sample2 = (
    df_changes[~df_changes["owner_account_id"].isin(dev_with_dep)]
    .groupby("owner_account_id")["id"]
    .count()
    .tolist()
)
_, p_value = mannwhitneyu(sample1, sample2)
p_value