In [20]:
import os
import json
import numpy as np
import pandas as pd

In [3]:
# Directories holding one JSON metadata file per package.
project_path = '../../data/libraries/project'
dependencies_path = '../../data/libraries/dependencies'

# Fields copied verbatim from each project JSON file. Passing this list to the
# DataFrame constructor also guarantees the columns exist when no file is found,
# so a later merge on 'platform'/'name' does not fail on a column-less frame.
project_columns = [
    'platform',
    'name',
    'contributions_count',
    'dependent_repos_count',
    'dependents_count',
    'forks',
    'rank',
    'stars',
]

# Kept as an empty frame because the dependencies-loading cell appends to it.
dependencies_df = pd.DataFrame()

# Collect one record per project file and build the frame once; concatenating
# inside the loop re-copies the whole frame on every iteration.
project_records = []
# Sorted so the row order does not depend on the filesystem's listing order.
for file in sorted(os.listdir(project_path)):
    if not file.endswith('.json'):
        continue
    # JSON is read as UTF-8 explicitly instead of the platform default encoding.
    with open(os.path.join(project_path, file), 'r', encoding='utf-8') as f:
        data = json.load(f)
    # A field missing from the file raises KeyError, exactly as before.
    project_records.append({column: data[column] for column in project_columns})

project_df = pd.DataFrame(project_records, columns=project_columns)



In [4]:
# Read all JSON files from the dependencies directory: one record per package
# with its platform, name and the number of entries in its 'dependencies' list.
dependency_columns = ['platform', 'name', 'dependencies_count']

# The records are rebuilt from scratch on every run, so re-executing this cell
# does not append a second copy of every row to dependencies_df.
dependency_records = []
# Sorted so the row order does not depend on the filesystem's listing order.
for file in sorted(os.listdir(dependencies_path)):
    if not file.endswith('.json'):
        continue
    # JSON is read as UTF-8 explicitly instead of the platform default encoding.
    with open(os.path.join(dependencies_path, file), 'r', encoding='utf-8') as f:
        data = json.load(f)
    dependency_records.append({
        'platform': data['platform'],
        'name': data['name'],
        # A file without a 'dependencies' key counts as zero dependencies.
        'dependencies_count': len(data.get('dependencies', [])),
    })

# Built once from the collected records; the explicit column list keeps the
# merge keys present even when the directory contains no JSON files.
dependencies_df = pd.DataFrame(dependency_records, columns=dependency_columns)

# Display the first few rows of both dataframes
print("Project DataFrame:")
print(project_df.head())
print("\nDependencies DataFrame:")
print(dependencies_df.head())

Project DataFrame:
  platform                                           name  \
0      NPM  @scandipwa/scandipwa-development-toolkit-core   
1      NPM                    react-native-usb-serialport   
2      NPM         @stdlib/stats-base-dists-binomial-mean   
3      NPM                              @arcteryx/js-i18n   
4     Pypi                                     ae-literal   

   contributions_count  dependent_repos_count  dependents_count  forks  rank  \
0                    0                      0                 4      0     7   
1                    0                      1                 3     24     9   
2                    1                      7                 8      0    11   
3                    0                      0                 0      0     6   
4                    0                      1                 5      0     9   

   stars  
0      0  
1     26  
2      1  
3      0  
4      0  

Dependencies DataFrame:
  platform               name  dependencie

In [5]:
# Inner-join the project metadata with the dependency counts: only packages
# present in both frames (same platform and name) are kept.
merge_keys = ['platform', 'name']
merged_df = project_df.merge(dependencies_df, how='inner', on=merge_keys)

# Preview the joined frame to check the merge
print(merged_df.head())

  platform                                           name  \
0      NPM  @scandipwa/scandipwa-development-toolkit-core   
1      NPM                    react-native-usb-serialport   
2      NPM         @stdlib/stats-base-dists-binomial-mean   
3      NPM                              @arcteryx/js-i18n   
4     Pypi                                     ae-literal   

   contributions_count  dependent_repos_count  dependents_count  forks  rank  \
0                    0                      0                 4      0     7   
1                    0                      1                 3     24     9   
2                    1                      7                 8      0    11   
3                    0                      0                 0      0     6   
4                    0                      1                 5      0     9   

   stars  dependencies_count  
0      0                  24  
1     26                   2  
2      1                   3  
3      0                   0

In [6]:
# Load the MTTU and MTTR tables from their CSV files
mttu_df, mttr_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/mttu/mttu.csv', '../../data/mttr/mttr.csv')
)

# Preview the first rows of each table under its heading
for heading, table in (("MTTU DataFrame:", mttu_df), ("\nMTTR DataFrame:", mttr_df)):
    print(heading)
    print(table.head())

MTTU DataFrame:
  system_name from_package_name  total_duration  out_of_date_duration  \
0       CARGO               a10             397                     0   
1       CARGO           aarch64             481                   325   
2       CARGO      aardvark-dns             618                     0   
3       CARGO            ab-av1             582                   100   
4       CARGO          ab_glyph            3084                    78   

      ratio  avg_total_duration  avg_out_of_date_duration  avg_ratio  
0  0.000000               397.0                       0.0   0.000000  
1  0.675676               481.0                     325.0   0.675676  
2  0.000000               618.0                       0.0   0.000000  
3  0.171821               582.0                     100.0   0.171821  
4  0.025292              1542.0                      39.0   0.025292  

MTTR DataFrame:
  system_name from_package_name  total_duration  total_post_fix_exposure_time  \
0       CARGO        

In [9]:
# Align merged_df with the MTTU/MTTR tables: use their key column names and
# upper-case the platform identifier (e.g. 'Pypi' becomes 'PYPI').
merged_df = merged_df.rename(columns={'platform': 'system_name', 'name': 'from_package_name'})
merged_df['system_name'] = merged_df['system_name'].str.upper()

# Expose the averaged metrics under the short names 'mttu' and 'mttr'
mttu_df = mttu_df.rename(columns={'avg_out_of_date_duration': 'mttu'})
mttr_df = mttr_df.rename(columns={'avg_post_fix_exposure_time': 'mttr'})

# Attach each metric to the package features; the inner join keeps only
# packages that appear in both tables.
key_columns = ['system_name', 'from_package_name']
final_df_with_mttu = merged_df.merge(mttu_df[key_columns + ['mttu']], on=key_columns, how='inner')
final_df_with_mttr = merged_df.merge(mttr_df[key_columns + ['mttr']], on=key_columns, how='inner')

# Preview the MTTU frame and report the row counts of both results
print(final_df_with_mttu.head())
print(len(final_df_with_mttu), len(final_df_with_mttr))

  system_name                              from_package_name  \
0         NPM  @scandipwa/scandipwa-development-toolkit-core   
1         NPM                    react-native-usb-serialport   
2         NPM         @stdlib/stats-base-dists-binomial-mean   
3         NPM                              @arcteryx/js-i18n   
4        PYPI                                     ae-literal   

   contributions_count  dependent_repos_count  dependents_count  forks  rank  \
0                    0                      0                 4      0     7   
1                    0                      1                 3     24     9   
2                    1                      7                 8      0    11   
3                    0                      0                 0      0     6   
4                    0                      1                 5      0     9   

   stars  dependencies_count        mttu  
0      0                  24  588.954545  
1     26                   2  629.000000  
2    

In [21]:
# List of columns to test correlation with mttu
columns_to_test = ['contributions_count', 'dependent_repos_count', 'dependents_count', 'forks', 'rank', 'stars', 'dependencies_count', 'mttu']

# Correlation coefficients to compute, in display order
correlation_methods = ('spearman', 'pearson', 'kendall')

# DataFrame.corr builds the full pairwise matrix for one method in a single
# call (excluding missing values pair by pair), which replaces the manual
# double loop that issued three Series.corr calls for every (i, j) cell.
mttu_features = final_df_with_mttu[columns_to_test]
correlation_frames = {method: mttu_features.corr(method=method) for method in correlation_methods}

# Same name and layout as before: method name -> square ndarray ordered like columns_to_test
correlation_matrices = {method: frame.to_numpy() for method, frame in correlation_frames.items()}

# Labelled versions of the matrices for display
spearman_df = correlation_frames['spearman']
pearson_df = correlation_frames['pearson']
kendall_df = correlation_frames['kendall']

print("Spearman Correlation Matrix:")
display(spearman_df)
print("\nPearson Correlation Matrix:")
display(pearson_df)
print("\nKendall Correlation Matrix:")
display(kendall_df)

Spearman Correlation Matrix:


Unnamed: 0,contributions_count,dependent_repos_count,dependents_count,forks,rank,stars,dependencies_count,mttu
contributions_count,1.0,0.422183,0.333022,0.880195,0.816141,0.867372,-0.032715,-0.08063
dependent_repos_count,0.422183,1.0,0.826344,0.392798,0.654156,0.406826,0.097864,-0.036593
dependents_count,0.333022,0.826344,1.0,0.315604,0.629508,0.32576,0.130444,0.012382
forks,0.880195,0.392798,0.315604,1.0,0.783986,0.929283,-0.054309,-0.096607
rank,0.816141,0.654156,0.629508,0.783986,1.0,0.824636,-0.002752,-0.167455
stars,0.867372,0.406826,0.32576,0.929283,0.824636,1.0,-0.056667,-0.129301
dependencies_count,-0.032715,0.097864,0.130444,-0.054309,-0.002752,-0.056667,1.0,0.307006
mttu,-0.08063,-0.036593,0.012382,-0.096607,-0.167455,-0.129301,0.307006,1.0



Pearson Correlation Matrix:


Unnamed: 0,contributions_count,dependent_repos_count,dependents_count,forks,rank,stars,dependencies_count,mttu
contributions_count,1.0,0.037648,0.042115,0.864879,0.401677,0.802685,-0.075939,-0.078723
dependent_repos_count,0.037648,1.0,0.955022,0.032371,0.14063,0.07854,0.025165,0.000118
dependents_count,0.042115,0.955022,1.0,0.043097,0.140754,0.099028,0.023946,-0.001856
forks,0.864879,0.032371,0.043097,1.0,0.341531,0.880992,-0.100395,-0.076057
rank,0.401677,0.14063,0.140754,0.341531,1.0,0.451197,-0.006062,-0.150426
stars,0.802685,0.07854,0.099028,0.880992,0.451197,1.0,-0.069446,-0.075771
dependencies_count,-0.075939,0.025165,0.023946,-0.100395,-0.006062,-0.069446,1.0,0.100636
mttu,-0.078723,0.000118,-0.001856,-0.076057,-0.150426,-0.075771,0.100636,1.0



Kendall Correlation Matrix:


Unnamed: 0,contributions_count,dependent_repos_count,dependents_count,forks,rank,stars,dependencies_count,mttu
contributions_count,1.0,0.31651,0.240583,0.761893,0.661311,0.728262,-0.022745,-0.058088
dependent_repos_count,0.31651,1.0,0.736203,0.301432,0.518562,0.306868,0.071575,-0.028375
dependents_count,0.240583,0.736203,1.0,0.231721,0.484151,0.235117,0.092281,0.006186
forks,0.761893,0.301432,0.231721,1.0,0.633042,0.824964,-0.039431,-0.070858
rank,0.661311,0.518562,0.484151,0.633042,1.0,0.668946,-0.001621,-0.119591
stars,0.728262,0.306868,0.235117,0.824964,0.668946,1.0,-0.040598,-0.09336
dependencies_count,-0.022745,0.071575,0.092281,-0.039431,-0.001621,-0.040598,1.0,0.21804
mttu,-0.058088,-0.028375,0.006186,-0.070858,-0.119591,-0.09336,0.21804,1.0


In [19]:
# List of columns to test correlation with mttr
columns_to_test = ['contributions_count', 'dependent_repos_count', 'dependents_count', 'forks', 'rank', 'stars', 'dependencies_count', 'mttr']

# Correlation coefficients to compute, in display order
correlation_methods = ('spearman', 'pearson', 'kendall')

# DataFrame.corr builds the full pairwise matrix for one method in a single
# call (excluding missing values pair by pair), which replaces the manual
# double loop that issued three Series.corr calls for every (i, j) cell.
mttr_features = final_df_with_mttr[columns_to_test]
correlation_frames = {method: mttr_features.corr(method=method) for method in correlation_methods}

# Same name and layout as before: method name -> square ndarray ordered like columns_to_test
correlation_matrices = {method: frame.to_numpy() for method, frame in correlation_frames.items()}

# Labelled versions of the matrices for display
spearman_df = correlation_frames['spearman']
pearson_df = correlation_frames['pearson']
kendall_df = correlation_frames['kendall']

print("Spearman Correlation Matrix:")
display(spearman_df)
print("\nPearson Correlation Matrix:")
display(pearson_df)
print("\nKendall Correlation Matrix:")
display(kendall_df)

Spearman Correlation Matrix:


Unnamed: 0,contributions_count,dependent_repos_count,dependents_count,forks,rank,stars,dependencies_count,mttr
contributions_count,1.0,0.407538,0.288673,0.856183,0.842634,0.846599,0.056529,-0.226666
dependent_repos_count,0.407538,1.0,0.79608,0.383698,0.566637,0.360518,0.162565,-0.175643
dependents_count,0.288673,0.79608,1.0,0.276826,0.527289,0.2521,0.193369,-0.136298
forks,0.856183,0.383698,0.276826,1.0,0.806668,0.920216,0.044304,-0.211543
rank,0.842634,0.566637,0.527289,0.806668,1.0,0.827494,0.04861,-0.269086
stars,0.846599,0.360518,0.2521,0.920216,0.827494,1.0,0.027521,-0.218776
dependencies_count,0.056529,0.162565,0.193369,0.044304,0.04861,0.027521,1.0,0.01357
mttr,-0.226666,-0.175643,-0.136298,-0.211543,-0.269086,-0.218776,0.01357,1.0



Pearson Correlation Matrix:


Unnamed: 0,contributions_count,dependent_repos_count,dependents_count,forks,rank,stars,dependencies_count,mttr
contributions_count,1.0,0.072103,0.07831,0.724377,0.392282,0.646919,0.065613,-0.068925
dependent_repos_count,0.072103,1.0,0.971184,0.052792,0.109699,0.082994,0.03875,-0.016029
dependents_count,0.07831,0.971184,1.0,0.067869,0.108368,0.098199,0.041246,-0.016564
forks,0.724377,0.052792,0.067869,1.0,0.271408,0.813727,0.024181,-0.023074
rank,0.392282,0.109699,0.108368,0.271408,1.0,0.369442,0.064773,-0.221348
stars,0.646919,0.082994,0.098199,0.813727,0.369442,1.0,0.056337,-0.050688
dependencies_count,0.065613,0.03875,0.041246,0.024181,0.064773,0.056337,1.0,0.016707
mttr,-0.068925,-0.016029,-0.016564,-0.023074,-0.221348,-0.050688,0.016707,1.0



Kendall Correlation Matrix:


Unnamed: 0,contributions_count,dependent_repos_count,dependents_count,forks,rank,stars,dependencies_count,mttr
contributions_count,1.0,0.316513,0.214391,0.738372,0.693923,0.7104,0.042059,-0.160214
dependent_repos_count,0.316513,1.0,0.713029,0.304859,0.454272,0.282536,0.120165,-0.12865
dependents_count,0.214391,0.713029,1.0,0.209897,0.405795,0.187703,0.137781,-0.094875
forks,0.738372,0.304859,0.209897,1.0,0.663892,0.826041,0.032504,-0.15259
rank,0.693923,0.454272,0.405795,0.663892,1.0,0.680776,0.034709,-0.188549
stars,0.7104,0.282536,0.187703,0.826041,0.680776,1.0,0.020037,-0.155921
dependencies_count,0.042059,0.120165,0.137781,0.032504,0.034709,0.020037,1.0,0.009129
mttr,-0.160214,-0.12865,-0.094875,-0.15259,-0.188549,-0.155921,0.009129,1.0


In [15]:
# Load the per-package dependency status table
dep_status_df = pd.read_csv('../../data/dep_status/dep_status.csv')

# Attach the dependency status to the MTTU and MTTR frames; the inner join
# keeps only packages that also appear in dep_status_df.
join_keys = ['system_name', 'from_package_name']
merged_with_mttu = final_df_with_mttu.merge(dep_status_df, on=join_keys, how='inner')
merged_with_mttr = final_df_with_mttr.merge(dep_status_df, on=join_keys, how='inner')

# Integer code for each 'dependency_status' label; labels not listed here map to NaN.
# NOTE(review): Spearman treats these codes as an ordering, which places 'mixed'
# above 'all_pinned' -- confirm that ordering is intended for this analysis.
dependency_status_mapping = {
    'all_floating': 0,
    'all_pinned': 1,
    'mixed': 2
}
for status_frame in (merged_with_mttu, merged_with_mttr):
    status_frame['dependency_status_cat'] = status_frame['dependency_status'].map(dependency_status_mapping)

# Spearman rank correlation between the status code and each metric
mttu_correlation = merged_with_mttu['dependency_status_cat'].corr(merged_with_mttu['mttu'], method='spearman')
mttr_correlation = merged_with_mttr['dependency_status_cat'].corr(merged_with_mttr['mttr'], method='spearman')

# Report both coefficients
print("Correlation between 'dependency_status' and 'mttu':", mttu_correlation)
print("Correlation between 'dependency_status' and 'mttr':", mttr_correlation)

Correlation between 'dependency_status' and 'mttu': 0.15352493526426683
Correlation between 'dependency_status' and 'mttr': -0.08181994147321954
