In [2]:
import os
import json
import pandas as pd

In [3]:
# Directories holding one JSON file per library.
project_path = '../../data/libraries/project'
dependencies_path = '../../data/libraries/dependencies'

# Fields copied verbatim from each project JSON file; they become the
# columns of project_df, in this order.
project_fields = [
    'platform',
    'name',
    'contributions_count',
    'dependent_repos_count',
    'dependents_count',
    'forks',
    'rank',
    'stars',
]

# Placeholder so dependencies_df exists before the dependencies cell
# populates it.
dependencies_df = pd.DataFrame()

# Collect one record per JSON file and build the frame once at the end;
# concatenating inside the loop re-copies every earlier row on each file.
project_records = []
# os.listdir returns names in arbitrary order, so sort for a reproducible
# row order across runs and machines.
for file in sorted(os.listdir(project_path)):
    if not file.endswith('.json'):
        continue
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(os.path.join(project_path, file), 'r', encoding='utf-8') as f:
        data = json.load(f)
    # A missing field raises KeyError, exactly as direct indexing did before.
    project_records.append({field: data[field] for field in project_fields})

# Passing `columns` keeps the expected columns even when no JSON file was
# found, so later merges on platform/name do not fail with a KeyError.
project_df = pd.DataFrame(project_records, columns=project_fields)



In [4]:
# Read all JSON files from the dependencies directory.
# Records are collected in a list and the frame is built once, from scratch:
# re-running this cell therefore cannot append a second copy of every row,
# and the loop no longer re-copies the whole frame for each file.
dependency_records = []
# os.listdir returns names in arbitrary order, so sort for reproducibility.
for file in sorted(os.listdir(dependencies_path)):
    if not file.endswith('.json'):
        continue
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(os.path.join(dependencies_path, file), 'r', encoding='utf-8') as f:
        data = json.load(f)
    dependency_records.append({
        'platform': data['platform'],
        'name': data['name'],
        # A missing or null 'dependencies' entry counts as zero dependencies.
        'dependencies_count': len(data.get('dependencies') or []),
    })

# Passing `columns` keeps the expected columns even when no file was found.
dependencies_df = pd.DataFrame(
    dependency_records,
    columns=['platform', 'name', 'dependencies_count'],
)

# Display the first few rows of both dataframes
print("Project DataFrame:")
print(project_df.head())
print("\nDependencies DataFrame:")
print(dependencies_df.head())

Project DataFrame:
  platform                                           name  \
0      NPM  @scandipwa/scandipwa-development-toolkit-core   
1      NPM                    react-native-usb-serialport   
2      NPM         @stdlib/stats-base-dists-binomial-mean   
3      NPM                              @arcteryx/js-i18n   
4     Pypi                                     ae-literal   

   contributions_count  dependent_repos_count  dependents_count  forks  rank  \
0                    0                      0                 4      0     7   
1                    0                      1                 3     24     9   
2                    1                      7                 8      0    11   
3                    0                      0                 0      0     6   
4                    0                      1                 5      0     9   

   stars  
0      0  
1     26  
2      1  
3      0  
4      0  

Dependencies DataFrame:
  platform               name  dependencie

In [5]:
# Inner join on the (platform, name) pair: only packages present in both
# project_df and dependencies_df survive, with the columns of both frames.
library_keys = ['platform', 'name']
merged_df = project_df.merge(dependencies_df, how='inner', on=library_keys)

# Preview the joined rows as a quick sanity check of the merge.
merged_preview = merged_df.head()
print(merged_preview)

  platform                                           name  \
0      NPM  @scandipwa/scandipwa-development-toolkit-core   
1      NPM                    react-native-usb-serialport   
2      NPM         @stdlib/stats-base-dists-binomial-mean   
3      NPM                              @arcteryx/js-i18n   
4     Pypi                                     ae-literal   

   contributions_count  dependent_repos_count  dependents_count  forks  rank  \
0                    0                      0                 4      0     7   
1                    0                      1                 3     24     9   
2                    1                      7                 8      0    11   
3                    0                      0                 0      0     6   
4                    0                      1                 5      0     9   

   stars  dependencies_count  
0      0                  24  
1     26                   2  
2      1                   3  
3      0                   0

In [6]:
# Locations of the precomputed MTTU and MTTR tables.
mttu_csv = '../../data/mttu/mttu.csv'
mttr_csv = '../../data/mttr/mttr.csv'

# Load each table into its own frame.
mttu_df = pd.read_csv(mttu_csv)
mttr_df = pd.read_csv(mttr_csv)

# Print a titled preview of each frame, MTTU first.
for title, frame in (("MTTU DataFrame:", mttu_df), ("\nMTTR DataFrame:", mttr_df)):
    print(title)
    print(frame.head())

MTTU DataFrame:
  system_name from_package_name  total_duration  out_of_date_duration  \
0       CARGO               a10             397                     0   
1       CARGO           aarch64             481                   325   
2       CARGO      aardvark-dns             618                     0   
3       CARGO            ab-av1             582                   100   
4       CARGO          ab_glyph            3084                    78   

      ratio  avg_total_duration  avg_out_of_date_duration  avg_ratio  
0  0.000000               397.0                       0.0   0.000000  
1  0.675676               481.0                     325.0   0.675676  
2  0.000000               618.0                       0.0   0.000000  
3  0.171821               582.0                     100.0   0.171821  
4  0.025292              1542.0                      39.0   0.025292  

MTTR DataFrame:
  system_name from_package_name  total_duration  total_post_fix_exposure_time  \
0       CARGO        

In [9]:
# Give merged_df the key column names used by the MTTU/MTTR frames, and
# upper-case the platform values (the frames printed above show upper-case
# system names such as CARGO, while project platforms include e.g. 'Pypi').
# NOTE: these renames modify merged_df, mttu_df and mttr_df in place.
library_key_names = {'platform': 'system_name', 'name': 'from_package_name'}
merged_df.rename(columns=library_key_names, inplace=True)
merged_df['system_name'] = merged_df['system_name'].str.upper()

# Expose the averaged metric columns under the short names mttu / mttr.
mttu_df.rename(columns={'avg_out_of_date_duration': 'mttu'}, inplace=True)
mttr_df.rename(columns={'avg_post_fix_exposure_time': 'mttr'}, inplace=True)

# Join each metric separately with an inner join, so each result holds only
# the packages that have a row in that metric's table.
metric_keys = ['system_name', 'from_package_name']
final_df_with_mttu = merged_df.merge(mttu_df[metric_keys + ['mttu']], how='inner', on=metric_keys)
final_df_with_mttr = merged_df.merge(mttr_df[metric_keys + ['mttr']], how='inner', on=metric_keys)

# Preview the MTTU join, then report the row count of each joined frame.
print(final_df_with_mttu.head())
print(len(final_df_with_mttu), len(final_df_with_mttr))

  system_name                              from_package_name  \
0         NPM  @scandipwa/scandipwa-development-toolkit-core   
1         NPM                    react-native-usb-serialport   
2         NPM         @stdlib/stats-base-dists-binomial-mean   
3         NPM                              @arcteryx/js-i18n   
4        PYPI                                     ae-literal   

   contributions_count  dependent_repos_count  dependents_count  forks  rank  \
0                    0                      0                 4      0     7   
1                    0                      1                 3     24     9   
2                    1                      7                 8      0    11   
3                    0                      0                 0      0     6   
4                    0                      1                 5      0     9   

   stars  dependencies_count        mttu  
0      0                  24  588.954545  
1     26                   2  629.000000  
2    

In [10]:
# Project attributes whose correlation with mttu is measured.
columns_to_test = [
    'contributions_count',
    'dependent_repos_count',
    'dependents_count',
    'forks',
    'rank',
    'stars',
    'dependencies_count',
]

# One {column: coefficient} mapping per correlation method. In the printed
# table the methods are the columns and the tested attributes are the rows.
correlation_results = {
    method: {
        column: final_df_with_mttu['mttu'].corr(final_df_with_mttu[column], method=method)
        for column in columns_to_test
    }
    for method in ('spearman', 'pearson', 'kendall')
}

# Show the coefficients as a table.
print("Correlation Results:")
print(pd.DataFrame(correlation_results))

Correlation Results:
                       spearman   pearson   kendall
contributions_count   -0.080630 -0.078723 -0.058088
dependent_repos_count -0.036593  0.000118 -0.028375
dependents_count       0.012382 -0.001856  0.006186
forks                 -0.096607 -0.076057 -0.070858
rank                  -0.167455 -0.150426 -0.119591
stars                 -0.129301 -0.075771 -0.093360
dependencies_count     0.307006  0.100636  0.218040


In [11]:
# Project attributes whose correlation with mttr is measured.
columns_to_test = [
    'contributions_count',
    'dependent_repos_count',
    'dependents_count',
    'forks',
    'rank',
    'stars',
    'dependencies_count',
]

# One {column: coefficient} mapping per correlation method, computed on the
# MTTR join. This rebinds correlation_results, replacing any earlier value.
correlation_results = {
    method: {
        column: final_df_with_mttr['mttr'].corr(final_df_with_mttr[column], method=method)
        for column in columns_to_test
    }
    for method in ('spearman', 'pearson', 'kendall')
}

# Show the coefficients as a table.
print("Correlation Results:")
print(pd.DataFrame(correlation_results))

Correlation Results:
                       spearman   pearson   kendall
contributions_count   -0.226666 -0.068925 -0.160214
dependent_repos_count -0.175643 -0.016029 -0.128650
dependents_count      -0.136298 -0.016564 -0.094875
forks                 -0.211543 -0.023074 -0.152590
rank                  -0.269086 -0.221348 -0.188549
stars                 -0.218776 -0.050688 -0.155921
dependencies_count     0.013570  0.016707  0.009129


In [15]:
# Load the per-package dependency status table.
dep_status_df = pd.read_csv('../../data/dep_status/dep_status.csv')

# Attach the status columns to each metric frame; the inner join keeps only
# packages that appear in dep_status_df.
status_keys = ['system_name', 'from_package_name']
merged_with_mttu = final_df_with_mttu.merge(dep_status_df, how='inner', on=status_keys)
merged_with_mttr = final_df_with_mttr.merge(dep_status_df, how='inner', on=status_keys)

# Integer code for each dependency_status label. Labels missing from this
# mapping become NaN in dependency_status_cat.
# NOTE(review): Spearman ranks these codes, so the coefficient depends on the
# order chosen here (all_floating < all_pinned < mixed) — confirm that this
# ordering is the intended one for a categorical status.
dependency_status_mapping = dict(all_floating=0, all_pinned=1, mixed=2)
for status_frame in (merged_with_mttu, merged_with_mttr):
    status_frame['dependency_status_cat'] = status_frame['dependency_status'].map(dependency_status_mapping)

# Spearman correlation between the coded status and each metric.
mttu_correlation = merged_with_mttu['dependency_status_cat'].corr(merged_with_mttu['mttu'], method='spearman')
mttr_correlation = merged_with_mttr['dependency_status_cat'].corr(merged_with_mttr['mttr'], method='spearman')

# Report both coefficients.
print("Correlation between 'dependency_status' and 'mttu':", mttu_correlation)
print("Correlation between 'dependency_status' and 'mttr':", mttr_correlation)

Correlation between 'dependency_status' and 'mttu': 0.15352493526426683
Correlation between 'dependency_status' and 'mttr': -0.08181994147321954
