In [7]:
import os

def delete_files(target_dir, file_ends, level=2):
    """Delete files with the given name endings located exactly `level` directories below `target_dir`.

    Args:
        target_dir: Root directory to walk.
        file_ends: Iterable of filename suffixes; a file is removed when its
            name ends with any of them.
        level: Depth relative to `target_dir` at which files are removed
            (0 means `target_dir` itself).

    Prints one "Deleted: <path>" line for every file removed.
    """
    # Materialise once: str.endswith accepts a tuple, and a one-shot iterator
    # would otherwise be exhausted after the first filename.
    suffixes = tuple(file_ends)

    for dirpath, dirnames, filenames in os.walk(target_dir):
        # Depth is measured on the path relative to the root, so a trailing
        # separator or a leading "./" in target_dir cannot shift the count.
        rel_path = os.path.relpath(dirpath, target_dir)
        cur_level = 0 if rel_path == os.curdir else rel_path.count(os.sep) + 1

        if cur_level >= level:
            # Nothing deeper can match; clearing dirnames in place stops
            # os.walk from descending any further.
            dirnames[:] = []
        if cur_level != level:
            continue

        for filename in filenames:
            if filename.endswith(suffixes):
                file_path = os.path.join(dirpath, filename)
                os.remove(file_path)
                print(f"Deleted: {file_path}")

# Root of the downloaded data and the filename endings to purge from it.
target_dir = 'data/github'
file_ends = [
    'details.json',
    'meta.json',
    'active_dates_and_times.json',
    'detail.json',
    'contributor_email_suffixes.json',
]
delete_files(target_dir=target_dir, file_ends=file_ends, level=2)


Deleted: github/tadashi0713/wdio5-mocha-testrail-reporter/active_dates_and_times.json
Deleted: github/tadashi0713/wdio5-mocha-testrail-reporter/contributor_email_suffixes.json
Deleted: github/tadashi0713/wdio5-mocha-testrail-reporter/meta.json
Deleted: github/tadashi0713/wdio5-testrail-reporter/active_dates_and_times.json
Deleted: github/tadashi0713/wdio5-testrail-reporter/meta.json
Deleted: github/milesj/boost/bus_factor_detail.json
Deleted: github/milesj/boost/active_dates_and_times.json
Deleted: github/milesj/boost/activity_details.json
Deleted: github/milesj/boost/new_contributors_detail.json
Deleted: github/milesj/boost/contributor_email_suffixes.json
Deleted: github/milesj/boost/meta.json
Deleted: github/dmnd/dedent/bus_factor_detail.json
Deleted: github/dmnd/dedent/active_dates_and_times.json
Deleted: github/dmnd/dedent/activity_details.json
Deleted: github/dmnd/dedent/new_contributors_detail.json
Deleted: github/dmnd/dedent/contributor_email_suffixes.json
Deleted: github/dmnd/d

In [8]:
import os
import json

def extract_avg_from_json(target_dir, file_list):
    """Replace each listed JSON file under `target_dir` with a file holding only its 'avg' entry.

    For every directory below `target_dir` and every name in `file_list` found
    there, the value stored under the top-level key 'avg' is written to a
    sibling file whose name has `_avg` inserted before the extension
    (`x.json` -> `x_avg.json`), and the original file is then deleted.

    Files whose top-level JSON value is not an object with an 'avg' key are
    reported and left untouched, so one unexpected file cannot abort the run
    half-way through the tree.

    Args:
        target_dir: Root directory to walk.
        file_list: Exact file names to process.
    """
    for dirpath, _dirnames, filenames in os.walk(target_dir):
        present = set(filenames)
        for file_name in file_list:
            if file_name not in present:
                continue

            print(file_name)
            old_file_path = os.path.join(dirpath, file_name)
            with open(old_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict) or 'avg' not in data:
                print(f"Skipping {old_file_path}: no 'avg' entry found")
                continue

            # Build the new name from the stem so it always differs from the
            # source name; otherwise the output would overwrite the source and
            # then be removed by the os.remove below.
            stem, ext = os.path.splitext(file_name)
            new_file_name = f"{stem}_avg{ext}"
            new_file_path = os.path.join(dirpath, new_file_name)
            with open(new_file_path, 'w', encoding='utf-8') as out_file:
                json.dump(data['avg'], out_file)

            # Delete the source file only after the extracted copy is written.
            os.remove(old_file_path)
            print(f"Extracted 'avg' data from {file_name}, saved to {new_file_name}, and deleted the original file in directory {dirpath}")


# Root of the downloaded data and the exact file names to reduce to their 'avg' entry.
target_dir = 'data/github'
file_list = [
    'change_request_resolution_duration.json',
    'change_request_response_time.json',
    'issue_resolution_duration.json',
    'issue_response_time.json',
    'issue_age.json',
    'change_request_age.json',
]

extract_avg_from_json(target_dir=target_dir, file_list=file_list)


change_request_resolution_duration.json
Extracted 'avg' data from change_request_resolution_duration.json, saved to change_request_resolution_duration_avg.json, and deleted the original file in directory github/milesj/boost
change_request_response_time.json
Extracted 'avg' data from change_request_response_time.json, saved to change_request_response_time_avg.json, and deleted the original file in directory github/milesj/boost
issue_resolution_duration.json
Extracted 'avg' data from issue_resolution_duration.json, saved to issue_resolution_duration_avg.json, and deleted the original file in directory github/milesj/boost
issue_response_time.json
Extracted 'avg' data from issue_response_time.json, saved to issue_response_time_avg.json, and deleted the original file in directory github/milesj/boost
issue_age.json
Extracted 'avg' data from issue_age.json, saved to issue_age_avg.json, and deleted the original file in directory github/milesj/boost
change_request_age.json
Extracted 'avg' data 

In [9]:
import os
import pandas as pd

def _load_metric_frame(file_path, metric_name):
    """Read one JSON file into a single-column DataFrame indexed by monthly period."""
    series = pd.read_json(file_path, typ='series')
    frame = series.to_frame().reset_index()
    frame.columns = ['Date', metric_name]

    # Normalise the keys: drop any literal "-raw" marker, then reduce to a
    # year-month period.
    frame['Date'] = frame['Date'].astype(str).str.replace('-raw', '', regex=False)
    frame['Date'] = pd.to_datetime(frame['Date']).dt.to_period('M')

    # Keys that collapse onto the same month keep only their first row.
    return frame.drop_duplicates(subset='Date').set_index('Date')


def json_to_csv(target_directory, output_directory):
    """Combine the JSON files of each directory under `target_directory` into one CSV.

    Every directory that directly contains at least one `.json` file yields
    `<directory name>.csv` in `output_directory`, with a `Date` index
    (year-month) and one column per JSON file, named after the file without
    its `.json` extension.

    Because the CSV is named after the directory's base name only, two
    directories with the same name write to the same file. The later one still
    wins, but a warning is printed so the overwrite is no longer silent.

    Args:
        target_directory: Root directory to walk.
        output_directory: Directory receiving the CSV files; created if missing.
    """
    os.makedirs(output_directory, exist_ok=True)

    written_from = {}  # CSV file name -> directory it was last written from
    for dirpath, _dirnames, filenames in os.walk(target_directory):
        frames = [
            _load_metric_frame(os.path.join(dirpath, filename), filename[:-5])
            for filename in filenames
            if filename.endswith('.json')
        ]
        if not frames:
            continue

        # normpath keeps the base name non-empty when dirpath ends with a separator.
        current_subdir = os.path.basename(os.path.normpath(dirpath))
        csv_name = f'{current_subdir}.csv'
        if csv_name in written_from:
            print(f"Warning: {csv_name} was already written from {written_from[csv_name]} "
                  f"and is being overwritten with data from {dirpath}")
        written_from[csv_name] = dirpath

        combined_df = pd.concat(frames, axis=1)
        combined_df.to_csv(os.path.join(output_directory, csv_name))

# Write the CSVs to 'data/out', the directory that the check_csv_files and
# merge_csv_files cells read from.
json_to_csv('data/github', 'data/out')


## Transform to CSV

In [14]:
import os
import pandas as pd

def check_csv_files(input_directory, expected_columns=26):
    """Report the CSV files in `input_directory` whose column count differs from `expected_columns`.

    Files that are empty or contain only a header line are reported and
    skipped. For every other `.csv` file the data rows (header skipped) are
    parsed and the number of columns is compared with `expected_columns`.
    Results are printed; nothing is returned.

    Args:
        input_directory: Directory containing the CSV files to inspect.
        expected_columns: Number of columns each file should have,
            counting the index column. Defaults to 26.
    """
    problem_files = []

    for filename in os.listdir(input_directory):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(input_directory, filename)

        # Only "more than one non-blank line" matters, so stop reading as soon
        # as a header plus one further line has been seen.
        non_empty_lines = 0
        with open(file_path, 'r') as f:
            for line in f:
                if line.strip():
                    non_empty_lines += 1
                    if non_empty_lines > 1:
                        break
        if non_empty_lines <= 1:
            print(f"Skipping file with insufficient data: {filename}")
            continue

        df_temp = pd.read_csv(file_path, skiprows=1, header=None, index_col=None)
        column_count = df_temp.shape[1]

        if column_count != expected_columns:
            print(f"Column mismatch in file: {filename}. It has {column_count} columns.")
            print("Columns in this file:", df_temp.columns.tolist())
            problem_files.append(filename)

    if problem_files:
        print("\nFiles with column mismatches:")
        for file in problem_files:
            print(file)
    else:
        print("All files have the expected number of columns.")

# Report the CSV files in data/out whose column count is unexpected.
check_csv_files(input_directory='data/out')


Column mismatch in file: vue-dropdown-filter.csv. It has 18 columns.
Columns in this file: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Column mismatch in file: mapslice.csv. It has 25 columns.
Columns in this file: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
Column mismatch in file: sql.js.csv. It has 7 columns.
Columns in this file: [0, 1, 2, 3, 4, 5, 6]
Column mismatch in file: react-hooks.csv. It has 17 columns.
Columns in this file: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Column mismatch in file: cutty.csv. It has 17 columns.
Columns in this file: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Column mismatch in file: node-crackers.csv. It has 4 columns.
Columns in this file: [0, 1, 2, 3]
Column mismatch in file: ngx-swagger-client-generator.csv. It has 3 columns.
Columns in this file: [0, 1, 2]
Column mismatch in file: fd-ts-react-teaser.csv. It has 23 columns.
Columns in this file: [0

In [25]:
import os
import pandas as pd

def merge_csv_files(input_directory, output_file, last_time_file):
    """Merge the per-repository CSVs into one file, truncated at each repository's cutoff time.

    For every `<repo>.csv` in `input_directory` whose name appears in
    `last_time_file`, rows with `Date` later than that repository's
    `recent_time` are dropped, a `repo_name` column is added, missing expected
    columns are filled with 0, and columns outside the expected set are
    discarded. The frames are concatenated, remaining missing values are
    replaced by 0, and the result is written to `output_file`.

    Args:
        input_directory: Directory holding the per-repository CSV files; each
            must contain a `Date` column.
        output_file: Path of the merged CSV to write.
        last_time_file: CSV with `repo_name` and `recent_time` columns. Only
            the part of `repo_name` after the last '/' is used for matching.

    Prints a line for each skipped file and a final status message. Nothing is
    written when no file matches.
    """
    # Map bare repository name -> cutoff time.
    repo_last_time_df = pd.read_csv(last_time_file)
    short_names = repo_last_time_df['repo_name'].str.split('/').str[-1]
    repo_time_dict = dict(zip(short_names, repo_last_time_df['recent_time']))

    # Columns kept in the merged output. 'openrank' is not in this list, so it
    # is dropped from any input that carries it.
    expected_columns = ['Date', 'new_contributors', 'change_request_response_time_avg', 'issue_age_avg',
                        'code_change_lines_sum', 'issues_new', 'issues_and_change_request_active',
                        'code_change_lines_add', 'attention', 'issue_comments', 'change_requests_accepted',
                        'change_request_age_avg', 'participants', 'bus_factor', 'code_change_lines_remove',
                        'inactive_contributors', 'change_requests_reviews', 'activity', 'change_request_resolution_duration_avg',
                        'issues_closed', 'change_requests', 'issue_response_time_avg', 'issue_resolution_duration_avg', 'stars']
    feature_columns = [col for col in expected_columns if col not in ('Date', 'repo_name')]
    output_columns = ['Date', 'repo_name'] + feature_columns

    csv_suffix = '.csv'
    dfs = []
    for filename in os.listdir(input_directory):
        if not filename.endswith(csv_suffix):
            continue

        # Strip only the trailing extension; str.replace would also remove a
        # ".csv" occurring inside the repository name itself.
        repo_name = filename[:-len(csv_suffix)]
        if repo_name not in repo_time_dict:
            print(f"Skipping {filename} as it's not found in repo_name_last_time.csv")
            continue

        df_temp = pd.read_csv(os.path.join(input_directory, filename))
        cutoff_time = pd.to_datetime(repo_time_dict[repo_name])
        df_temp['Date'] = pd.to_datetime(df_temp['Date'])

        # .copy() gives an independent frame, so the column assignments below
        # do not write into a slice of the unfiltered data.
        df_temp = df_temp[df_temp['Date'] <= cutoff_time].copy()
        df_temp['repo_name'] = repo_name

        for col in feature_columns:
            if col not in df_temp.columns:
                df_temp[col] = 0

        dfs.append(df_temp[output_columns])

    if not dfs:
        print("No matching csv files found!")
        return

    # ignore_index=True already yields a fresh 0..n-1 index.
    merged_df = pd.concat(dfs, ignore_index=True).fillna(0)
    merged_df.to_csv(output_file, index=False)
    print("Merge success!")

# Merge the CSVs in data/out into data/merged.csv, using the cutoff times from repo_name_last_time.csv.
merge_csv_files(
    input_directory='data/out',
    output_file='data/merged.csv',
    last_time_file='data/repo_name_last_time.csv',
)


Skipping connection-model.csv as it's not found in repo_name_last_time.csv
Skipping ngx-swagger-client-generator.csv as it's not found in repo_name_last_time.csv
Skipping adze.csv as it's not found in repo_name_last_time.csv
Skipping grunt-svg2png.csv as it's not found in repo_name_last_time.csv
Skipping streamr-client-javascript.csv as it's not found in repo_name_last_time.csv
Skipping docker-manager-npm.csv as it's not found in repo_name_last_time.csv
Skipping macaca-test-sample-nodejs.csv as it's not found in repo_name_last_time.csv
Skipping asset-pipe-client.csv as it's not found in repo_name_last_time.csv
Skipping jv-datepicker.csv as it's not found in repo_name_last_time.csv
Skipping typer.csv as it's not found in repo_name_last_time.csv
Skipping b24-sso.csv as it's not found in repo_name_last_time.csv
Skipping crc.csv as it's not found in repo_name_last_time.csv
Skipping web3-exchanges.csv as it's not found in repo_name_last_time.csv
Skipping depay-web3-wallets.csv as it's not f

In [26]:
import pandas as pd

# Load the merged data and the label table.
df_merged = pd.read_csv('data/merged.csv')
df_label = pd.read_csv('data/label_data.csv')

# In the label table, keep only the part of repo_name after the last '/'.
df_label['repo_name'] = df_label['repo_name'].str.split('/').str[-1]

# Left-join the 'type' column onto the merged data, keyed by repo_name; 'type'
# is the column meant to serve as the machine-learning label.
# A repo_name may match several label rows, which duplicates (Date, repo_name)
# pairs after the join. Those duplicates are simply dropped here (the first
# row is kept) rather than resolved, e.g. by picking the most common type.
result_df = (
    df_merged
    .merge(df_label[['repo_name', 'type']], on='repo_name', how='left')
    .drop_duplicates(subset=['Date', 'repo_name'])
)

# Save the labelled data.
result_df.to_csv('data/merged_with_type.csv', index=False)
