In [2]:
import os
import json
import pandas as pd
from pandas import json_normalize

def process_json_files(folder_path, output_file, start_file=None):
    """Flatten the ``info`` section of each JSON file in a folder into one CSV.

    Every ``*.json`` file in ``folder_path`` is loaded and normalised with
    ``json_normalize``. Only columns under the ``info.`` prefix are kept,
    except those under ``info.registry``, ``info.players`` and
    ``info.supersubs.``. The leading ``info.`` is stripped from the remaining
    column names and a ``match_id`` column holding the file name without its
    extension is added. All per-file frames are concatenated and written to
    ``output_file``.

    Args:
        folder_path: Directory that is scanned (non-recursively) for
            ``.json`` files.
        output_file: Path of the CSV file to write.
        start_file: Optional file name. When given, only files whose names
            sort strictly after it are processed. The name does not have to
            exist in the folder.

    Returns:
        None. Progress and problems are reported with ``print``; a file that
        is empty or cannot be parsed is skipped rather than aborting the run.

    Note:
        File names are ordered by plain string comparison, so for example
        ``'999.json'`` sorts after ``'1436482.json'``.
    """
    info_prefix = 'info.'
    # Sub-trees of ``info`` that are deliberately left out of the summary.
    excluded_prefixes = ('info.registry', 'info.players', 'info.supersubs.')

    # List all JSON files in the directory, in sorted (lexicographic) order
    json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

    # Keep only the files that come after the start file, if one is provided.
    # Comparing names (instead of looking up the index of start_file) gives the
    # same result when the file is present and does not fail when it is absent.
    if start_file:
        if start_file not in json_files:
            print(f"Start file not found in folder, using files that sort after it: {start_file}")
        json_files = [f for f in json_files if f > start_file]

    # List to hold all processed DataFrames
    dfs = []

    # Loop through each file
    for file in json_files:
        file_path = os.path.join(folder_path, file)

        try:
            # Check if the file exists and is not empty
            if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
                # JSON text is UTF-8; 'utf-8-sig' also tolerates a leading BOM
                # and avoids depending on the platform's locale encoding.
                with open(file_path, 'r', encoding='utf-8-sig') as f:
                    data = json.load(f)

                # Normalize JSON data into DataFrame
                df = json_normalize(data)

                # Keep 'info.' columns that are not under an excluded sub-tree
                info_columns = [
                    col for col in df.columns
                    if col.startswith(info_prefix) and not col.startswith(excluded_prefixes)
                ]
                # Copy so that the assignments below act on an independent frame
                df = df[info_columns].copy()

                # Remove only the leading 'info.' prefix from the column names;
                # any later 'info.' inside a name is left untouched.
                df.columns = [col[len(info_prefix):] for col in info_columns]

                # Add match_id column with the filename (without extension)
                df['match_id'] = os.path.splitext(file)[0]

                # Append the processed DataFrame to the list
                dfs.append(df)
            else:
                print(f"File not found or is empty: {file_path}")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from file: {file_path}, Error: {str(e)}")
        except Exception as e:
            # Best effort: report the problem and carry on with the next file
            print(f"An error occurred processing file: {file_path}, Error: {str(e)}")

    # Concatenate all DataFrames into a single DataFrame
    if dfs:
        final_df = pd.concat(dfs, ignore_index=True)
        # Save the final DataFrame to CSV
        final_df.to_csv(output_file, index=False)
        print(f"Processed data saved to: {output_file}")
    else:
        print("No data processed. Please check the input files.")

In [3]:
# Old data: summarise every JSON file found in the source folder
old_folder_path = r"H:\all matches\all_json"
old_output_file = r"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\old_match_summary.csv"
process_json_files(
    folder_path=old_folder_path,
    output_file=old_output_file,
)

Processed data saved to: D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\old_match_summary.csv


In [4]:
# New data: summarise only the files that come after the start file
new_folder_path = r"H:\all matches\all_json"
new_output_file = r"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\new_match_summary.csv"
start_file = "1436482.json"
process_json_files(
    folder_path=new_folder_path,
    output_file=new_output_file,
    start_file=start_file,
)

Processed data saved to: D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\new_match_summary.csv


In [5]:
# Locations of the two summary CSVs. These are the same values assigned in the
# cells that generate the files; re-declaring them here lets the concatenation
# step run without re-running those cells.
old_output_file = r"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\old_match_summary.csv"
new_output_file = r"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\new_match_summary.csv"

In [6]:
# Concatenate old and new DataFrames (new rows first)
old_final_df = pd.read_csv(old_output_file, low_memory=False)
new_final_df = pd.read_csv(new_output_file, low_memory=False)
output_file = r"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\match_summary.csv"
final_df = pd.concat([new_final_df, old_final_df], ignore_index=True)

# Both summaries are generated from the same source folder, so a match can
# appear in each of them. Keep only the first occurrence (the row from the new
# summary). match_id is compared as text so that an integer-typed column in one
# CSV still matches a string-typed column in the other.
is_repeat = final_df['match_id'].astype(str).duplicated(keep='first')
final_df = final_df.loc[~is_repeat].reset_index(drop=True)

final_df.to_csv(output_file, index=False)
print(f"Concatenated data saved to: {output_file}")

Concatenated data saved to: D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\match_summary.csv


In [7]:
# Show the first five rows of the combined summary as plain text
print(final_df.head(n=5))

   balls_per_over       city           dates  \
0               6  Spinaceto  ['2024-06-13']   
1               6       Rome  ['2024-06-13']   
2               6  Spinaceto  ['2024-06-13']   
3               6       Rome  ['2024-06-13']   
4               6  Spinaceto  ['2024-06-15']   

                                          event.name  event.match_number  \
0  ICC Men's T20 World Cup Sub Regional Europe Qu...                13.0   
1  ICC Men's T20 World Cup Sub Regional Europe Qu...                14.0   
2  ICC Men's T20 World Cup Sub Regional Europe Qu...                15.0   
3  ICC Men's T20 World Cup Sub Regional Europe Qu...                16.0   
4  ICC Men's T20 World Cup Sub Regional Europe Qu...                17.0   

  event.group gender match_type  match_type_number officials.match_referees  \
0           B   male        T20             2673.0             ['G McCrea']   
1           A   male        T20             2674.0             ['G McCrea']   
2           B   m