### **1. Data Collection**

In [2]:
import os
import json
import pandas as pd
from pandas import json_normalize

def process_json_files(folder_path, output_file, start_file=None, end_file=None):
    """Flatten the ``info`` section of every JSON file in a folder into one CSV.

    Each ``*.json`` file in ``folder_path`` is loaded, flattened with
    ``json_normalize`` and reduced to its ``info.*`` columns, leaving out the
    columns under ``info.registry``, ``info.players`` and ``info.supersubs.``.
    The leading ``info.`` is stripped from the remaining column names, a
    ``match_id`` column holding the file name without its extension is added,
    and all per-file frames are concatenated and written to ``output_file``.

    Files that are empty, hold invalid JSON, or fail for any other reason are
    reported with ``print`` and skipped, so one bad file does not abort the run.

    Parameters
    ----------
    folder_path : str
        Directory that contains the JSON files.
    output_file : str
        Path of the CSV file to write. Its parent directory is created when
        it does not exist yet.
    start_file : str, optional
        File name (for example ``"1000.json"``) of the first file to process,
        inclusive. Compared as a string against the sorted file names.
        ``None`` (the default) means no lower bound.
    end_file : str, optional
        File name of the last file to process, inclusive. ``None`` (the
        default) means no upper bound.

    Returns
    -------
    None
        The result is written to ``output_file``; progress and errors are
        printed.
    """
    prefix = "info."
    excluded_prefixes = ("info.registry", "info.players", "info.supersubs.")

    # List all JSON files in the directory, in sorted (string) order
    json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

    # Restrict to the requested inclusive range of file names, if any
    if start_file is not None:
        json_files = [f for f in json_files if f >= start_file]
    if end_file is not None:
        json_files = [f for f in json_files if f <= end_file]

    # List to hold all processed DataFrames
    dfs = []

    # Loop through each file
    for file in json_files:
        file_path = os.path.join(folder_path, file)

        try:
            # Check if the file exists and is not empty
            if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
                # JSON text is UTF-8; do not rely on the platform default encoding
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Normalize JSON data into DataFrame
                df = json_normalize(data)

                # Keep the 'info.' columns except the excluded sub-sections
                info_columns = [
                    col for col in df.columns
                    if col.startswith(prefix) and not col.startswith(excluded_prefixes)
                ]
                # Explicit copy so that adding 'match_id' below never writes
                # into a slice of the normalized frame
                df = df[info_columns].copy()

                # Remove only the leading 'info.' from the column names; a later
                # 'info.' inside a nested key is part of the name and is kept
                df.columns = [col[len(prefix):] for col in info_columns]

                # Add match_id column with the filename
                df['match_id'] = os.path.splitext(file)[0]

                # Append the processed DataFrame to the list
                dfs.append(df)
            else:
                print(f"File not found or is empty: {file_path}")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from file: {file_path}, Error: {str(e)}")
        except Exception as e:
            # Deliberate best-effort: report the file and continue with the rest
            print(f"An error occurred processing file: {file_path}, Error: {str(e)}")

    # Concatenate all DataFrames into a single DataFrame
    if dfs:
        final_df = pd.concat(dfs, ignore_index=True)
        # Make sure the destination directory exists before writing
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Save the final DataFrame to CSV
        final_df.to_csv(output_file, index=False)
        print(f"Processed data saved to: {output_file}")
    else:
        print("No data processed. Please check the input files.")

In [3]:
# Old Data: folder with the raw match JSON files and the summary CSV to produce
folder_path = r"F:\all_matches"
output_file = r"D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\match_summary.csv"

# Build the match summary CSV from every JSON file in the folder
process_json_files(folder_path=folder_path, output_file=output_file)

Processed data saved to: D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\match_summary.csv


In [4]:
# Read the saved match summary back from disk, report its row count and preview it
df = pd.read_csv(
    r'D:\GITHUB\Predictive-Analytics-for-Cricket-Matches-Using-Machine-Learning\source_data\Summary\match_summary\match_summary.csv',
    low_memory=False,
)
print("Length of Data Frame: ", df.shape[0])
df.head()

Length of Data Frame:  17960


Unnamed: 0,balls_per_over,city,dates,event.match_number,event.name,gender,match_type,match_type_number,officials.match_referees,officials.reserve_umpires,...,event.sub_name,outcome.result,outcome.method,missing,event.stage,event.group,outcome.eliminator,toss.uncontested,bowl_out,outcome.bowl_out
0,6,Perth,"['2016-11-03', '2016-11-04', '2016-11-05', '20...",1.0,South Africa in Australia Test Series,male,Test,2230.0,['AJ Pycroft'],['SJ Nogajski'],...,,,,,,,,,,
1,6,Hobart,"['2016-11-12', '2016-11-13', '2016-11-14', '20...",2.0,South Africa in Australia Test Series,male,Test,2233.0,['AJ Pycroft'],['MD Martell'],...,,,,,,,,,,
2,6,,"['2016-11-24', '2016-11-25', '2016-11-26', '20...",3.0,South Africa in Australia Test Series,male,Test,2236.0,['AJ Pycroft'],['P Wilson'],...,,,,,,,,,,
3,6,Brisbane,"['2016-12-15', '2016-12-16', '2016-12-17', '20...",1.0,Pakistan in Australia Test Series,male,Test,2240.0,['RS Madugalle'],['MD Martell'],...,,,,,,,,,,
4,6,,"['2016-12-26', '2016-12-27', '2016-12-28', '20...",2.0,Pakistan in Australia Test Series,male,Test,2242.0,['RS Madugalle'],['SJ Nogajski'],...,,,,,,,,,,
