In [4]:
import pandas as pd
import json
import os
from datetime import datetime


def parse_custom_date(date_string):
    try:
        # Remove the 'T' and 'PM' from the string
        date_string = date_string.replace("T", " ").replace("PM", "")
        # Parse the date using a custom format
        return datetime.strptime(date_string, "%m/%d/%Y %I:%M:%S")
    except ValueError:
        # If parsing fails, return the original string
        return date_string


def extract_player_data(participants):
    player_data = {}
    for i, team in enumerate(participants[:2], 1):
        players = team.get("players_involved", [])
        for j, player in enumerate(players[:5], 1):  # Limiting to 5 players per team
            prefix = f"team{i}_player{j}"
            player_data.update(
                {
                    f"{prefix}_name": player.get("name", ""),
                    f"{prefix}_id": player.get("id", ""),
                    f"{prefix}_value": player.get("value", ""),
                    f"{prefix}_type": player.get("type", ""),
                }
            )
    return player_data


def process_json_file(file_path):
    with open(file_path, "r") as file:
        data = json.load(file)

    matches = data["matches"]

    columns = [
        "tour_name",
        "result_code",
        "series_id",
        "end_date",
        "event_sub_status",
        "start_date",
        "event_status",
        "winning_margin",
        "venue_id",
        "sport",
        "venue_name",
        "game_id",
        "event_stage",
        "event_name",
        "league_code",
        "series_name",
        "tour_id",
        "venue_gmt_offset",
    ]

    extracted_data = []

    for match in matches:
        row = {col: match.get(col, "") for col in columns}

        # Convert date strings to datetime objects
        for date_field in ["start_date", "end_date"]:
            if row[date_field]:
                row[date_field] = parse_custom_date(row[date_field])

        participants = match.get("participants", [])
        if len(participants) >= 2:
            row["team1_name"] = participants[0].get("name", "")
            row["team1_score"] = participants[0].get("value", "")
            row["team1_highlight"] = participants[0].get("highlight", "")
            row["team2_name"] = participants[1].get("name", "")
            row["team2_score"] = participants[1].get("value", "")
            row["team2_highlight"] = participants[1].get("highlight", "")

            # Determine the winner
            if match["result_code"] == "W":
                row["winner"] = (
                    participants[0]["name"]
                    if participants[0].get("highlight") == "true"
                    else participants[1]["name"]
                )
            elif match["result_code"] == "T":
                row["winner"] = "Tie"
            else:
                row["winner"] = "Unknown"

            # Extract player data
            row.update(extract_player_data(participants))

        # Extract additional match details
        row["win_by_coin_toss"] = match.get("win_by_coin_toss", {}).get("winner", "")
        row["extra_time"] = any(
            p.get("extra_time", {}).get("value") for p in participants
        )
        row["golden_raid"] = any(
            p.get("golden_raid", {}).get("value") for p in participants
        )

        extracted_data.append(row)

    return pd.DataFrame(extracted_data)


def process_directory(directory_path):
    dataframes = {}
    for filename in os.listdir(directory_path):
        if filename.endswith(".json"):
            file_path = os.path.join(directory_path, filename)
            df = process_json_file(file_path)
            dataframes[filename] = df
            print(f"Processed {filename}: {len(df)} rows")
    return dataframes


# Usage
directory_path = "./matchesoverview_json"
all_dataframes = process_directory(directory_path)



Processed s_1.json: 60 rows
Processed s_6.json: 138 rows
Processed s_7.json: 137 rows
Processed s_4.json: 60 rows
Processed s_8.json: 137 rows
Processed s_9.json: 137 rows
Processed s_5.json: 138 rows
Processed s_2.json: 60 rows
Processed s_10.json: 137 rows
Processed s_3.json: 60 rows


In [20]:
for k,v in all_dataframes.items():
    all_dataframes[k].to_csv(f"{k}_df.csv")


In [7]:
first_df = next(iter(all_dataframes.values()))
first_df.head()

# print(first_df.info())


Unnamed: 0,tour_name,result_code,series_id,end_date,event_sub_status,start_date,event_status,winning_margin,venue_id,sport,...,team2_player4_id,team2_player4_value,team2_player4_type,team2_player5_name,team2_player5_id,team2_player5_value,team2_player5_type,win_by_coin_toss,extra_time,golden_raid
0,"Pro Kabaddi League Season 1, 2014",,1,07/26/2014 8:00:00,U Mumba beat Jaipur Pink Panthers (45 - 31),07/26/2014 8:00:00,Completed,14,5,kabaddi,...,44,3,,Balbir Singh,32,0,All Rounder,,False,False
1,"Pro Kabaddi League Season 1, 2014",,1,07/26/2014 9:00:00,Bengaluru Bulls beat Dabang Delhi (47 - 28),07/26/2014 9:00:00,Completed,19,5,kabaddi,...,85,2,,Dharmaraj Cheralathan,42,1,,,False,False
2,"Pro Kabaddi League Season 1, 2014",,1,07/27/2014 8:00:00,Bengaluru Bulls beat Puneri Paltan (40 - 37),07/27/2014 8:00:00,Completed,3,5,kabaddi,...,151,4,Defender,Jitesh Joshi,55,2,Defender,,False,False
3,"Pro Kabaddi League Season 1, 2014",,1,07/27/2014 9:00:00,U Mumba beat Bengal Warriors (36 - 25),07/27/2014 9:00:00,Completed,11,5,kabaddi,...,98,2,Defender,Sunil Jaipal,141,2,Raider,,False,False
4,"Pro Kabaddi League Season 1, 2014",,1,07/28/2014 8:00:00,Dabang Delhi beat Puneri Paltan (35 - 31),07/28/2014 8:00:00,Completed,4,5,kabaddi,...,28,4,Defender,Shrikant Tewthia,107,3,,,False,False


In [None]:

# To combine all DataFrames into one
# combined_df = pd.concat(all_dataframes.values(), ignore_index=True)
# print(combined_df.shape)
# print(combined_df.info())


In [5]:
import os
import pandas as pd


In [7]:
df_l = []

for file_name in os.listdir("./csvs"):
    print(file_name)
    df = pd.read_csv(f"./csvs/{file_name}")
    df_l.append(df)




df_s4_json.csv
df_s5_json.csv
df_s3_json.csv
df_s2_json.csv
df_s9_json.csv
df_s8_json.csv
df_s10_json.csv
df_s7_json.csv
df_s6_json.csv
df_s1_json.csv


In [8]:
merged = pd.concat(df_l)
merged.head()


Unnamed: 0.1,Unnamed: 0,tour_name,result_code,series_id,end_date,event_sub_status,start_date,event_status,winning_margin,venue_id,...,team2_player4_id,team2_player4_value,team2_player4_type,team2_player5_name,team2_player5_id,team2_player5_value,team2_player5_type,win_by_coin_toss,extra_time,golden_raid
0,0,"Pro Kabaddi League Season 4, 2016",W,4,06/25/2016 8:00:00,Puneri Paltan beat Telugu Titans (28-24),06/25/2016 8:00:00,Completed,4.0,5,...,391.0,2.0,All Rounder,Jasmer Singh Gulia,51.0,1.0,,,False,False
1,1,"Pro Kabaddi League Season 4, 2016",W,4,06/25/2016 9:00:00,U Mumba beat Jaipur Pink Panthers (36-34),06/25/2016 9:00:00,Completed,2.0,5,...,212.0,3.0,,Tushar Patil,242.0,2.0,,,False,False
2,2,"Pro Kabaddi League Season 4, 2016",W,4,06/26/2016 8:00:00,Bengaluru Bulls beat Bengal Warriors (24-23),06/26/2016 8:00:00,Completed,1.0,5,...,73.0,3.0,Defender,Nitin Madane,75.0,2.0,Raider,,False,False
3,3,"Pro Kabaddi League Season 4, 2016",W,4,06/26/2016 9:00:00,Puneri Paltan beat U Mumba (41-19),06/26/2016 9:00:00,Completed,22.0,5,...,380.0,2.0,Raider,Sunil,207.0,1.0,,,False,False
4,4,"Pro Kabaddi League Season 4, 2016",W,4,06/27/2016 8:00:00,Bengal Warriors beat Dabang Delhi K.C. (31-23),06/27/2016 8:00:00,Completed,8.0,5,...,58.0,2.0,,Deepak Narwal,211.0,1.0,,,False,False


In [9]:
pd.unique(merged['tour_name'])


array(['Pro Kabaddi League Season 4, 2016',
       'Pro Kabaddi League Season 5, 2017',
       'Pro Kabaddi League Season 3, 2016',
       'Pro Kabaddi League Season 2, 2015',
       'Pro Kabaddi League Season 9, 2022',
       'Pro Kabaddi League Season 8, 2021',
       'Pro Kabaddi League Season 10, 2023',
       'Pro Kabaddi League Season 7, 2019',
       'Pro Kabaddi League Season 6, 2018',
       'Pro Kabaddi League Season 1, 2014'], dtype=object)

In [10]:
len(merged)


1064

In [11]:
merged.to_csv("merged_match_overview.csv")
