# Calculation and saving of xG for World Cup data



In [None]:
import os
import sys

# ==========================================
# Environment Setup: Automatic Detection
# ==========================================
try:
    # The import only succeeds inside a Google Colab runtime; anywhere else
    # it raises ImportError and the local branch below takes over.
    from google.colab import drive
    print("üöÄ Running in Google Colab")

    # Make Google Drive reachable under /content/drive.
    drive.mount('/content/drive')

    # Project folder inside Google Drive (placeholder: edit before running).
    colab_path = 'put your Google Drive path here'

    if not os.path.exists(colab_path):
        print(f"‚ö†Ô∏è Path not found: {colab_path}")
    else:
        os.chdir(colab_path)
        print(f"üìÇ Current Directory moved to: {os.getcwd()}")

except ImportError:
    print("üíª Running Locally")

    # When the notebook is started from the 'notebooks' sub-folder, step up
    # one level so that relative paths resolve against the project root.
    if os.path.split(os.getcwd())[1] == "notebooks":
        os.chdir(os.pardir)
        print("‚¨ÜÔ∏è Moved up from 'notebooks' folder to project root.")

    print(f"üìÇ Current Directory: {os.getcwd()}")

# ==========================================
# Make the project root importable, then load the shared utilities
# ==========================================
# EN_soccer_utils.py is looked up via sys.path, so the working directory
# (set above) is registered there once.
if sys.path.count(os.getcwd()) == 0:
    sys.path.append(os.getcwd())

import EN_soccer_utils as utils
print("‚úÖ Successfully imported EN_soccer_utils")

üöÄ Running in Google Colab
Mounted at /content/drive
üìÇ Current Directory moved to: /content/drive/MyDrive/„Éá„Éº„Çø„Çµ„Ç§„Ç®„É≥„ÇπÂÄã‰∫∫Â≠¶Áøí/Football_Tactical_Analysis
‚úÖ Successfully imported EN_soccer_utils


# Install necessary libraries


In [None]:
!pip install mplsoccer

import pandas as pd
import numpy as np
import pickle
import sys
import os
from mplsoccer import Sbopen
import warnings
from tqdm import tqdm # For progress bar display

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

from EN_soccer_utils import EnsembleXGModel # Load definition with just this

# Remove every pandas display limit (rows, columns, total width, cell width)
# so that DataFrames are rendered in full in the notebook output.
for _display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(_display_option, None)



# Convert World Cup data to fit the created model

In [None]:
# =================================================
# Function to completely convert WC data structure to model data
# =================================================

def reconstruct_statsbomb_features(events_df, freeze_df):
    """
    Rebuild the "merged" event table expected by the model pipeline from the
    raw mplsoccer tables (events, freeze).

    Three things are done to a copy of ``events_df``:

    1. Generic event columns are renamed to their ``shot_*`` names and a
       missing ``shot_type_name`` is filled with 'Open Play'.
    2. Information about the pass referenced by ``shot_key_pass_id`` is
       joined onto each row (``assist_x``, ``assist_y``, ``is_cross``, ...).
       Only pass columns that actually exist in ``events_df`` are used.
    3. The separate freeze-frame table is folded back into a per-event list
       of dictionaries stored in ``shot_freeze_frame``.

    Args:
        events_df: Event table of one match. Must contain 'type_name'; 'id'
            is required for the pass join and the freeze-frame mapping.
        freeze_df: Freeze-frame table with an 'id' column holding the event
            id, or None / an empty frame when no freeze frames are available.

    Returns:
        A new DataFrame with one row per input event (all event types, not
        only shots) plus the columns described above. ``events_df`` is not
        modified.
    """
    df = events_df.copy()

    # ---------------------------------------------------------
    # 1. Basic renaming (only columns whose names differ)
    # ---------------------------------------------------------
    rename_map = {
        'sub_type_name': 'shot_type_name',
        'outcome_name': 'shot_outcome_name',
        'body_part_name': 'shot_body_part_name',
        'technique_name': 'shot_technique_name',
    }
    df = df.rename(columns=rename_map)

    # Rows without an explicit sub type are treated as 'Open Play'.
    if 'shot_type_name' in df.columns:
        df['shot_type_name'] = df['shot_type_name'].fillna('Open Play')

    # ---------------------------------------------------------
    # 2. Merge assist (last pass) information
    #    Shots and passes are separate events, so the pass is looked up
    #    through shot_key_pass_id.
    # ---------------------------------------------------------
    pass_cols = {
        'id': 'shot_key_pass_id',       # Join key
        'x': 'assist_x',                # Passer position X
        'y': 'assist_y',                # Passer position Y
        'pass_cross': 'is_cross',       # Is it a cross?
        'pass_cut_back': 'is_cut_back', # Is it a cut back?
        'pass_through_ball': 'is_through_ball', # Is it a through ball? (absent in World Cup data)
        'pass_height_name': 'pass_height', # Height of the pass
        'pass_switch': 'is_switch',        # Is it a switch of play?
    }

    # Keep the dictionary order instead of a set intersection so that the
    # column order of the result is the same on every run.
    available_pass_cols = [col for col in pass_cols if col in events_df.columns]
    is_pass = events_df['type_name'] == 'Pass'
    passes_subset = events_df.loc[is_pass, available_pass_cols].rename(columns=pass_cols)

    if 'shot_key_pass_id' in df.columns:
        df = df.merge(passes_subset, on='shot_key_pass_id', how='left')

    # Rows without a matching pass come out of the merge as NaN: treat them
    # as False. The explicit cast yields a real bool column and avoids
    # depending on pandas' silent object-dtype downcasting in fillna.
    bool_cols = ['is_cross', 'is_cut_back', 'is_through_ball', 'is_switch']
    for col in bool_cols:
        if col in df.columns:
            df[col] = df[col].fillna(False).astype(bool)

    # Alias for code that looks for the 'is_cutback' spelling.
    if 'is_cut_back' in df.columns:
        df['is_cutback'] = df['is_cut_back']

    # ---------------------------------------------------------
    # 3. Restore the freeze frame (shot_freeze_frame)
    #    mplsoccer returns it as a separate table; convert it back to one
    #    list of record dictionaries per event id.
    # ---------------------------------------------------------
    if freeze_df is not None and not freeze_df.empty:
        # Iterating the groups keeps the 'id' column inside each record and
        # does not rely on GroupBy.apply passing the grouping column.
        freeze_dict = {
            event_id: players.to_dict('records')
            for event_id, players in freeze_df.groupby('id')
        }
        # Events without a freeze frame get NaN.
        df['shot_freeze_frame'] = df['id'].map(freeze_dict)
    else:
        df['shot_freeze_frame'] = None

    return df

In [None]:
# -------------------------------------------------
# Get a list of all matches
# -------------------------------------------------
# Create the mplsoccer StatsBomb parser; it is reused later to fetch the
# events of each match.
parser = Sbopen()
# Match table for competition 43 / season 106.
matches = parser.match(competition_id=43, season_id=106) # WC 2022
print(f"Number of target matches: {len(matches)} matches")

Number of target matches: 64 matches


In [None]:
# Preview the first rows of the match table.
matches.head()

Unnamed: 0,match_id,match_date,kick_off,home_score,away_score,match_status,match_status_360,last_updated,last_updated_360,match_week,competition_id,country_name,competition_name,season_id,season_name,home_team_id,home_team_name,home_team_gender,home_team_group,home_team_country_id,home_team_country_name,home_team_managers_id,home_team_managers_name,home_team_managers_nickname,home_team_managers_dob,home_team_managers_country_id,home_team_managers_country_name,away_team_id,away_team_name,away_team_gender,away_team_group,away_team_country_id,away_team_country_name,away_team_managers_id,away_team_managers_name,away_team_managers_nickname,away_team_managers_dob,away_team_managers_country_id,away_team_managers_country_name,metadata_data_version,metadata_shot_fidelity_version,metadata_xy_fidelity_version,competition_stage_id,competition_stage_name,stadium_id,stadium_name,stadium_country_id,stadium_country_name,referee_id,referee_name,referee_country_id,referee_country_name
0,3857256,2022-12-02,2022-12-02 21:00:00,2,3,available,available,2023-02-17 23:45:15.306706,2023-04-26 23:49:58.956186,3,43,International,FIFA World Cup,106,2022,786,Serbia,male,G,203,Serbia,5908,Dragan Stojkoviƒá,Dragan Stojkoviƒá,1965-03-03,203,Serbia,773,Switzerland,male,G,221,Switzerland,2832,Murat Yakin,Murat Yakin,1974-09-15,221,Switzerland,1.1.0,2,2,10,Group Stage,1001115,Stadium 974,185,Qatar,1121.0,Fernando Andr√©s Rapallini,11.0,Argentina
1,3869151,2022-12-03,2022-12-03 21:00:00,2,1,available,available,2023-07-30 07:46:05.382784,2023-07-30 07:48:51.865595,4,43,International,FIFA World Cup,106,2022,779,Argentina,male,,11,Argentina,5677,Lionel Sebasti√°n Scaloni,Lionel Sebasti√°n Scaloni,1978-05-16,11,Argentina,792,Australia,male,,14,Australia,5696,Graham James Arnold,Graham James Arnold,1963-08-03,14,Australia,1.1.0,2,2,33,Round of 16,1000793,Ahmad bin Ali Stadium,185,Qatar,367.0,Szymon Marciniak,182.0,Poland
2,3857257,2022-11-30,2022-11-30 17:00:00,1,0,available,available,2023-04-28 17:13:10.958725,2023-06-20 11:04:37.638969,3,43,International,FIFA World Cup,106,2022,792,Australia,male,D,14,Australia,5696,Graham James Arnold,Graham James Arnold,1963-08-03,14,Australia,776,Denmark,male,D,61,Denmark,255,Kasper Hjulmand,Kasper Hjulmand,1972-04-09,61,Denmark,1.1.0,2,2,10,Group Stage,117897,Al Janoub Stadium,185,Qatar,2311.0,Mustapha Ghorbal,4.0,Algeria
3,3857258,2022-11-24,2022-11-24 21:00:00,2,0,available,available,2023-06-24 17:17:27.911026,2023-07-11 14:56:31.096588,1,43,International,FIFA World Cup,106,2022,781,Brazil,male,G,31,Brazil,4704,Tel√™ Santana da Silva,Tel√™ Santana,1931-07-26,31,Brazil,786,Serbia,male,G,203,Serbia,5908,Dragan Stojkoviƒá,Dragan Stojkoviƒá,1965-03-03,203,Serbia,1.1.0,2,2,10,Group Stage,1001114,Lusail Stadium,185,Qatar,741.0,Alireza Faghani,107.0,"Iran, Islamic Republic of"
4,3857288,2022-11-26,2022-11-26 12:00:00,0,1,available,available,2023-02-28 21:25:20.888552,2023-04-27 00:30:07.835815,2,43,International,FIFA World Cup,106,2022,777,Tunisia,male,D,232,Tunisia,1001303,Jalel Kadri,Jalel Kadri,NaT,232,Tunisia,792,Australia,male,D,14,Australia,5696,Graham James Arnold,Graham James Arnold,1963-08-03,14,Australia,1.1.0,2,2,10,Group Stage,117897,Al Janoub Stadium,185,Qatar,225.0,Daniel Siebert,85.0,Germany


In [None]:
# Save the match list (one row per match) as CSV, without the index.
PATH =  "data/wc_matches_basic_info.csv"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
matches.to_csv(PATH, index=False)

In [None]:
# -------------------------------------------------
# Loop processing to fetch and merge data for all matches
# -------------------------------------------------
# Shot tables of the matches processed successfully, one DataFrame per match.
all_shots_data = []
# (match_id, match_name) of every match whose download/processing failed.
failed_matches = []

print("üöÄ Fetching and processing data for all matches... (This may take a while ‚òïÔ∏è)")

for _row_index, row in tqdm(matches.iterrows(), total=len(matches)):
    match_id = row['match_id']
    match_name = f"{row['home_team_name']} vs {row['away_team_name']}"

    try:
        # Fetch data for one match
        events, related, freeze, players = parser.event(match_id)

        # Join pass / freeze-frame information onto the events
        events_full = reconstruct_statsbomb_features(events, freeze)

        # Extract only shot data
        shots_only = events_full[events_full['type_name'] == 'Shot'].copy()

        # Add match information
        shots_only['match_name'] = match_name
        shots_only['match_date'] = row['match_date']
        shots_only['home_team'] = row['home_team_name']
        shots_only['away_team'] = row['away_team_name']
        shots_only['match_id'] = match_id # ID for reference

        all_shots_data.append(shots_only)

    except Exception as e:
        # Best effort: one broken match must not stop the whole run, but it
        # is remembered so the gap in the dataset is reported below.
        print(f"‚ö†Ô∏è Error in match {match_id}: {e}")
        failed_matches.append((match_id, match_name))
        continue

# Summarise failures after the progress bar so they are not overlooked.
if failed_matches:
    print(f"\n[Warning] {len(failed_matches)} of {len(matches)} matches failed and are missing from the result: {failed_matches}")

# Merge
if all_shots_data:
    shots_wc_all = pd.concat(all_shots_data, ignore_index=True)
    print(f"\n‚úÖ Data merging complete: {len(shots_wc_all)} shot records")
else:
    # Overwrite any frame left over from an earlier run of this notebook so
    # that later cells cannot silently score stale data.
    shots_wc_all = pd.DataFrame()
    print("‚ùå No data could be retrieved")

üöÄ Fetching and processing data for all matches... (This may take a while ‚òïÔ∏è)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [01:09<00:00,  1.09s/it]


‚úÖ Data merging complete: 1494 shot records





In [None]:
# =================================================
# üöÄ Prediction, Restoration, Quality Check, & Saving (Full Penalty Support)
# =================================================

import pandas as pd
import numpy as np
import pickle

# -------------------------------------------------
# 1. Preparation: Column Definitions
# -------------------------------------------------

# (A) Model input features. Left empty here; the list is read from the
#     loaded model further down.
FEATURES = []

# (B) Raw descriptive columns that have to be brought back after
#     preprocessing.
restore_cols = [
    "shot_outcome_name",
    "shot_type_name",
    "play_pattern_name",
    "shot_body_part_name",
    "shot_technique_name",
]

# (C) Columns that must be present in the final output.
keep_cols = [
    # identifiers and timing
    "match_id",
    "id",
    "period",
    "minute",
    "second",
    # who took the shot
    "team_name",
    "player_name",
    "position_name",
    # shot location
    "x",
    "y",
    # target label
    "is_goal",
]


# -------------------------------------------------
# 2. Execution
# -------------------------------------------------

if 'shots_wc_all' in locals() and len(shots_wc_all) > 0:
    print(f"üõ† 1. Data splitting and preprocessing... (Total: {len(shots_wc_all)} cases)")

    # Penalties are not scored by the model; they receive a fixed xG below.
    is_penalty = shots_wc_all['shot_type_name'] == 'Penalty'

    # 1. Penalty (PK) data
    df_pk = shots_wc_all[is_penalty].copy()

    # 2. Non-penalty data
    df_non_pk = shots_wc_all[~is_penalty].copy()

    print(f"   -> PK: {len(df_pk)} cases, Others: {len(df_non_pk)} cases")


    # -------------------------------------------------------
    # A. Predict Non-PK data
    # -------------------------------------------------------
    # Preprocessing (feature engineering lives in EN_soccer_utils)
    df_processed = utils.preprocess_pipeline(df_non_pk)

    # Load model
    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    model_path = 'models/best_xg_model.pkl'
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # Retrieve feature list: either from the model itself or, for a wrapper
    # object, from its inner xgb_model.
    if hasattr(model, "feature_names_in_"):
        FEATURES = list(model.feature_names_in_)
    elif hasattr(model, "xgb_model"):
        FEATURES = list(model.xgb_model.feature_names_in_)

    # Without feature names the reindex below would produce an empty frame
    # and the prediction would fail with an unrelated error.
    if not FEATURES:
        raise ValueError(
            f"Could not determine the feature list from the model in '{model_path}': "
            "neither 'feature_names_in_' nor 'xgb_model' is available."
        )

    # -------------------------------------------------
    # Prediction (xG Calculation)
    # -------------------------------------------------
    print("ü§ñ 2. Calculating xG...")

    # Prepare prediction data: features missing from the processed frame are
    # added as 0, and object columns are coerced to numbers (bad values -> 0).
    X_wc = df_processed.reindex(columns=FEATURES, fill_value=0)
    for col in X_wc.columns:
        if X_wc[col].dtype == 'object':
            X_wc[col] = pd.to_numeric(X_wc[col], errors='coerce').fillna(0)

    # Probability of the positive class (second column) is the xG value.
    df_processed['xg'] = model.predict_proba(X_wc)[:, 1]



    # -------------------------------------------------------
    # B. Restoration Process (Non-PK)
    # -------------------------------------------------------
    # Restore columns that disappeared in the pipeline
    cols_to_restore = [c for c in restore_cols if c not in df_processed.columns]

    # Also restore any missing columns from keep_cols
    cols_to_keep_check = [c for c in keep_cols if c not in df_processed.columns]
    cols_needed = list(set(cols_to_restore + cols_to_keep_check))

    # Bring the original columns back for the non-PK shots by joining the
    # raw table on the event 'id'.
    if cols_needed:
        df_source = shots_wc_all[['id'] + cols_needed].copy()
        df_processed_full = pd.merge(df_processed, df_source, on='id', how='left')
    else:
        df_processed_full = df_processed.copy()
        print("   ‚ÑπÔ∏è No columns required restoration.")

    # -------------------------------------------------------
    # C. Penalty Data Processing & Merging
    # -------------------------------------------------------

    print("ü§ñ 3. Integrating and filling Penalty data...")

    # 1. Assign fixed xG value for PKs
    # Presumably an approximate penalty conversion rate -- confirm the source.
    PENALTY_XG = 0.76
    df_pk['xg'] = PENALTY_XG

    # 2. Create is_goal column
    # Set to 1 if shot_outcome_name is 'Goal', else 0
    if 'is_goal' not in df_pk.columns:
        df_pk['is_goal'] = (df_pk['shot_outcome_name'] == "Goal").astype(int)

    # 3. Check coordinates (x, y)
    # Expand from 'location' list if x/y are missing, otherwise default to PK spot (108, 40)
    if 'x' not in df_pk.columns and 'location' in df_pk.columns:
        df_pk['x'] = df_pk['location'].apply(lambda loc: loc[0] if isinstance(loc, list) and len(loc)>0 else 108.0)
        df_pk['y'] = df_pk['location'].apply(lambda loc: loc[1] if isinstance(loc, list) and len(loc)>1 else 40.0)
    elif 'x' not in df_pk.columns:
         df_pk['x'] = 108.0 # X-coord of PK spot
         df_pk['y'] = 40.0  # Y-coord of PK spot (Center)

    # 4. Fill missing values in feature columns
    # Penalties never went through the pipeline, so every model feature is
    # created (or completed) with 0 to keep the final table rectangular.
    for col in FEATURES:
        if col not in df_pk.columns:
            df_pk[col] = 0
        else:
            df_pk[col] = df_pk[col].fillna(0)

    # Merge PK and Non-PK datasets
    df_final = pd.concat([df_processed_full, df_pk], ignore_index=True)



    # -------------------------------------------------
    # D. Emergency Restore
    # -------------------------------------------------
    # List of all target output columns (order kept, duplicates removed)
    target_output_cols = keep_cols + restore_cols + ["xg"] + FEATURES
    target_output_cols = list(dict.fromkeys(target_output_cols))

    missing_output_cols = [c for c in target_output_cols if c not in df_final.columns]

    if missing_output_cols:
        print(f"‚ö†Ô∏è [Warning] Following columns missing from final data: {missing_output_cols}")
        print("   -> Attempting emergency restoration from raw data (shots_wc_all)...")

        emergency_cols = [c for c in missing_output_cols if c in shots_wc_all.columns]
        if emergency_cols:
            df_source_emergency = shots_wc_all[['id'] + emergency_cols].copy()
            df_final = pd.merge(df_final, df_source_emergency, on='id', how='left')
            print(f"   ‚úÖ Successfully restored {len(emergency_cols)} columns.")

    # Count missing values in the restored columns BEFORE the blanket fill
    # below. After the fill every NaN has become the string 'None', so a
    # check performed later could never detect a failed restoration.
    null_restored = df_final[restore_cols].isnull().sum()

    # Fill missing values in numeric columns -> 0
    num_cols = df_final.select_dtypes(include=[np.number]).columns
    df_final[num_cols] = df_final[num_cols].fillna(0)

    # Fill missing values in categorical columns -> 'None'
    cat_cols = df_final.select_dtypes(include=['object', 'category']).columns
    df_final[cat_cols] = df_final[cat_cols].fillna('None')

    # Sort data chronologically
    df_final = df_final.sort_values(['match_id', 'period', 'minute', 'second'])

    print("   ‚úÖ Full data cleaning complete")


    # -------------------------------------------------
    # Final Data Quality Assurance (QA)
    # -------------------------------------------------
    print("\nüè• 4. Final Data Quality Check (QA)...")
    is_data_clean = True

    # (A) Null check for restored columns (counts taken before the fill)
    if null_restored.sum() > 0:
        print("   ‚ùå [CRITICAL WARNING] Restored columns contain missing values (NaN)!")
        print(null_restored[null_restored > 0])
        is_data_clean = False
    else:
        print("   ‚úÖ Restored data merge: OK (No missing values)")

    # (B) Null check for features
    # Deliberately evaluated after the fill: feature columns that were only
    # added through the penalty rows are NaN for the other rows until then.
    null_features = df_final[FEATURES].isnull().sum().sum()
    if null_features > 0:
        print(f"   ‚ùå [CRITICAL WARNING] Features contain {null_features} missing values!")
        is_data_clean = False
    else:
        print("   ‚úÖ Feature values: OK (No missing values)")

    # (C) Range check for coordinate data (informational only, never blocks saving)
    if 'x' in df_final.columns and 'y' in df_final.columns:
        # Range is set slightly wider to accommodate PK spot (x=108)
        invalid_x = df_final[(df_final['x'] < -5) | (df_final['x'] > 125)]
        invalid_y = df_final[(df_final['y'] < -5) | (df_final['y'] > 85)]

        if len(invalid_x) + len(invalid_y) > 0:
            print(f"   ‚ö†Ô∏è [Note] Some coordinates are outside pitch bounds (X errors: {len(invalid_x)}, Y errors: {len(invalid_y)})")
        else:
            print("   ‚úÖ Coordinate data (x,y): OK (Within normal range)")


    # (D) Range check for predicted xG values
    if 'xg' in df_final.columns:
        invalid_xg = df_final[(df_final['xg'] < 0) | (df_final['xg'] > 1)]
        if len(invalid_xg) > 0:
            print(f"   ‚ùå [CRITICAL WARNING] xG values out of 0-1 range! ({len(invalid_xg)} cases)")
            is_data_clean = False
        else:
            print("   ‚úÖ Prediction values (xG): OK (Within 0-1 range)")

    # -------------------------------------------------
    # Save (Executed only if data is clean)
    # -------------------------------------------------
    if is_data_clean:
        print("\nüíæ 5. Saving data...")

        # --- A. Create "Chosen Columns" version ---
        # Filter only columns that actually exist
        final_save_cols = [c for c in target_output_cols if c in df_final.columns]
        df_save = df_final[final_save_cols]

        # --- B. Execute Saving ---
        SAVE_PATH_ALL = "data/wc_all_matches_scored_with_all_cols.csv"
        SAVE_PATH_CHOSEN = "data/wc_all_matches_scored_with_chosen_cols.csv"

        df_final.to_csv(SAVE_PATH_ALL, index=False)
        df_save.to_csv(SAVE_PATH_CHOSEN, index=False)

        print(f"   üíæ Full Version (All) saved: {SAVE_PATH_ALL}")
        print(f"     -> Rows: {len(df_final)}, Columns: {len(df_final.columns)}")

        print(f"   üíæ Selected Version (Chosen) saved: {SAVE_PATH_CHOSEN}")
        print(f"     -> Rows: {len(df_save)}, Columns: {len(df_save.columns)}")

        print("\n   üéâ All data required for analysis is now ready!")

    else:
        print("\nüõë [Aborted] Saving cancelled due to critical issues found during Quality Check.")

else:
    print("‚ö†Ô∏è Error: 'shots_wc_all' not found.")

üõ† 1. Data splitting and preprocessing... (Total: 1494 cases)
   -> PK: 64 cases, Others: 1430 cases
ü•∂ Calculating Freeze Frame features (Universal fixed version)...
ü§ñ 2. Calculating xG...
ü§ñ 3. Integrating and filling Penalty data...
   ‚úÖ Full data cleaning complete

üè• 4. Final Data Quality Check (QA)...
   ‚úÖ Restored data merge: OK (No missing values)
   ‚úÖ Feature values: OK (No missing values)
   ‚úÖ Coordinate data (x,y): OK (Within normal range)
   ‚úÖ Prediction values (xG): OK (Within 0-1 range)

üíæ 5. Saving data...
   üíæ Full Version (All) saved: data/wc_all_matches_scored_with_all_cols.csv
     -> Rows: 1494, Columns: 143
   üíæ Selected Version (Chosen) saved: data/wc_all_matches_scored_with_chosen_cols.csv
     -> Rows: 1494, Columns: 62

   üéâ All data required for analysis is now ready!


# Explore the saved data

In [None]:
# Show the feature names taken from the model, then list every column of
# the final table, one per line.
print(FEATURES)

for col in df_final.columns:
    print(col)

[np.str_('shot_distance'), np.str_('shot_angle'), np.str_('under_pressure'), np.str_('body_Head'), np.str_('body_Left Foot'), np.str_('body_Other'), np.str_('body_Right Foot'), np.str_('shoot_type_Corner'), np.str_('shoot_type_Free Kick'), np.str_('shoot_type_Open Play'), np.str_('pattern_From Corner'), np.str_('pattern_From Counter'), np.str_('pattern_From Free Kick'), np.str_('pattern_From Goal Kick'), np.str_('pattern_From Keeper'), np.str_('pattern_From Kick Off'), np.str_('pattern_From Throw In'), np.str_('pattern_Other'), np.str_('pattern_Regular Play'), np.str_('shoot_technic_Backheel'), np.str_('shoot_technic_Diving Header'), np.str_('shoot_technic_Half Volley'), np.str_('shoot_technic_Lob'), np.str_('shoot_technic_Normal'), np.str_('shoot_technic_Overhead Kick'), np.str_('shoot_technic_Volley'), np.str_('effective_goal_width'), np.str_('angle_from_center'), np.str_('in_penalty_area'), np.str_('in_six_yard_box'), np.str_('distance_angle_interaction'), np.str_('distance_squared'

In [None]:
# Preview the model feature columns for the first rows of the final table.
df_final[FEATURES].head()

Unnamed: 0,shot_distance,shot_angle,under_pressure,body_Head,body_Left Foot,body_Other,body_Right Foot,shoot_type_Corner,shoot_type_Free Kick,shoot_type_Open Play,pattern_From Corner,pattern_From Counter,pattern_From Free Kick,pattern_From Goal Kick,pattern_From Keeper,pattern_From Kick Off,pattern_From Throw In,pattern_Other,pattern_Regular Play,shoot_technic_Backheel,shoot_technic_Diving Header,shoot_technic_Half Volley,shoot_technic_Lob,shoot_technic_Normal,shoot_technic_Overhead Kick,shoot_technic_Volley,effective_goal_width,angle_from_center,in_penalty_area,in_six_yard_box,distance_angle_interaction,distance_squared,has_assist,pass_length,pass_progress_x,is_cross,is_cutback,is_through_ball,is_pass_ground,is_pass_high,is_pass_low,gk_distance_to_shooter,num_opponents_in_shot_cone,num_teammates_in_shot_cone,total_players_in_shot_cone
1406,29.912539,0.244454,0.0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,7.348866,12.0,0,0,7.312234,894.76,1,16.717057,12.5,0,0,0,1,0,0,27.637836,1,0,1
1407,15.969972,0.198168,0.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,3.175131,14.8,1,0,3.164734,255.04,0,0.0,0.0,0,0,0,0,0,0,12.04201,1,0,1
1408,26.977954,0.290499,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,7.892649,4.5,0,0,7.837066,727.81,1,18.343936,5.5,0,0,0,1,0,0,22.78991,1,0,1
1409,11.672618,0.338988,0.0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3.99521,10.4,1,0,3.956878,136.25,1,29.972321,-5.3,0,0,0,0,1,0,10.176935,0,0,0
1410,8.850989,0.542728,0.0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4.925172,7.5,1,1,4.80368,78.34,1,21.708293,8.9,1,0,0,0,0,1,5.269725,0,0,0


In [None]:
# Distinct shot types present in the final table.
df_final["shot_type_name"].unique()

array(['Open Play', 'Corner', 'Free Kick', 'Penalty'], dtype=object)

In [None]:
# Goal flag and xG of the penalty rows only.
df_final[df_final["shot_type_name"] == "Penalty"][["is_goal","xg"]]

Unnamed: 0,is_goal,xg
1453,0,0.76
1493,0,0.76
1430,1,0.76
1492,0,0.76
1482,1,0.76
1491,1,0.76
1484,1,0.76
1483,1,0.76
1485,1,0.76
1490,1,0.76
