Get the raw data from https://www.kaggle.com/datasets/paololol/league-of-legends-ranked-matches/data?select=stats1.csv

The data obtained from Kaggle is already cleaned, so further cleaning is unnecessary.

Due to the large dataset size, the data is divided into two files. We concatenate these files and then merge them with participant data and match data. 
After merging, we filter the data to include only matches with exactly 10 participants, ensuring that roles are not duplicated within each team.

In [None]:
import pandas as pd
import numpy as np

# Load the raw Kaggle tables.
participant_data = pd.read_csv('./data/participants.csv')
stat1_data = pd.read_csv('./data/stats1.csv')
stat2_data = pd.read_csv('./data/stats2.csv')
matches_data = pd.read_csv('./data/matches.csv')

# The per-participant stats are shipped as two files; stack them into one table.
merged_stat_data = pd.concat([stat1_data, stat2_data])

# Step 1: Merge participant_data with merged_stat_data on 'id'
merged_data = pd.merge(participant_data, merged_stat_data, on='id', how='outer')

# Step 2: Attach match-level columns (participants.matchid == matches.id)
merged_data = pd.merge(merged_data, matches_data, left_on='matchid', right_on='id', how='inner')

# Step 3: Remove the 'id_y' column and rename 'id_x' to 'participantid'
merged_data = merged_data.drop(columns=['id_y']).rename(columns={'id_x': 'participantid'})

# Step 4: Keep records where dmgtoobj >= dmgtoturrets.
# Values that cannot be parsed become NaN and compare as False, so they are dropped too.
merged_data = merged_data[
    pd.to_numeric(merged_data['dmgtoobj'], errors='coerce') >= pd.to_numeric(merged_data['dmgtoturrets'], errors='coerce')
]
merged_data = merged_data.dropna(subset=['dmgtoobj', 'dmgtoturrets'])

# Step 5: Count the occurrences of each match_id in the filtered merged_data
match_id_counts = merged_data['matchid'].value_counts()

# Step 6: Find match_ids that appear exactly 10 times
match_ids_with_10_records = match_id_counts[match_id_counts == 10].index

# Step 7: Keep only those matches. The explicit copy makes the result an
# independent frame, so adding 'role_position' below is not a chained
# assignment on a slice of merged_data.
participants_with_10_records = merged_data[merged_data['matchid'].isin(match_ids_with_10_records)].copy()

# Step 8: Define conditions for assigning roles
conditions = [
    (participants_with_10_records['role'] == 'SOLO') & (participants_with_10_records['position'] == 'TOP'),
    (participants_with_10_records['role'] == 'NONE') & (participants_with_10_records['position'] == 'JUNGLE'),
    (participants_with_10_records['role'] == 'SOLO') & (participants_with_10_records['position'] == 'MID'),
    (participants_with_10_records['role'] == 'DUO_CARRY') & (participants_with_10_records['position'] == 'BOT'),
    (participants_with_10_records['role'] == 'DUO_SUPPORT') & (participants_with_10_records['position'] == 'BOT')
]

# Define the corresponding values for each condition
values = ['TOP', 'JUNGLE', 'MID', 'ADC', 'SUPPORT']

# Create a new column 'role_position'; anything matching no condition is 'UNKNOWN'
participants_with_10_records['role_position'] = np.select(conditions, values, default='UNKNOWN')

# Step 9: Keep matches where both the winning (win == 1) and the losing
# (win == 0) side have exactly 5 players with 5 distinct role_position values.
# A single groupby replaces a per-match Python loop that re-scanned the whole
# frame for every match id.
team_stats = (
    participants_with_10_records[participants_with_10_records['win'].isin([0, 1])]
    .groupby(['matchid', 'win'])['role_position']
    .agg(['size', 'nunique'])
)
complete_teams = team_stats[(team_stats['size'] == 5) & (team_stats['nunique'] == 5)]
complete_teams_per_match = complete_teams.groupby(level='matchid').size()
valid_match_ids = set(complete_teams_per_match[complete_teams_per_match == 2].index)

# Keep a list in the same order as match_ids_with_10_records.
valid_matches = [match_id for match_id in match_ids_with_10_records if match_id in valid_match_ids]

# Step 10: Filter the merged_data again to keep only valid matches
valid_participants = participants_with_10_records[participants_with_10_records['matchid'].isin(valid_matches)]

print(valid_participants.shape)

# Step 11: Save the result to a new CSV file
# valid_participants.to_csv('../data/new/parti10records_unique_role.csv', index=False)

print(f"Number of valid matches with exactly 5 unique roles per team based on win/loss: {len(valid_matches)}")


Filter the columns to reduce the dataset size while retaining an overview of key metrics for each role.

In [None]:
# Columns retained for the per-role analysis, split into smaller lists for
# readability. Concatenating them in this order reproduces the intended
# column order of the filtered frame.
_id_columns = ['participantid', 'matchid', 'championid', 'win']
_kill_columns = [
    'kills', 'deaths', 'assists', 'largestkillingspree', 'largestmultikill',
    'killingsprees', 'doublekills', 'triplekills', 'quadrakills', 'pentakills',
    'legendarykills',
]
_match_stat_columns = [
    'totdmgdealt', 'totdmgtochamp', 'dmgtoobj', 'visionscore', 'totdmgtaken',
    'goldearned', 'inhibkills', 'totminionskilled', 'neutralminionskilled',
]
_ward_columns = ['wardsbought', 'wardsplaced', 'wardskilled']
_trailing_columns = ['role_position', 'duration', 'dmgtoturrets']

columns_to_keep = (
    _id_columns
    + _kill_columns
    + _match_stat_columns
    + _ward_columns
    + _trailing_columns
)

# Narrow the validated participants down to the selected columns
filtered_data = valid_participants[columns_to_keep]

print(filtered_data.head())

Separate the dataset into subsets based on each role.

In [None]:
# One subset per role, selected on the derived 'role_position' label
role_labels = filtered_data['role_position']

top_laners_df = filtered_data[role_labels == 'TOP']
junglers_df = filtered_data[role_labels == 'JUNGLE']
mid_laners_df = filtered_data[role_labels == 'MID']
adc_df = filtered_data[role_labels == 'ADC']
supports_df = filtered_data[role_labels == 'SUPPORT']

# Report (rows, columns) for each subset
for _role_frame in (top_laners_df, junglers_df, mid_laners_df, adc_df, supports_df):
    print(_role_frame.shape)

More deaths mean worse performance, so the `deaths` column is negated.

In [None]:
# Define a function to make the 'deaths' column negative
def make_column_negative(df, column_name='deaths'):
    """Return a copy of ``df`` with ``column_name`` negated.

    The input frame is left untouched, so passing a filtered slice of another
    frame neither modifies that frame nor triggers pandas' chained-assignment
    warning.

    Args:
        df: Source DataFrame.
        column_name: Column to negate. If it is not present, the copy is
            returned unchanged.

    Returns:
        A new DataFrame with the negated column.
    """
    result = df.copy()
    if column_name in result.columns:
        result[column_name] = -result[column_name]
    return result

# Apply the function to each role-specific DataFrame
# Rebuild each role subset from filtered_data and negate its 'deaths' column
role_of_row = filtered_data['role_position']

top_laners_df = make_column_negative(filtered_data[role_of_row == 'TOP'])
junglers_df = make_column_negative(filtered_data[role_of_row == 'JUNGLE'])
mid_laners_df = make_column_negative(filtered_data[role_of_row == 'MID'])
adc_df = make_column_negative(filtered_data[role_of_row == 'ADC'])
supports_df = make_column_negative(filtered_data[role_of_row == 'SUPPORT'])

Transform the metrics into per-minute rates by dividing them by the match duration.

In [None]:
# match_duration_col = 'duration'  # Column that contains match duration (in seconds or minutes)

def divide_by_duration(df, metrics, suffix='_per_minute'):
    """Return a copy of ``df`` with per-duration versions of ``metrics``.

    The 'duration' column is divided by 60 (seconds to minutes, assuming the
    input is in seconds), and for every metric a new column named
    ``metric + suffix`` holds ``metric / duration``.

    The work is done on a copy: the caller's frame keeps its original
    'duration', so calling this again on the same input does not divide the
    duration by 60 a second time.

    Args:
        df: DataFrame containing 'duration' and every column in ``metrics``.
        metrics: Names of the columns to convert.
        suffix: Suffix appended to each metric name for the new column.

    Returns:
        A new DataFrame with the converted 'duration' and the added columns.
    """
    result = df.copy()
    # Convert match duration to minutes once, before it is used as a divisor
    result['duration'] = result['duration'] / 60
    for metric in metrics:
        result[metric + suffix] = result[metric] / result['duration']
    return result


def generate_dmg_per_kill(df, dmg_col='totdmgtochamp', kills_col='kills', new_feature_name='dmg_per_kill'):
    """Return a copy of ``df`` with a damage-per-kill column added.

    Rows with zero kills get NaN rather than +/-inf. Infinite values are not
    removed by ``dropna`` and would distort later correlation and min-max
    computations, whereas NaN is dropped or skipped by them.

    Args:
        df: Source DataFrame (not modified).
        dmg_col: Numerator column.
        kills_col: Denominator column.
        new_feature_name: Name of the column to create.

    Returns:
        A new DataFrame with ``new_feature_name`` added.
    """
    result = df.copy()
    # Mask zero denominators so the division yields NaN instead of inf
    safe_kills = result[kills_col].where(result[kills_col] != 0)
    result[new_feature_name] = result[dmg_col] / safe_kills
    return result

def generate_visionscore_per_wardplaced(df, visionscore_col='visionscore', wardsplaced_col='wardsplaced', new_feature_name='visionscore_per_wardplaced'):
    """Return a copy of ``df`` with a vision-score-per-ward-placed column.

    Rows with zero wards placed get NaN rather than +/-inf, so they can be
    removed with ``dropna`` instead of distorting later statistics.

    Args:
        df: Source DataFrame (not modified).
        visionscore_col: Numerator column.
        wardsplaced_col: Denominator column.
        new_feature_name: Name of the column to create.

    Returns:
        A new DataFrame with ``new_feature_name`` added.
    """
    result = df.copy()
    # Mask zero denominators so the division yields NaN instead of inf
    safe_wards = result[wardsplaced_col].where(result[wardsplaced_col] != 0)
    result[new_feature_name] = result[visionscore_col] / safe_wards
    return result

def fill_wardsbought_with_average(df):
    """Return a copy of ``df`` whose 'wardsbought' column is numeric and gap-free.

    Non-numeric entries (for example the literal string '\\N') are coerced to
    NaN, then every NaN is replaced with the mean of the remaining values.

    The result is assigned back to the column instead of calling
    ``replace``/``fillna`` with ``inplace=True`` on ``df['wardsbought']``:
    those chained in-place calls operate on a temporary object and do not
    update the frame when pandas Copy-on-Write is active.

    Args:
        df: DataFrame with a 'wardsbought' column (not modified).

    Returns:
        A new DataFrame with 'wardsbought' as float.
    """
    result = df.copy()

    # errors='coerce' turns every non-numeric entry, including '\N', into NaN
    wards = pd.to_numeric(result['wardsbought'], errors='coerce')

    # mean() ignores NaN, so the average is taken over the valid entries only
    result['wardsbought'] = wards.fillna(wards.mean()).astype(float)

    return result

Special handling for the Support role, because 'wardplaced' is stored as an object column (it cannot be divided by a float).

In [None]:
# Support Role
# Metrics converted to per-minute values for supports
support_metrics_to_divide = [
    'visionscore',
    'wardsplaced',
    'wardskilled',
    'wardsbought',
    'totdmgtaken',
    'kills',
    'deaths',
    'assists',
]
# Make 'wardsbought' numeric before it is divided by the duration
support_fill_nan_data = fill_wardsbought_with_average(supports_df)
# support_visionscore_per_wardplaced = generate_visionscore_per_wardplaced(support_fill_nan_data)

# print(support_visionscore_per_wardplaced['wardsbought'].head())

In [None]:
# for column in support_metrics_to_divide:
#     if column in support_visionscore_per_wardplaced.columns:
#         print(f"{column}: {support_visionscore_per_wardplaced[column].dtype}")
#     else:
#         print(f"{column}: Column not found in DataFrame")

In [None]:
# --- ADC ---
adc_metrics_to_divide = [
    'totdmgtochamp', 'totminionskilled', 'dmgtoobj', 'goldearned',
    'kills', 'deaths', 'assists', 'dmgtoturrets',
]
adc_divided = divide_by_duration(adc_df, adc_metrics_to_divide)

# --- Support (uses the frame whose 'wardsbought' was already made numeric) ---
support_divided = divide_by_duration(support_fill_nan_data, support_metrics_to_divide)

# --- Mid ---
mid_metrics_to_divide = [
    'totdmgtochamp', 'goldearned', 'visionscore', 'dmgtoobj', 'totdmgdealt',
    'totminionskilled', 'kills', 'deaths', 'assists',
]
mid_divided = divide_by_duration(mid_laners_df, mid_metrics_to_divide)

# --- Top (damage-per-kill is derived before the per-minute conversion) ---
top_metrics_to_divide = [
    'totdmgtaken', 'deaths', 'assists', 'goldearned',
    'totdmgtochamp', 'totminionskilled', 'totdmgdealt', 'kills',
]
top_damage_per_kill = generate_dmg_per_kill(top_laners_df)
top_divided = divide_by_duration(top_damage_per_kill, top_metrics_to_divide)

# --- Jungle ---
jungle_metrics_to_divide = [
    'neutralminionskilled', 'kills', 'assists', 'goldearned', 'visionscore',
    'totdmgdealt', 'dmgtoobj', 'totdmgtochamp', 'deaths',
]
jungle_divided = divide_by_duration(junglers_df, jungle_metrics_to_divide)

In [None]:
# print(mid_divided.head())
# (rows, columns) of each per-minute frame, in ADC/Jungle/Support/Mid/Top order
for _divided_frame in (adc_divided, jungle_divided, support_divided, mid_divided, top_divided):
    print(_divided_frame.shape)

Finding Feature Correlation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def visualize_correlation_matrix(df, features, role_name):
    """Plot a heatmap of the pairwise correlations between ``features``.

    Object-typed columns are reported (with their unique values) before all
    columns are coerced to numeric. Rows containing NaN or +/-inf in any
    feature are excluded: infinite values (for example from a division by
    zero) are not removed by ``dropna`` on their own and would turn the
    affected correlations into NaN.

    Args:
        df: DataFrame containing the feature columns.
        features: Column names to correlate.
        role_name: Label used in the warning message and the plot title.
    """
    # Restrict the frame to the requested feature columns
    metrics = df[features]

    # Check for columns with string values and report them
    non_numeric_columns = metrics.select_dtypes(include=['object']).columns
    if not non_numeric_columns.empty:
        print(f"Warning: The following columns in {role_name} contain non-numeric data:")
        print(non_numeric_columns)
        for col in non_numeric_columns:
            print(f"Non-numeric values in '{col}':")
            print(metrics[col].unique())

    # Convert all columns to numeric, forcing errors to NaN
    metrics = metrics.apply(pd.to_numeric, errors='coerce')

    # Treat infinities as missing, then drop incomplete rows
    metrics = metrics.replace([np.inf, -np.inf], np.nan).dropna()

    # Calculate correlation matrix
    correlation_matrix = metrics.corr()

    # Visualize the correlation matrix using a heatmap
    plt.figure(figsize=(12, 10))  # Larger figure leaves room for the long labels
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)

    # Set the title of the heatmap
    plt.title(f'Correlation Matrix for {role_name} Features')

    # Widen the left and bottom margins so the labels are not cut off
    plt.subplots_adjust(left=0.25, right=0.9, top=0.9, bottom=0.25)

    # Show the plot
    plt.show()


Plot the heatmap showing the correlation of the features (after first consideration based on team's domain knowledge)

In [None]:
# Candidate features per role, before highly correlated ones are removed
normalized_before_correlation_metrics = {
    "adc": [
        'totdmgtochamp_per_minute',
        'totminionskilled_per_minute',
        'goldearned_per_minute',
        'largestkillingspree',
        'largestmultikill',
        'dmgtoobj_per_minute',
        'kills_per_minute',
        'deaths_per_minute',
        'assists_per_minute',
        'dmgtoturrets_per_minute',
    ],
    "support": [
        'wardskilled_per_minute',
        'wardsplaced_per_minute',
        'visionscore_per_minute',
        'wardsbought_per_minute',
        'totdmgtaken_per_minute',
        'kills_per_minute',
        'deaths_per_minute',
        'assists_per_minute',
    ],
    "mid": [
        'totdmgtochamp_per_minute',
        'goldearned_per_minute',
        'largestkillingspree',
        'visionscore_per_minute',
        'totdmgdealt_per_minute',
        'dmgtoobj_per_minute',
        'totminionskilled_per_minute',
        'kills_per_minute',
        'deaths_per_minute',
        'assists_per_minute',
    ],
    "top": [
        'totdmgtaken_per_minute',
        'deaths_per_minute',
        'dmg_per_kill',
        'goldearned_per_minute',
        'totdmgtochamp_per_minute',
        'assists_per_minute',
        'totminionskilled_per_minute',
        'totdmgdealt_per_minute',
        'kills_per_minute',
        'largestkillingspree',
    ],
    "jungle": [
        'neutralminionskilled_per_minute',
        'kills_per_minute',
        'assists_per_minute',
        'goldearned_per_minute',
        'visionscore_per_minute',
        'totdmgdealt_per_minute',
        'dmgtoobj_per_minute',
        'totdmgtochamp_per_minute',
        'killingsprees',
        'largestmultikill',
        'deaths_per_minute',
    ],
}

# One heatmap per role: (dictionary key, per-minute frame, title label)
for _role_key, _role_frame, _role_label in (
    ('adc', adc_divided, 'ADC'),
    ('support', support_divided, 'Support'),
    ('mid', mid_divided, 'Mid'),
    ('top', top_divided, 'Top'),
    ('jungle', jungle_divided, 'Jungle'),
):
    visualize_correlation_matrix(_role_frame, normalized_before_correlation_metrics[_role_key], _role_label)

Remove the feature pairs having high correlation and plot the heatmap

In [None]:
# Preview the first five ADC rows after the per-minute conversion
print(adc_divided.head(5))

In [None]:
# Features kept per role after the highly correlated ones were removed
normalized_after_correlation_metrics = {
    "adc": [
        'totdmgtochamp_per_minute',
        'totminionskilled_per_minute',
        'largestkillingspree',
        'largestmultikill',
        'dmgtoobj_per_minute',
        'deaths_per_minute',
        'assists_per_minute',
        'dmgtoturrets_per_minute',
    ],
    "support": [
        'wardskilled_per_minute',
        'wardsplaced_per_minute',
        'visionscore_per_minute',
        'wardsbought_per_minute',
        'totdmgtaken_per_minute',
        'kills_per_minute',
        'deaths_per_minute',
        'assists_per_minute',
    ],
    "mid": [
        'totdmgtochamp_per_minute',
        'visionscore_per_minute',
        'totdmgdealt_per_minute',
        'dmgtoobj_per_minute',
        'totminionskilled_per_minute',
        'kills_per_minute',
        'deaths_per_minute',
        'assists_per_minute',
    ],
    "top": [
        'totdmgtaken_per_minute',
        'deaths_per_minute',
        'dmg_per_kill',
        'totdmgtochamp_per_minute',
        'assists_per_minute',
        'totdmgdealt_per_minute',
        'kills_per_minute',
        'totminionskilled_per_minute',
    ],
    "jungle": [
        'neutralminionskilled_per_minute',
        'kills_per_minute',
        'assists_per_minute',
        'visionscore_per_minute',
        'totdmgdealt_per_minute',
        'dmgtoobj_per_minute',
        'totdmgtochamp_per_minute',
        'largestmultikill',
        'deaths_per_minute',
    ],
}

# One heatmap per role: (dictionary key, per-minute frame, title label)
for _role_key, _role_frame, _role_label in (
    ('adc', adc_divided, 'ADC'),
    ('support', support_divided, 'Support'),
    ('mid', mid_divided, 'Mid'),
    ('top', top_divided, 'Top'),
    ('jungle', jungle_divided, 'Jungle'),
):
    visualize_correlation_matrix(_role_frame, normalized_after_correlation_metrics[_role_key], _role_label)

In [None]:
# Row count of each per-minute frame
for _label, _divided_frame in (
    ("ADC", adc_divided),
    ("Support", support_divided),
    ("Mid", mid_divided),
    ("Top", top_divided),
    ("Jungle", jungle_divided),
):
    print(_label, len(_divided_frame))

In [None]:
# Preview the first five ADC match durations after the conversion
print(adc_divided['duration'].head(5))

In [None]:
# Plot colour for each cluster label; the position in the list is the label
color_for_plot = dict(enumerate([
    'orange',
    'red',
    'blue',
    'green',
    'purple',
    'brown',
    'pink',
    'gray',
]))

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import itertools
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.stats import zscore

# Inertia = measure of cluster compactness
# lower values = data points are closer to their centroids
# meaning clusters are more compact, well-defined.

# In the elbow method, plot the inertia values against the number of clusters 𝑘
# As k increases, inertia decreases 
# adding more clusters reduces the average distance between data points and their cluster centers.

# The elbow point = the point where the inertia starts decreasing at a slower rate, 
# forming an elbow in the plot.
# The elbow point = Ideal number of clusters because 
# Balances compact clusters + avoiding over-segmentation

def plot_elbow_silhouette(df, features, role):
    """Plot KMeans inertia (elbow) and silhouette score for k = 2..10.

    The features are standardised once and the SAME scaled matrix is used both
    to fit KMeans and to compute the silhouette score. Fitting on the raw
    columns would let the feature with the largest numeric range dominate the
    Euclidean distances.

    Args:
        df: DataFrame holding the feature columns.
        features: Column names used for clustering.
        role: Role name used in the plot titles.
    """
    max_clusters = 10

    # silhouette_score needs 2 <= number of labels <= n_samples - 1
    largest_k = min(max_clusters, len(df) - 1)
    if largest_k < 2:
        print(f"Not enough samples ({len(df)}) for elbow/silhouette analysis of role: {role}")
        return

    X_scaled = StandardScaler().fit_transform(df[features])
    # X_scaled = MinMaxScaler().fit_transform(df[features])

    inertia_values, silhouette_values = [], []
    # Cluster counts from 2 up to largest_k (inclusive)
    cluster_range = range(2, largest_k + 1)

    for k in cluster_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X_scaled)
        inertia_values.append(kmeans.inertia_)
        silhouette_values.append(silhouette_score(X_scaled, labels))

    # Elbow plot
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(cluster_range, inertia_values, 'bo-')
    plt.title(f'{role.capitalize()} Elbow Plot')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia (within cluster sum of squares)')

    # Silhouette plot
    plt.subplot(1, 2, 2)
    plt.plot(cluster_range, silhouette_values, 'go-')
    plt.title(f'{role.capitalize()} Silhouette Score Plot')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')

    plt.suptitle(f'Elbow and Silhouette Analysis for {role.capitalize()} Role')
    plt.show()
    # plt.savefig(f"./Isolation-forest-getInsight/Elbow-silhouette/{role}_elbow_silhouette_plot.png", dpi=100)


In [None]:
def isolation_forest_plot_elbow_silhouette(role, data_role):
    """Flag anomalies for one role with an Isolation Forest and plot diagnostics.

    Rows whose selected features are non-numeric, NaN or infinite are dropped
    before fitting. The remaining rows receive an 'anomaly' column holding the
    forest's prediction (-1 marks an anomaly), and the elbow/silhouette and
    normal-vs-anomaly plots are drawn.

    Explicit copies are used wherever a filtered frame is later written to, so
    neither ``data_role`` nor the returned frames share data with a parent
    frame, and no chained-assignment warnings are raised here or by callers
    that add columns to the results.

    Args:
        role: Key into ``normalized_after_correlation_metrics``.
        data_role: Per-role DataFrame with the feature columns.

    Returns:
        Tuple ``(anomaly_data, normal_data)`` of independent DataFrames.
    """
    print(f"Processing role: {role}")

    # Drop unnecessary columns
    df = data_role.drop(columns=['wardsbought', 'role_position'], errors='ignore')

    # Select features and keep only rows that are fully numeric and finite
    features = normalized_after_correlation_metrics[role]
    X = df[features].apply(pd.to_numeric, errors='coerce')
    X = X.replace([np.inf, -np.inf], np.nan).dropna()
    # Clamp extreme values before scaling
    X = X.clip(lower=-1e5, upper=1e5)
    df = df.loc[X.index].copy()

    # Scale data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Isolation Forest: fit_predict returns -1 for anomalies
    iso_forest = IsolationForest(contamination=0.01, random_state=42)
    df['anomaly'] = iso_forest.fit_predict(X_scaled)
    anomaly_data = df[df['anomaly'] == -1].copy()
    normal_data = df[df['anomaly'] != -1].copy()

    # Plot elbow and silhouette for anomaly data
    plot_elbow_silhouette(anomaly_data, features, role)

    # Plot normal vs anomaly data
    plot_normal_vs_anomaly(normal_data, anomaly_data, features, role)

    return anomaly_data, normal_data


def plot_normal_vs_anomaly(normal_data, anomaly_data, features, role, alpha_normal=0.2, alpha_anomaly=0.2):
    """
    Plots normal (blue) and anomaly (red) data points for all feature combinations.

    Args:
        normal_data (DataFrame): Data classified as normal.
        anomaly_data (DataFrame): Data classified as anomalies.
        features (list): List of feature columns.
        role (str): Role name for the plot title.
        alpha_normal (float): Transparency level for normal data points (default: 0.2).
        alpha_anomaly (float): Transparency level for anomaly data points (default: 0.2).
    """
    import itertools

    # Generate feature combinations for scatter plots
    feature_combinations = list(itertools.combinations(features, 2))
    if not feature_combinations:
        # Fewer than two features: there is no pair to plot
        print(f"Need at least two features to plot normal vs anomaly data for role: {role}")
        return

    n_cols = 4
    n_rows = (len(feature_combinations) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 3))
    axes = axes.flatten()

    for ax, (feature_x, feature_y) in zip(axes, feature_combinations):
        # Plot normal data points (blue)
        ax.scatter(
            normal_data[feature_x],
            normal_data[feature_y],
            c='blue',
            label='Normal',
            alpha=alpha_normal
        )

        # Plot anomaly data points (red)
        ax.scatter(
            anomaly_data[feature_x],
            anomaly_data[feature_y],
            c='red',
            label='Anomaly',
            alpha=alpha_anomaly
        )

        # Add labels and title
        ax.set_title(f'{feature_x} vs {feature_y}', fontsize=8)
        ax.set_xlabel(feature_x, fontsize=8)
        ax.set_ylabel(feature_y, fontsize=8)

    # Remove extra subplots if any
    for unused_ax in axes[len(feature_combinations):]:
        fig.delaxes(unused_ax)

    # Adjust layout, leaving room at the top for the figure title
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    fig.suptitle(f'Normal vs Anomaly Plot for {role.capitalize()} Role', fontsize=16)
    # The legend goes on the last populated subplot
    axes[len(feature_combinations) - 1].legend(loc='upper right')
    plt.show()




In [None]:
# Run the Isolation Forest per role; each call returns (anomalies, normal rows)
adc_include_anomaly, adc_normal = isolation_forest_plot_elbow_silhouette('adc', adc_divided)
support_include_anomaly, support_normal = isolation_forest_plot_elbow_silhouette('support', support_divided)
mid_include_anomaly, mid_normal = isolation_forest_plot_elbow_silhouette('mid', mid_divided)
top_include_anomaly, top_normal = isolation_forest_plot_elbow_silhouette('top', top_divided)
jungle_include_anomaly, jungle_normal = isolation_forest_plot_elbow_silhouette('jungle', jungle_divided)

In [None]:
def analyze_outliers_with_kmeans(outliers_df, features, role, n_clusters):
    """Cluster the outlier rows with KMeans and plot the resulting clusters.

    The features are standardised before clustering. The labels are written to
    a 'kmeans_cluster' column on a copy of ``outliers_df``, so the caller's
    frame (typically a filtered slice) is not modified and no
    chained-assignment warning is raised.

    Args:
        outliers_df: DataFrame of outlier rows containing ``features``.
        features: Column names used for clustering and plotting.
        role: Role name used in the plot titles.
        n_clusters: Number of KMeans clusters.

    Returns:
        A new DataFrame equal to ``outliers_df`` plus 'kmeans_cluster'.
    """
    clustered = outliers_df.copy()

    scaler = StandardScaler()
    X_outliers_scaled = scaler.fit_transform(clustered[features])

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clustered['kmeans_cluster'] = kmeans.fit_predict(X_outliers_scaled)

    title = f"{role.capitalize()} KMeans Cluster Analysis with {n_clusters} Clusters"

    # Overview of all clusters, then one figure per cluster
    plot_clusters(clustered, features, title)
    for cluster in clustered['kmeans_cluster'].unique():
        plot_each_cluster(clustered, features, cluster, title)

    return clustered

def plot_clusters(df, features, title):
    """Scatter every pair of ``features`` on a subplot grid, coloured by cluster.

    Args:
        df: DataFrame with the feature columns and a 'kmeans_cluster' column.
        features: Column names to pair up.
        title: Figure title.
    """
    pairs = list(itertools.combinations(features, 2))
    columns_per_row = 4
    row_count = (len(pairs) + columns_per_row - 1) // columns_per_row
    fig, axes = plt.subplots(row_count, columns_per_row, figsize=(18, row_count * 3))
    axes = axes.flatten()

    cluster_labels = df['kmeans_cluster']
    for ax, (feature_x, feature_y) in zip(axes, pairs):
        for cluster in cluster_labels.unique():
            members = df[cluster_labels == cluster]
            ax.scatter(
                members[feature_x],
                members[feature_y],
                label=f'Cluster {cluster}',
                color=color_for_plot.get(cluster, 'gray'),  # 'gray' when no colour is defined
                alpha=0.5,
            )
        ax.set_title(f'{feature_x} vs {feature_y}', fontsize=8)
        ax.set_xlabel(feature_x, fontsize=8)
        ax.set_ylabel(feature_y, fontsize=8)

    # Drop the grid cells that received no feature pair
    for spare_ax in axes[len(pairs):]:
        fig.delaxes(spare_ax)

    fig.suptitle(title, fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.legend()
    plt.show()

def plot_each_cluster(df, features, cluster, title):
    """Scatter every pair of ``features`` for the rows of a single cluster.

    Prints the cluster size, then draws one subplot per feature pair using the
    cluster's colour.

    Args:
        df: DataFrame with the feature columns and a 'kmeans_cluster' column.
        features: Column names to pair up.
        cluster: Cluster label to plot.
        title: Base figure title; the cluster label is appended.
    """
    members = df[df['kmeans_cluster'] == cluster]
    print(f"Cluster {cluster}: {len(members)} data points")

    pairs = list(itertools.combinations(features, 2))
    columns_per_row = 4
    row_count = (len(pairs) + columns_per_row - 1) // columns_per_row
    fig, axes = plt.subplots(row_count, columns_per_row, figsize=(18, row_count * 3))
    axes = axes.flatten()

    # Same colour on every subplot; 'gray' when no colour is defined
    point_color = color_for_plot.get(cluster, 'gray')
    for ax, (feature_x, feature_y) in zip(axes, pairs):
        ax.scatter(members[feature_x], members[feature_y], color=point_color, alpha=0.5)
        ax.set_title(f'{feature_x} vs {feature_y}', fontsize=8)
        ax.set_xlabel(feature_x, fontsize=8)
        ax.set_ylabel(feature_y, fontsize=8)

    # Remove unused subplots
    for spare_ax in axes[len(pairs):]:
        fig.delaxes(spare_ax)

    fig.suptitle(f'{title} - Cluster {cluster}', fontsize=16)
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()


In [None]:
# role_cluster_counts = {'adc': 3, 'support': 6 , 'mid': 5, 'top': 5, 'jungle': 5}

# adc_cluster_counts = 3
# support_cluster_counts = 6
# mid_cluster_counts =5
# top_cluster_counts = 5
# jungle_cluster_counts = 5

# Number of KMeans clusters used for each role's anomalies
adc_cluster_counts = 4
support_cluster_counts = 4
mid_cluster_counts = 4
top_cluster_counts = 5
jungle_cluster_counts = 5

# NOTE(review): these calls cluster on the *before*-correlation feature lists,
# while the Isolation Forest step selected rows using the *after*-correlation
# lists -- confirm this is intended.
adc_outlier_kmean = analyze_outliers_with_kmeans(
    adc_include_anomaly, normalized_before_correlation_metrics['adc'], 'adc', adc_cluster_counts
)
support_outlier_kmean = analyze_outliers_with_kmeans(
    support_include_anomaly, normalized_before_correlation_metrics['support'], 'support', support_cluster_counts
)
mid_outlier_kmean = analyze_outliers_with_kmeans(
    mid_include_anomaly, normalized_before_correlation_metrics['mid'], 'mid', mid_cluster_counts
)
top_outlier_kmean = analyze_outliers_with_kmeans(
    top_include_anomaly, normalized_before_correlation_metrics['top'], 'top', top_cluster_counts
)
jungle_outlier_kmean = analyze_outliers_with_kmeans(
    jungle_include_anomaly, normalized_before_correlation_metrics['jungle'], 'jungle', jungle_cluster_counts
)

# print(len(adc_outlier_kmean))


In [None]:
def normalize_features(df, feature_cols):
    """
    Normalize features to the range [0, 1] (min-max scaling).

    A column whose minimum equals its maximum has no spread; dividing by the
    zero range would turn every value into NaN. Such a column is mapped to 0
    instead (existing NaN entries stay NaN).

    Parameters:
        df (pd.DataFrame): The input dataframe (not modified).
        feature_cols (list): List of columns to normalize.

    Returns:
        pd.DataFrame: A copy of the dataframe with normalized features.
    """
    df_normalized = df.copy()
    for col in feature_cols:
        min_val = df[col].min()
        value_range = df[col].max() - min_val
        shifted = df[col] - min_val
        if value_range == 0:
            # Constant column: every non-missing value equals min_val, so shifted is all 0
            df_normalized[col] = shifted
        else:
            df_normalized[col] = shifted / value_range
    return df_normalized

In [None]:
def zscore_plot_normalized(df, cluster_col, feature_cols, role):
    """Scatter the combined normalized score of every record, coloured by cluster.

    Each feature is normalized with ``normalize_features`` and the normalized
    values are summed per row. The x axis is the record position after the
    index has been reset.

    Args:
        df: DataFrame with the feature columns and the cluster column.
        cluster_col: Name of the column holding the cluster labels.
        feature_cols: Columns that contribute to the combined score.
        role: Role name used in the plot title.
    """
    # Normalize, then add the per-row sum of the normalized features
    scaled = normalize_features(df, feature_cols)
    scaled['normalized_score_combined'] = scaled[feature_cols].sum(axis=1)

    # Positional 0..n-1 index on both frames so they line up for plotting
    df = df.reset_index(drop=True)
    scaled = scaled.reset_index(drop=True)
    combined_score = scaled['normalized_score_combined']

    plt.figure(figsize=(12, 8))

    # One scatter series per cluster, in order of first appearance
    for cluster in df[cluster_col].unique():
        member_positions = df[df[cluster_col] == cluster].index
        plt.scatter(
            member_positions,
            combined_score.loc[member_positions],
            label=f'Cluster {cluster}',
            color=color_for_plot.get(cluster, 'gray'),  # 'gray' when no colour is defined
            alpha=0.7
        )

    plt.title(f'Normalized Metrics Plot by Cluster {role.capitalize()}')
    plt.xlabel('Record Index')
    plt.ylabel('Normalized Combined Score')
    plt.legend()
    # plt.savefig(f"action/initial-plot/cross-validation/plot/{role}_normalized_zscore.png", dpi=100)

    plt.show()



In [None]:
# Number of clustered anomaly rows for ADC and Mid
for _outlier_frame in (adc_outlier_kmean, mid_outlier_kmean):
    print(len(_outlier_frame))

In [None]:
# Combined-score scatter per role: (clustered anomalies, feature key, title label)
for _outlier_frame, _role_key, _role_label in (
    (adc_outlier_kmean, 'adc', 'ADC'),
    (support_outlier_kmean, 'support', 'Support'),
    (mid_outlier_kmean, 'mid', 'Mid'),
    (top_outlier_kmean, 'top', 'Top'),
    (jungle_outlier_kmean, 'jungle', 'Jungle'),
):
    zscore_plot_normalized(
        _outlier_frame,
        'kmeans_cluster',
        normalized_after_correlation_metrics[_role_key],
        _role_label,
    )

In [None]:
def process_role_data(df, feature_cols, thresholds=None):
    """Plot the distribution of the combined normalized score for one role.

    Features are min-max normalized and summed per row. Rows with a missing
    feature value are excluded. When ``thresholds`` is given, each value ``t``
    marks the line ``mean + t * std`` and the rows above it are counted as
    outliers.

    Args:
        df: DataFrame containing ``feature_cols``.
        feature_cols: Columns that contribute to the combined score.
        thresholds: Optional iterable of standard-deviation multipliers.

    Returns:
        pd.Series: The combined normalized score of the retained rows (empty
        when no row has complete feature values).
    """
    df_normalized = normalize_features(df, feature_cols)

    print(df_normalized.head())

    # Combine normalized scores by summing across features
    df_normalized['normalized_score_combined'] = df_normalized[feature_cols].sum(axis=1)

    # Drop rows with NaN values
    df_normalized = df_normalized.dropna(subset=feature_cols)

    overall_performance = df_normalized['normalized_score_combined']

    # With no rows left, min()/max() are NaN and the histogram cannot be built
    if overall_performance.empty:
        print("No rows with complete feature values, skipping performance distribution plot.")
        return overall_performance

    # Create histogram
    bins = np.linspace(overall_performance.min(), overall_performance.max(), 50)
    counts, edges = np.histogram(overall_performance, bins=bins)

    # Plot the histogram
    plt.figure(figsize=(10, 6))
    plt.bar(edges[:-1], counts, width=np.diff(edges), color='skyblue', edgecolor='black')

    # Process thresholds if provided
    outliers_count = {}
    if thresholds:
        for idx, threshold in enumerate(thresholds):
            threshold_value = overall_performance.mean() + threshold * overall_performance.std()

            # Identify outliers
            outliers = df_normalized[overall_performance > threshold_value]
            outliers_count[threshold] = len(outliers)

            # Plot the threshold line
            plt.axvline(threshold_value, color='red', linestyle='dashed', linewidth=1, label=f'Threshold {threshold}')
            vertical_offset = 0.9 - idx * 0.1  # Adjust vertical position for each threshold
            plt.text(threshold_value, max(counts) * vertical_offset, f'Threshold {threshold}', color='red', rotation=45, ha='right')

        # Annotate the number of outliers for each threshold
        annotation_text = "\n".join([f"Threshold {threshold}: {count} outliers" for threshold, count in outliers_count.items()])
        plt.text(overall_performance.max() * 0.95, max(counts) * 0.8, annotation_text, fontsize=10,
                 bbox=dict(facecolor='white', alpha=0.7), ha='right', va='top')

        # Print outliers for each threshold
        for threshold, count in outliers_count.items():
            print(f"Outliers at threshold {threshold}: {count} outliers")
            print(f"It is about {count / len(df) * 100:.2f}% of the total data")
    else:
        print("No thresholds provided, skipping outlier calculation.")

    # Add labels and title
    plt.xlabel('Overall Performance Score')
    plt.ylabel('Frequency')
    plt.title(f'Performance Distribution {("(Thresholds: " + str(thresholds) + ")") if thresholds else ""}')
    # Only the threshold lines carry labels, so a legend is drawn only with thresholds
    if thresholds:
        plt.legend(loc='upper right')

    plt.show()

    return overall_performance


# Standard-deviation multipliers used to mark outliers above the mean score
thresholds = [2, 2.5, 3, 3.5]

# Score distribution of every role's full per-minute frame
process_role_data(adc_divided, normalized_after_correlation_metrics['adc'], thresholds=thresholds)
process_role_data(support_divided, normalized_after_correlation_metrics['support'], thresholds=thresholds)
process_role_data(mid_divided, normalized_after_correlation_metrics['mid'], thresholds=thresholds)
process_role_data(top_divided, normalized_after_correlation_metrics['top'], thresholds=thresholds)
# Left as a bare final expression so the notebook still displays its result
process_role_data(jungle_divided, normalized_after_correlation_metrics['jungle'], thresholds=thresholds)

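Plot the distribution of the combined normalized score, first for the anomalies detected by the isolation forest (after KMeans clustering) and then for the data it classified as normal.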

In [None]:
# Score distribution of the clustered anomalies, no thresholds
process_role_data(adc_outlier_kmean, feature_cols=normalized_after_correlation_metrics['adc'])
process_role_data(support_outlier_kmean, feature_cols=normalized_after_correlation_metrics['support'])
process_role_data(mid_outlier_kmean, feature_cols=normalized_after_correlation_metrics['mid'])
process_role_data(top_outlier_kmean, feature_cols=normalized_after_correlation_metrics['top'])
# Left as a bare final expression so the notebook still displays its result
process_role_data(jungle_outlier_kmean, feature_cols=normalized_after_correlation_metrics['jungle'])

In [None]:
# Score distribution of the rows the Isolation Forest kept as normal, no thresholds
process_role_data(adc_normal, feature_cols=normalized_after_correlation_metrics['adc'])
process_role_data(support_normal, feature_cols=normalized_after_correlation_metrics['support'])
process_role_data(mid_normal, feature_cols=normalized_after_correlation_metrics['mid'])
process_role_data(top_normal, feature_cols=normalized_after_correlation_metrics['top'])
# Left as a bare final expression so the notebook still displays its result
process_role_data(jungle_normal, feature_cols=normalized_after_correlation_metrics['jungle'])