In [7]:
# Standard Libraries
import os  # For file and directory handling
import datetime as dt  # For working with dates and times
from math import radians, sin, cos, sqrt, atan2  # For mathematical computations

# Data Manipulation and Computation Libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical computations

# Visualization Libraries
import matplotlib.pyplot as plt  # For creating plots
import seaborn as sns  # For advanced visualization
import folium  # For geospatial mapping

# Machine Learning and Preprocessing Libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For scaling and encoding
from sklearn.decomposition import PCA  # For Principal Component Analysis
from sklearn.cluster import KMeans  # For clustering
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.svm import SVC  # Support Vector Machine model
from xgboost import XGBClassifier  # XGBoost model

# Time Series Analysis Libraries
from statsmodels.tsa.arima.model import ARIMA  # For ARIMA forecasting
from statsmodels.tsa.holtwinters import ExponentialSmoothing  # For time series smoothing
from pmdarima import auto_arima  # For automatic ARIMA model selection

# Evaluation Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # For model evaluation


In [8]:
# File path to the dataset
matches_file_path = "../FIFA_datasets/laliga2023_34/team_stats/Laliga_matches_23_24.csv"

# Verify if the file exists
if os.path.exists(matches_file_path):
    # Try reading the file with a different encoding
    try:
        matches_df = pd.read_csv(matches_file_path, encoding='latin1')
        print(matches_df.head())
    except UnicodeDecodeError:
        print("Encoding issue. Try using a different encoding, such as 'cp1252'.")
else:
    print("File not found. Please check the file path.")



   Round  Round Name  Match ID              UTC Time      Home Team  xG_home  \
0      1           1   4205343  2023-08-11T17:30:00Z        Almeria      1.4   
1      1           1   4205347  2023-08-11T20:00:00Z        Sevilla      0.7   
2      1           1   4205351  2023-08-12T15:00:00Z  Real Sociedad      1.0   
3      1           1   4205348  2023-08-12T17:30:00Z     Las Palmas      0.9   
4      1           1   4205344  2023-08-12T19:30:00Z  Athletic Club      0.4   

   home_goals  away_goals  xG_away       Away Team  ...              Referee  \
0           0           2      2.1  Rayo Vallecano  ...      Javier Alberola   
1           1           2      1.1        Valencia  ...         Jos Snchez   
2           1           1      0.8          Girona  ...  Francisco Hernndez   
3           1           1      1.6        Mallorca  ...        Vctor Garca   
4           0           2      0.9     Real Madrid  ...            Jess Gil   

    Match Outcome goal_total xg_goal_t

In [9]:
matches_df.head()

Unnamed: 0,Round,Round Name,Match ID,UTC Time,Home Team,xG_home,home_goals,away_goals,xG_away,Away Team,...,Referee,Match Outcome,goal_total,xg_goal_total,home_xg_diff,away_xg_diff,home_goal_diff,away_goal_diff,home_pts_earned,away_pts_earned
0,1,1,4205343,2023-08-11T17:30:00Z,Almeria,1.4,0,2,2.1,Rayo Vallecano,...,Javier Alberola,Away Team Wins,2,3.5,-1.4,-0.1,-2,2,0,3
1,1,1,4205347,2023-08-11T20:00:00Z,Sevilla,0.7,1,2,1.1,Valencia,...,Jos Snchez,Away Team Wins,3,1.8,0.3,0.9,-1,1,0,3
2,1,1,4205351,2023-08-12T15:00:00Z,Real Sociedad,1.0,1,1,0.8,Girona,...,Francisco Hernndez,Tied Game,2,1.8,0.0,0.2,0,0,1,1
3,1,1,4205348,2023-08-12T17:30:00Z,Las Palmas,0.9,1,1,1.6,Mallorca,...,Vctor Garca,Tied Game,2,2.5,0.1,-0.6,0,0,1,1
4,1,1,4205344,2023-08-12T19:30:00Z,Athletic Club,0.4,0,2,0.9,Real Madrid,...,Jess Gil,Away Team Wins,2,1.3,-0.4,1.1,-2,2,0,3


In [10]:
matches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Round            380 non-null    int64  
 1   Round Name       380 non-null    int64  
 2   Match ID         380 non-null    int64  
 3   UTC Time         380 non-null    object 
 4   Home Team        380 non-null    object 
 5   xG_home          380 non-null    float64
 6   home_goals       380 non-null    int64  
 7   away_goals       380 non-null    int64  
 8   xG_away          380 non-null    float64
 9   Away Team        380 non-null    object 
 10  Attendance       380 non-null    int64  
 11  Venue            380 non-null    object 
 12  Referee          380 non-null    object 
 13  Match Outcome    380 non-null    object 
 14  goal_total       380 non-null    int64  
 15  xg_goal_total    380 non-null    float64
 16  home_xg_diff     380 non-null    float64
 17  away_xg_diff    

In [11]:
# Summary statistics of numerical columns
print("Summary statistics of numerical columns")
matches_df.describe()


Summary statistics of numerical columns


Unnamed: 0,Round,Round Name,Match ID,xG_home,home_goals,away_goals,xG_away,Attendance,goal_total,xg_goal_total,home_xg_diff,away_xg_diff,home_goal_diff,away_goal_diff,home_pts_earned,away_pts_earned
count,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0
mean,19.5,19.5,4205532.0,1.484474,1.484211,1.160526,1.120526,29019.768421,2.644737,2.605,-0.000263,0.04,0.323684,-0.323684,1.6,1.118421
std,10.980313,10.980313,109.8408,0.811003,1.310507,1.122107,0.694,17406.285968,1.772734,1.04896,1.104405,0.910872,1.67646,1.67646,1.296637,1.243103
min,1.0,1.0,4205343.0,0.1,0.0,0.0,0.0,7558.0,0.0,0.3,-4.5,-2.5,-7.0,-7.0,0.0,0.0
25%,10.0,10.0,4205438.0,0.9,1.0,0.0,0.6,16023.5,1.0,1.9,-0.8,-0.6,-1.0,-1.0,0.0,0.0
50%,19.5,19.5,4205532.0,1.4,1.0,1.0,1.0,20455.0,2.0,2.5,-0.1,-0.1,0.0,0.0,1.0,1.0
75%,29.0,29.0,4205627.0,1.9,2.0,2.0,1.6,42497.25,4.0,3.3,0.7,0.6,1.0,1.0,3.0,3.0
max,38.0,38.0,4205722.0,4.6,7.0,7.0,4.5,77981.0,8.0,6.7,3.3,3.6,7.0,7.0,3.0,3.0


In [13]:
def calculate_comprehensive_league_table(matches_df):
    # Create a dataframe to track comprehensive stats for each team
    teams = pd.concat([matches_df['Home Team'], matches_df['Away Team']]).unique()
    league_table = pd.DataFrame({
        'Team': teams,
        'Played': 0,
        'Wins': 0,
        'Draws': 0,
        'Losses': 0,
        'Goals For': 0,
        'Goals Against': 0,
        'Goal Difference': 0,
        'Points': 0,
        'xG Total': 0,
        'xG Difference': 0,
        'Home Wins': 0,
        'Away Wins': 0,
        'Home Draws': 0,
        'Away Draws': 0,
        'Home Losses': 0,
        'Away Losses': 0
    })
    league_table.set_index('Team', inplace=True)

    # Iterate through each match and update the league table
    for _, match in matches_df.iterrows():
        home_team = match['Home Team']
        away_team = match['Away Team']
        home_score = match['home_goals']
        away_score = match['away_goals']
        home_xg = match['xG_home']
        away_xg = match['xG_away']

        # Update Played
        league_table.loc[home_team, 'Played'] += 1
        league_table.loc[away_team, 'Played'] += 1

        # Update Goals For and Against
        league_table.loc[home_team, 'Goals For'] += home_score
        league_table.loc[home_team, 'Goals Against'] += away_score
        league_table.loc[away_team, 'Goals For'] += away_score
        league_table.loc[away_team, 'Goals Against'] += home_score

        # Update Goal Difference
        league_table.loc[home_team, 'Goal Difference'] = (
            league_table.loc[home_team, 'Goals For'] - league_table.loc[home_team, 'Goals Against']
        )
        league_table.loc[away_team, 'Goal Difference'] = (
            league_table.loc[away_team, 'Goals For'] - league_table.loc[away_team, 'Goals Against']
        )

        # Update xG Total and Difference
        league_table.loc[home_team, 'xG Total'] += home_xg
        league_table.loc[away_team, 'xG Total'] += away_xg
        league_table.loc[home_team, 'xG Difference'] += (home_xg - away_xg)
        league_table.loc[away_team, 'xG Difference'] += (away_xg - home_xg)

        # Update Points, Wins, Draws, Losses
        if home_score > away_score:
            league_table.loc[home_team, 'Wins'] += 1
            league_table.loc[home_team, 'Points'] += 3
            league_table.loc[home_team, 'Home Wins'] += 1
            league_table.loc[away_team, 'Losses'] += 1
            league_table.loc[away_team, 'Away Losses'] += 1
        elif home_score < away_score:
            league_table.loc[away_team, 'Wins'] += 1
            league_table.loc[away_team, 'Points'] += 3
            league_table.loc[away_team, 'Away Wins'] += 1
            league_table.loc[home_team, 'Losses'] += 1
            league_table.loc[home_team, 'Home Losses'] += 1
        else:
            league_table.loc[home_team, 'Draws'] += 1
            league_table.loc[away_team, 'Draws'] += 1
            league_table.loc[home_team, 'Points'] += 1
            league_table.loc[away_team, 'Points'] += 1
            league_table.loc[home_team, 'Home Draws'] += 1
            league_table.loc[away_team, 'Away Draws'] += 1

    # Sort by Points, Goal Difference, and Goals For
    league_table.sort_values(
        by=['Points', 'Goal Difference', 'Goals For'],
        ascending=[False, False, False],
        inplace=True
    )

    # Add Rank as the index
    league_table.reset_index(inplace=True)
    league_table.index += 1  # Start ranking from 1
    league_table.rename_axis("Rank", inplace=True)

    return league_table

# Generate the comprehensive league table
league_table = calculate_comprehensive_league_table(matches_df)

from IPython.display import display

# Display the league table
display(league_table)


  league_table.loc[home_team, 'xG Total'] += home_xg
  league_table.loc[home_team, 'xG Difference'] += (home_xg - away_xg)


Unnamed: 0_level_0,Team,Played,Wins,Draws,Losses,Goals For,Goals Against,Goal Difference,Points,xG Total,xG Difference,Home Wins,Away Wins,Home Draws,Away Draws,Home Losses,Away Losses
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Real Madrid,38,29,8,1,87,26,61,95,69.7,33.6,16,13,3,5,0,1
2,Barcelona,38,26,7,5,79,44,35,85,78.0,37.2,15,11,1,6,3,2
3,Girona,38,25,6,7,85,46,39,81,68.9,17.0,15,10,2,4,2,5
4,Atletico Madrid,38,24,4,10,70,43,27,76,62.4,24.2,16,8,1,3,2,8
5,Athletic Club,38,19,11,8,61,37,24,68,52.7,10.6,12,7,6,5,1,7
6,Real Sociedad,38,16,12,10,51,39,12,60,47.0,4.2,8,8,6,6,5,5
7,Real Betis,38,14,15,9,48,45,3,57,43.9,-9.5,9,5,7,8,3,6
8,Villarreal,38,14,11,13,65,65,0,53,55.3,-3.6,7,7,5,6,7,6
9,Valencia,38,13,10,15,40,45,-5,49,41.2,-3.8,8,5,6,4,5,10
10,Deportivo Alaves,38,12,10,16,36,46,-10,46,47.5,3.0,9,3,4,6,6,10


In [62]:
# Include all possible features
potential_features = [
    'Round', 'Round Name', 'Home Team', 'xG_home', 'home_goals', 'away_goals',
    'xG_away', 'Away Team', 'Attendance', 'Venue', 'Referee', 'goal_total',
    'xg_goal_total', 'home_xg_diff', 'away_xg_diff', 'home_goal_diff', 
    'away_goal_diff', 'home_pts_earned', 'away_pts_earned'
]

# Target variable
target = 'Match Outcome'

# Define feature set
X = matches_df[potential_features]
y = matches_df[target]

print("All potential features included for analysis.")


All potential features included for analysis.


In [63]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical features
categorical_features = ['Home Team', 'Away Team', 'Venue', 'Referee']
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

print("Categorical features encoded.")


Categorical features encoded.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Function to calculate VIF
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

# Calculate VIF for all features
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
vif_data = calculate_vif(X[numerical_features])

print("Variance Inflation Factor (VIF):")
print(vif_data)


Variance Inflation Factor (VIF):
            feature        VIF
0             Round        inf
1        Round Name        inf
2         Home Team   3.811240
3           xG_home        inf
4        home_goals        inf
5        away_goals        inf
6           xG_away        inf
7         Away Team   3.495369
8        Attendance   4.277196
9             Venue   3.750635
10          Referee   3.585964
11       goal_total        inf
12    xg_goal_total        inf
13     home_xg_diff        inf
14     away_xg_diff        inf
15   home_goal_diff        inf
16   away_goal_diff        inf
17  home_pts_earned  10.621452
18  away_pts_earned   8.768907


  vif = 1. / (1. - r_squared_i)


In [65]:
# Drop features with perfect multicollinearity or high VIF iteratively
while True:
    # Calculate VIF for current features
    vif_data = calculate_vif(X)
    
    # Find the feature with the highest VIF
    high_vif_feature = vif_data.sort_values(by="VIF", ascending=False).iloc[0]
    
    # Stop if all VIF values are below the threshold
    if high_vif_feature['VIF'] <= 5:
        break

    # Drop the feature with the highest VIF
    print(f"Dropping '{high_vif_feature['feature']}' due to high VIF ({high_vif_feature['VIF']:.2f})")
    X.drop(columns=[high_vif_feature['feature']], inplace=True)

# Final VIF
vif_data = calculate_vif(X)
print("Final Variance Inflation Factor (VIF):")
print(vif_data)



Dropping 'Round' due to high VIF (inf)
Dropping 'away_xg_diff' due to high VIF (inf)


  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)
  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)
  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)
  vif = 1. / (1. - r_squared_i)


Dropping 'home_xg_diff' due to high VIF (inf)
Dropping 'xG_home' due to high VIF (inf)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)
  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)


Dropping 'home_goals' due to high VIF (inf)
Dropping 'away_goals' due to high VIF (inf)
Dropping 'home_goal_diff' due to high VIF (inf)
Dropping 'xg_goal_total' due to high VIF (17.42)
Dropping 'home_pts_earned' due to high VIF (9.90)
Dropping 'away_pts_earned' due to high VIF (5.81)
Dropping 'xG_away' due to high VIF (5.14)


  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)
  vif = 1. / (1. - r_squared_i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=[high_vif_feature['feature']], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/p

Final Variance Inflation Factor (VIF):
          feature       VIF
0      Round Name  3.502813
1       Home Team  3.452732
2       Away Team  3.125733
3      Attendance  3.795054
4           Venue  3.614161
5         Referee  3.043487
6      goal_total  2.887801
7  away_goal_diff  1.165263


In [None]:
from sklearn.preprocessing import StandardScaler

# Scale numerical features
scaler = StandardScaler()
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
X[numerical_features] = scaler.fit_transform(X[numerical_features])

print("Numerical features scaled.")


Numerical features scaled.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = scaler.fit_transform(X[numerical_features])


In [67]:
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data split into training and testing sets.")
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Data split into training and testing sets.
Training set size: (304, 8)
Testing set size: (76, 8)


In [68]:
from sklearn.preprocessing import StandardScaler

# Identify numerical features
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns

# Initialize scaler
scaler = StandardScaler()

# Scale numerical features
X[numerical_features] = scaler.fit_transform(X[numerical_features])

print("Numerical features scaled.")


Numerical features scaled.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = scaler.fit_transform(X[numerical_features])


In [69]:
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (304, 8)
Testing set size: (76, 8)


In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Evaluate Logistic Regression
log_reg_preds = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_preds)

print(f"Logistic Regression Accuracy: {log_reg_accuracy}")
print("Classification Report:\n", classification_report(y_test, log_reg_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, log_reg_preds))


Logistic Regression Accuracy: 1.0
Classification Report:
                 precision    recall  f1-score   support

Away Team Wins       1.00      1.00      1.00        21
Home Team Wins       1.00      1.00      1.00        34
     Tied Game       1.00      1.00      1.00        21

      accuracy                           1.00        76
     macro avg       1.00      1.00      1.00        76
  weighted avg       1.00      1.00      1.00        76

Confusion Matrix:
 [[21  0  0]
 [ 0 34  0]
 [ 0  0 21]]


In [71]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate Random Forest
rf_preds = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)

print(f"Random Forest Accuracy: {rf_accuracy}")
print("Classification Report:\n", classification_report(y_test, rf_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_preds))


Random Forest Accuracy: 1.0
Classification Report:
                 precision    recall  f1-score   support

Away Team Wins       1.00      1.00      1.00        21
Home Team Wins       1.00      1.00      1.00        34
     Tied Game       1.00      1.00      1.00        21

      accuracy                           1.00        76
     macro avg       1.00      1.00      1.00        76
  weighted avg       1.00      1.00      1.00        76

Confusion Matrix:
 [[21  0  0]
 [ 0 34  0]
 [ 0  0 21]]


In [72]:
import lightgbm as lgb

# Train LightGBM
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)

# Evaluate LightGBM
lgb_preds = lgb_model.predict(X_test)
lgb_accuracy = accuracy_score(y_test, lgb_preds)

print(f"LightGBM Accuracy: {lgb_accuracy}")
print("Classification Report:\n", classification_report(y_test, lgb_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, lgb_preds))


ModuleNotFoundError: No module named 'lightgbm'

In [None]:
from xgboost import XGBClassifier

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate XGBoost
xgb_preds = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_preds)

print(f"XGBoost Accuracy: {xgb_accuracy}")
print("Classification Report:\n", classification_report(y_test, xgb_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, xgb_preds))


In [None]:
# Compare model accuracies
results = {
    "Logistic Regression": log_reg_accuracy,
    "Random Forest": rf_accuracy,
    "LightGBM": lgb_accuracy,
    "XGBoost": xgb_accuracy
}

print("Model Accuracy Comparison:")
for model, accuracy in results.items():
    print(f"{model}: {accuracy:.4f}")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Prepare data for visualization
model_names = list(results.keys())
accuracies = list(results.values())

# Create a DataFrame for Seaborn compatibility
import pandas as pd
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies
})

# Set Seaborn style and palette
sns.set_theme(style="whitegrid")
palette = sns.color_palette("viridis", len(model_names))

# Create the bar plot
plt.figure(figsize=(10, 6))
barplot = sns.barplot(
    x='Model', 
    y='Accuracy', 
    data=results_df, 
    palette=palette
)

# Annotate bars with accuracy values
for i, bar in enumerate(barplot.patches):
    barplot.annotate(
        f"{accuracies[i]:.2f}", 
        (bar.get_x() + bar.get_width() / 2, bar.get_height()), 
        ha='center', 
        va='bottom', 
        fontsize=10, 
        color='black'
    )

# Enhance plot aesthetics
plt.title("Model Accuracy Comparison", fontsize=16, weight='bold')
plt.ylabel("Accuracy", fontsize=12)
plt.xlabel("Models", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
sns.despine()  # Remove top and right spines for cleaner look
plt.tight_layout()  # Ensure layout fits nicely

# Show plot
plt.show()
