In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.impute import SimpleImputer
import itertools
from collections import Counter
import warnings
warnings.filterwarnings('ignore')



In [13]:


class FIFAWorldCupPredictor8Groups:
    def __init__(self):
        # Initialize models and parameters
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        self.feature_importance = {
            'rank': 0.5,
            'market_value_million': 0.4,
            'goals_per_match': 0.05,
            'goals_conceded_per_match': -0.05,
            'win_percentage': 0.05,
            'continental_strength': 0.05
        }
        
        self.continent_strength = {
            'UEFA': 1.0,
            'CONMEBOL': 0.95,
            'AFC': 0.75,
            'CAF': 0.75,
            'CONCACAF': 0.7,
            'OFC': 0.6,
            'Host': 0.7
        }

    def _clean_data(self, team_data):
        """Handle missing values and infinite values"""
        team_data = team_data.copy()
        
        # Ensure numeric columns are properly typed
        numeric_cols = ['games_played', 'goals_scored', 'goals_conceded', 'market_value_million', 
                       'wins', 'draws', 'losses', 'rank']
        
        for col in numeric_cols:
            if col in team_data.columns:
                team_data[col] = pd.to_numeric(team_data[col], errors='coerce')
        
        # Calculate derived metrics with proper numeric handling
        team_data['win_percentage'] = np.where(
            team_data['games_played'] > 0,
            team_data['wins'] / team_data['games_played'],
            0.5
        )
        team_data['continental_strength'] = team_data['continent'].map(
            lambda x: self.continent_strength.get(x, 0.7)
        )
        team_data['goals_per_match'] = np.where(
            team_data['games_played'] > 0,
            team_data['goals_scored'] / team_data['games_played'],
            1.5
        )
        team_data['goals_conceded_per_match'] = np.where(
            team_data['games_played'] > 0,
            team_data['goals_conceded'] / team_data['games_played'],
            1.0
        )
        
        team_data['rank'] = 1/team_data['rank']
        
        # Handle infinite and NaN values for numeric columns only
        numeric_feature_cols = ['rank', 'market_value_million', 'goals_per_match', 
                               'goals_conceded_per_match', 'win_percentage', 'continental_strength']
        
        for col in numeric_feature_cols:
            if col in team_data.columns:
                # Replace infinite values with NaN
                team_data[col] = team_data[col].replace([np.inf, -np.inf], np.nan)
                # Fill any remaining NaN with median
                team_data[col] = team_data[col].fillna(team_data[col].median())
        
        team_data['team'] = team_data['team'].astype(str)
        return team_data
        
    def preprocess_data(self, team_data):
        """Prepare and engineer features with robust data handling"""
        # Clean the data first
        team_data = self._clean_data(team_data)
        
        # Select features
        self.feature_cols = list(self.feature_importance.keys())
        self.team_features = team_data[['team'] + self.feature_cols].copy()
        
        # Impute missing values (if any remain after cleaning)
        self.team_features[self.feature_cols] = self.imputer.fit_transform(
            self.team_features[self.feature_cols]
        )
        
        # Scale features
        self.team_features[self.feature_cols] = self.scaler.fit_transform(
            self.team_features[self.feature_cols]
        )
        
        # Verify no NaN/infinite values remain in numeric columns only
        numeric_data = self.team_features[self.feature_cols].select_dtypes(include=[np.number])
        if len(numeric_data.columns) > 0:
            if np.isinf(numeric_data.values).any() or np.isnan(numeric_data.values).any():
                print("Warning: Some invalid values found and will be handled by imputer")
                # Additional cleanup if needed
                for col in self.feature_cols:
                    if col in self.team_features.columns:
                        self.team_features[col] = pd.to_numeric(self.team_features[col], errors='coerce')
                        self.team_features[col] = self.team_features[col].fillna(self.team_features[col].median())
        
        # Cluster teams to understand style similarities
        self._cluster_teams()
        
        # Generate synthetic training data
        self._generate_synthetic_matches()
        
    def _cluster_teams(self):
        """Cluster teams using unsupervised learning with error handling"""
        try:
            # KMeans for style clustering
            self.kmeans = KMeans(n_clusters=6, random_state=42)
            self.team_features['style_cluster'] = self.kmeans.fit_predict(
                self.team_features[self.feature_cols]
            )
            
            # Gaussian Mixture for strength estimation
            self.gmm = GaussianMixture(n_components=7, random_state=42)
            self.team_features['strength_component'] = self.gmm.fit_predict(
                self.team_features[self.feature_cols]
            )
            self.team_features['strength_score'] = -self.gmm.score_samples(
                self.team_features[self.feature_cols]
            )
        except Exception as e:
            raise ValueError(f"Clustering failed: {str(e)}")

    def _generate_synthetic_matches(self):
        """Create synthetic training data based on team features"""
        # Generate all possible team pairs
        all_teams = self.team_features['team'].values
        team_pairs = list(itertools.combinations(all_teams, 2))
        
        # Create synthetic outcomes based on feature comparisons
        synthetic_X = []
        synthetic_y = []
        
        for team1, team2 in team_pairs:
            # Get team features
            t1 = self.team_features[self.team_features['team'] == team1].iloc[0]
            t2 = self.team_features[self.team_features['team'] == team2].iloc[0]
            
            # Create match features with proper numeric conversion
            features = {
                'rank_diff': float(t1['rank']) - float(t2['rank']),
                'market_value_diff': float(t1['market_value_million']) - float(t2['market_value_million']),
                'goals_scored_diff': float(t1['goals_per_match']) - float(t2['goals_per_match']),
                'goals_conceded_diff': float(t1['goals_conceded_per_match']) - float(t2['goals_conceded_per_match']),
                'strength_diff': float(t1['strength_score']) - float(t2['strength_score']),
                'same_continent': 1 if float(t1['continental_strength']) == float(t2['continental_strength']) else 0,
                'cluster_distance': float(np.linalg.norm(t1[self.feature_cols].astype(float) - t2[self.feature_cols].astype(float)))
            }
            
            # Determine synthetic outcome based on feature differences
            strength_diff = float(t1['strength_score']) - float(t2['strength_score'])
            if strength_diff > 1.0:  # Strong team1 advantage
                outcome = 2  # team1 win
            elif strength_diff < -1.0:  # Strong team2 advantage
                outcome = 0  # team2 win
            else:  # Close match
                outcome = 1  # draw
                
            # Add some noise
            if np.random.random() < 0.2:  # 20% chance to flip outcome
                outcome = np.random.choice([0, 1, 2])
            
            synthetic_X.append(features)
            synthetic_y.append(outcome)
        
        # Train ML model on synthetic data
        self.model = RandomForestClassifier(n_estimators=150, random_state=42)
        self.model.fit(pd.DataFrame(synthetic_X), synthetic_y)
        
        # Neural network with more capacity
        self.nn_model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', 
                                    random_state=42, max_iter=500)
        self.nn_model.fit(pd.DataFrame(synthetic_X), synthetic_y)
        
    def predict_match(self, team1, team2):
        """Predict match outcome using ML models"""
        # Get team features
        t1 = self.team_features[self.team_features['team'] == team1].iloc[0]
        t2 = self.team_features[self.team_features['team'] == team2].iloc[0]
        
        # Create feature vector with proper numeric conversion
        features = {
            'rank_diff': float(t1['rank']) - float(t2['rank']),
            'market_value_diff': float(t1['market_value_million']) - float(t2['market_value_million']),
            'goals_scored_diff': float(t1['goals_per_match']) - float(t2['goals_per_match']),
            'goals_conceded_diff': float(t1['goals_conceded_per_match']) - float(t2['goals_conceded_per_match']),
            'strength_diff': float(t1['strength_score']) - float(t2['strength_score']),
            'same_continent': 1 if float(t1['continental_strength']) == float(t2['continental_strength']) else 0,
            'cluster_distance': float(np.linalg.norm(t1[self.feature_cols].astype(float) - t2[self.feature_cols].astype(float)))
        }
        
        # Predict with both models
        rf_proba = self.model.predict_proba(pd.DataFrame([features]))[0]
        nn_proba = self.nn_model.predict_proba(pd.DataFrame([features]))[0]
        
        # Ensemble probabilities
        proba = (rf_proba + nn_proba) / 2
        
        return {
            'team1': team1,
            'team2': team2,
            'team1_win_prob': proba[2],
            'draw_prob': proba[1],
            'team2_win_prob': proba[0],
            'strength_diff': features['strength_diff']
        }
    
    def simulate_group_stage(self, groups):
        """Simulate all group stage matches with detailed results"""
        group_results = {}
        all_standings = {}

        for group_name, teams in groups.items():
            print(f"\n--- Group {group_name} ---")
            standings = {team: {'points': 0, 'goals_for': 0, 'goals_against': 0} for team in teams}

            # Each team plays every other team once
            for team1, team2 in itertools.combinations(teams, 2):
                prediction = self.predict_match(team1, team2)

                # Determine outcome with randomness
                rand = np.random.random()
                if prediction['team1_win_prob'] > prediction['team2_win_prob']:  # Team1 win
                    standings[team1]['points'] += 3
                    lam1 = 1.5 + max(0, prediction['strength_diff'] * 0.3)
                    lam2 = 1.0 - max(0, prediction['strength_diff'] * 0.3)
                elif prediction['team1_win_prob'] == prediction['draw_prob']:  # Draw
                    standings[team1]['points'] += 1
                    standings[team2]['points'] += 1
                    lam1 = lam2 = 1.0
                else:  # Team2 win
                    standings[team2]['points'] += 3
                    lam1 = 1.0 - max(0, -prediction['strength_diff'] * 0.3)
                    lam2 = 1.5 + max(0, -prediction['strength_diff'] * 0.3)

                # Ensure lambda values are valid
                lam1 = max(0, lam1) if not np.isnan(lam1) else 0
                lam2 = max(0, lam2) if not np.isnan(lam2) else 0

                goals_team1 = np.random.poisson(lam1)
                goals_team2 = np.random.poisson(lam2)

                # Update standings
                standings[team1]['goals_for'] += goals_team1
                standings[team1]['goals_against'] += goals_team2
                standings[team2]['goals_for'] += goals_team2
                standings[team2]['goals_against'] += goals_team1

                print(f"{team1} {goals_team1}-{goals_team2} {team2}")

            # Sort standings
            sorted_standings = sorted(
                standings.items(),
                key=lambda x: (-x[1]['points'],
                               -(x[1]['goals_for'] - x[1]['goals_against']),
                               -x[1]['goals_for'])
            )

            group_results[group_name] = [team[0] for team in sorted_standings]
            all_standings[group_name] = standings

            print("\nFinal Standings:")
            for i, (team, stats) in enumerate(sorted_standings, 1):
                gd = stats['goals_for'] - stats['goals_against']
                print(f"{i}. {team}: {stats['points']} pts (GD: {gd})")

        return group_results, all_standings


    def simulate_knockout_phase(self, qualified_teams):
        """Simulate knockout rounds (Round of 16, QF, SF, Final)"""
        print("\n=== KNOCKOUT STAGE ===")
        current_round = qualified_teams
        round_names = ["Round of 16", "Quarterfinals", "Semifinals", "Final"]
        round_num = 0

        while len(current_round) > 1:
            next_round = []
            print(f"\n--- {round_names[round_num]} ---")

            for i in range(0, len(current_round), 2):
                team1 = current_round[i]
                team2 = current_round[i+1] if (i+1) < len(current_round) else None

                if team2 is None:
                    next_round.append(team1)
                    continue

                prediction = self.predict_match(team1, team2)
                strength_diff = prediction['strength_diff']

                def safe_poisson(lam):
                    return np.random.poisson(max(0, lam)) if not np.isnan(lam) else 0

                rand = np.random.random()
                if rand < prediction['team1_win_prob']:
                    winner = team1
                    score = f"{safe_poisson(1.5)}-{safe_poisson(1.0)}"

                elif rand < (prediction['team1_win_prob'] + prediction['draw_prob']):
                    # Extra time or penalties
                    if strength_diff > 0.5:
                        winner = team1
                        score = f"{safe_poisson(1.2)}-{safe_poisson(1.0)} (AET)"
                    elif strength_diff < -0.5:
                        winner = team2
                        score = f"{safe_poisson(1.0)}-{safe_poisson(1.2)} (AET)"
                    else:
                        # Penalties
                        winner = team1 if np.random.random() < 0.5 + strength_diff / 4 else team2
                        score = f"{safe_poisson(1.0)}-{safe_poisson(1.0)} ({winner} wins on penalties)"
                else:
                    winner = team2
                    score = f"{safe_poisson(1.0)}-{safe_poisson(1.5)}"

                print(f"{team1} vs {team2}: {score} -> {winner} advances")
                next_round.append(winner)

            current_round = next_round
            round_num += 1

        champion = current_round[0]
        print(f"\n🏆 TOURNAMENT CHAMPION: {champion} 🏆")
        return champion
    
    def simulate_tournament(self, groups, n_simulations=1):
        """Simulate the entire tournament"""
        if n_simulations == 1:
            # Detailed single simulation
            print("=== FIFA CLUB WORLD CUP SIMULATION ===")
            
            # Group stage
            group_winners_runners_up, _ = self.simulate_group_stage(groups)
            
            # Determine qualified teams (winners and runners-up from each group)
            qualified_teams = []
            for group in group_winners_runners_up.values():
                qualified_teams.extend(group[:2])  # Top 2 advance from each group
            
            # Knockout phase
            champion = self.simulate_knockout_phase(qualified_teams)
            return champion
        else:
            # Fast multiple simulations for probability analysis
            print(f"Running {n_simulations} simulations...")
            champions = []
            
            for sim in range(n_simulations):
                if (sim+1) % 100 == 0:
                    print(f"Completed {sim+1}/{n_simulations} simulations...")
                
                # Simulate group stage (fast version)
                qualified_teams = []
                for group_name, teams in groups.items():
                    points = {team: 0 for team in teams}
                    
                    for team1, team2 in itertools.combinations(teams, 2):
                        prediction = self.predict_match(team1, team2)
                        rand = np.random.random()
                        if rand < prediction['team1_win_prob']:
                            points[team1] += 3
                        elif rand < (prediction['team1_win_prob'] + prediction['draw_prob']):
                            points[team1] += 1
                            points[team2] += 1
                        else:
                            points[team2] += 3
                    
                    # Get top 2 teams
                    top2 = sorted(points.items(), key=lambda x: -x[1])[:2]
                    qualified_teams.extend([team[0] for team in top2])
                
                # Simulate knockout phase (fast version)
                while len(qualified_teams) > 1:
                    next_round = []
                    for i in range(0, len(qualified_teams), 2):
                        if i+1 >= len(qualified_teams):
                            next_round.append(qualified_teams[i])
                            continue
                        
                        team1, team2 = qualified_teams[i], qualified_teams[i+1]
                        prediction = self.predict_match(team1, team2)
                        rand = np.random.random()
                        if rand < prediction['team1_win_prob']:
                            next_round.append(team1)
                        elif rand < (prediction['team1_win_prob'] + prediction['draw_prob']):
                            next_round.append(team1 if prediction['strength_diff'] > 0 else team2)
                        else:
                            next_round.append(team2)
                    qualified_teams = next_round
                
                champions.append(qualified_teams[0])
            
            # Analyze results
            champ_counts = Counter(champions)
            total = sum(champ_counts.values())
            
            print("\n=== SIMULATION RESULTS ===")
            print("\nChampionship Probabilities:")
            for team, count in champ_counts.most_common():
                print(f"{team}: {count/total:.1%}")
            
            return champ_counts



In [14]:
# Example Usage
if __name__ == "__main__":
    # Create sample data from your CSV string
    csv_data = """team,games_played,goals_scored,goals_conceded,league,continent,market_value_million,wins,draws,losses,country,rank,goals_per_match,goals_conceded_per_match
inter miami cf,38,65,39,MLS,Host,80,20,11,7,USA,535,1.7105263157894737,1.0263157894736843
al-hilal,45,85,40,Saudi Pro League,AFC,160,28,10,7,Saudi Arabia,67,1.8888888888888888,0.8888888888888888
urawa red diamonds,40,70,38,J1 League,AFC,65,25,8,7,Japan,394,1.75,0.95
al ain,38,60,32,UAE Pro League,AFC,50,23,9,6,UAE,423,1.5789473684210527,0.8421052631578947
ulsan hyundai,42,68,34,K League 1,AFC,70,26,10,6,South Korea,290,1.619047619047619,0.8095238095238095
al ahly,44,75,36,Egyptian Premier League,CAF,55,27,11,6,Egypt,59,1.7045454545454546,0.8181818181818182
wydad ac,40,66,31,Botola Pro,CAF,38,24,9,7,Morocco,245,1.65,0.775
espérance de tunis,42,62,28,Tunisian Ligue 1,CAF,42,23,11,8,Tunisia,100,1.4761904761904763,0.6666666666666666
mamelodi sundowns,41,65,35,South African Premier League,CAF,40,22,12,7,South Africa,95,1.5853658536585367,0.8536585365853658
monterrey,43,72,34,Liga MX,CONCACAF,120,26,10,7,Mexico,127,1.6744186046511629,0.7906976744186046
seattle sounders,40,68,36,MLS,CONCACAF,110,25,8,7,USA,476,1.7,0.9
pachuca,42,70,33,Liga MX,CONCACAF,115,27,9,6,Mexico,146,1.6666666666666667,0.7857142857142857
los angeles fc,39,75,38,MLS,CONCACAF,105,28,7,4,USA,367,1.9230769230769231,0.9743589743589743
palmeiras,50,95,45,Brasileirão,CONMEBOL,350,32,12,6,Brazil,24,1.9,0.9
flamengo,48,90,40,Brasileirão,CONMEBOL,380,30,14,4,Brazil,27,1.875,0.8333333333333334
fluminense,45,85,42,Brasileirão,CONMEBOL,180,28,13,4,Brazil,70,1.8888888888888888,0.9333333333333333
botafogo,44,78,40,Brasileirão,CONMEBOL,160,26,11,7,Brazil,57,1.7727272727272727,0.9090909090909091
river plate,46,88,44,Argentine Primera División,CONMEBOL,250,29,14,3,Argentina,51,1.9130434782608696,0.9565217391304348
boca juniors,47,85,43,Argentine Primera División,CONMEBOL,230,28,15,4,Argentina,93,1.8085106382978724,0.9148936170212766
auckland city,36,64,30,NZ National League,OFC,12,21,10,5,New Zealand,175,1.7777777777777777,0.8333333333333334
chelsea,42,80,37,Premier League,UEFA,850,24,10,8,England,12,1.9047619047619047,0.8809523809523809
real madrid,45,98,33,La Liga,UEFA,1100,31,9,5,Spain,7,2.1777777777777776,0.7333333333333333
manchester city,44,102,28,Premier League,UEFA,1200,33,7,4,England,8,2.3181818181818183,0.6363636363636364
bayern munich,43,95,30,Bundesliga,UEFA,900,30,8,5,Germany,2,2.2093023255813953,0.6976744186046512
paris saint-germain,42,90,35,Ligue 1,UEFA,950,29,7,6,France,1,2.142857142857143,0.8333333333333334
borussia dortmund,41,82,35,Bundesliga,UEFA,680,25,11,5,Germany,9,2.0,0.8536585365853658
inter milan,44,80,40,Serie A,UEFA,700,26,11,7,Italy,6,1.8181818181818181,0.9090909090909091
porto,42,75,33,Primeira Liga,UEFA,350,27,9,6,Portugal,47,1.7857142857142858,0.7857142857142857
atletico madrid,42,77,36,La Liga,UEFA,800,24,12,6,Spain,11,1.8333333333333333,0.8571428571428571
benfica,43,78,30,Primeira Liga,UEFA,340,26,10,7,Portugal,17,1.813953488372093,0.6976744186046512
juventus,40,70,38,Serie A,UEFA,750,22,11,7,Italy,22,1.75,0.95
red bull salzburg,43,85,40,Austrian Bundesliga,UEFA,200,28,10,5,Austria,145,1.9767441860465116,0.9302325581395349"""
    
    from io import StringIO
    team_df = pd.read_csv(StringIO(csv_data))

    # Normalise the team names once, so the membership check below and the
    # predictor's exact-match lookups see the same spelling
    team_df['team'] = team_df['team'].astype(str).str.strip().str.lower()

    # Seed NumPy's global RNG so the simulated results are reproducible on re-run
    np.random.seed(42)
    
    # Create predictor
    predictor = FIFAWorldCupPredictor8Groups()
    
    try:
        # Define 8 groups with 4 teams each
        groups = {
            'A': ['palmeiras', 'inter miami cf', 'porto', 'al ahly'],
            'B': ['botafogo', 'paris saint-germain', 'atletico madrid', 'seattle sounders'],
            'C': ['bayern munich', 'benfica', 'boca juniors', 'auckland city'],
            'D': ['flamengo', 'chelsea', 'espérance de tunis', 'los angeles fc'],
            'E': ['river plate', 'monterrey', 'inter milan', 'urawa red diamonds'],
            'F': ['mamelodi sundowns', 'borussia dortmund', 'fluminense', 'ulsan hyundai'],
            'G': ['juventus', 'manchester city', 'wydad ac', 'al ain'],
            'H': ['red bull salzburg', 'real madrid', 'al-hilal', 'pachuca']
        }
        
        # Verify all team names exist in the data before the (slow) model training
        all_teams_in_groups = {team for group in groups.values() for team in group}
        all_teams_in_data = set(team_df['team'])
        
        missing_teams = all_teams_in_groups - all_teams_in_data
        if missing_teams:
            print(f"\nWarning: The following teams in groups are not found in the data:")
            for team in sorted(missing_teams):
                print(f"- {team}")
            print("\nPlease check for typos or add data for these teams.")
        else:
            predictor.preprocess_data(team_df)
            print("Data preprocessing completed successfully!")

            # Run simulation
            print("\n=== SINGLE TOURNAMENT SIMULATION ===")
            champion = predictor.simulate_tournament(groups, n_simulations=1)
            
            # Probability analysis over repeated fast simulations
            print("\n=== PROBABILITY ANALYSIS (100 SIMULATIONS) ===")
            results = predictor.simulate_tournament(groups, n_simulations=100)
            
    except ValueError as e:
        print(f"\nError: {str(e)}")
        print("Please check your input data for issues.")
    except Exception as e:
        print(f"\nUnexpected error: {str(e)}")

Data preprocessing completed successfully!

=== SINGLE TOURNAMENT SIMULATION ===
=== FIFA CLUB WORLD CUP SIMULATION ===

--- Group A ---
palmeiras 10-0 inter miami cf
palmeiras 0-4 porto
palmeiras 0-4 al ahly
inter miami cf 0-10 porto
inter miami cf 0-5 al ahly
porto 0-0 al ahly

Final Standings:
1. porto: 9 pts (GD: 14)
2. al ahly: 6 pts (GD: 9)
3. palmeiras: 3 pts (GD: 2)
4. inter miami cf: 0 pts (GD: -25)

--- Group B ---
botafogo 16-0 paris saint-germain
botafogo 0-10 atletico madrid
botafogo 5-0 seattle sounders
paris saint-germain 0-16 atletico madrid
paris saint-germain 0-6 seattle sounders
atletico madrid 6-0 seattle sounders

Final Standings:
1. atletico madrid: 9 pts (GD: 32)
2. botafogo: 6 pts (GD: 11)
3. seattle sounders: 3 pts (GD: -5)
4. paris saint-germain: 0 pts (GD: -38)

--- Group C ---
bayern munich 0-9 benfica
bayern munich 0-5 boca juniors
bayern munich 0-4 auckland city
benfica 8-0 boca juniors
benfica 5-0 auckland city
boca juniors 0-2 auckland city

Final Standi