In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- 1. Load Data ---
quaterfinals = pd.read_csv("quaterfinals.csv")
wteam_avg = pd.read_csv("wteam_avg.csv")

# --- 2. Prepare Quarterfinal Data ---
# Create Win Label (Home Win = 1, Away Win = 0)
quaterfinals['HomeWin'] = (quaterfinals['HomeGoals'] > quaterfinals['AwayGoals']).astype(int)

# Select Features
features = [
    'HomeShots', 'AwayShots',
    'HomeShotsOnTarget', 'AwayShotsOnTarget',
    'HomexG', 'AwayxG',
    'HomeBigChances', 'AwayBigChances',
    'HomePossession(%)', 'AwayPossession(%)'
]

X = quaterfinals[features]
y = quaterfinals['HomeWin']

# --- 3. Data Preprocessing ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

# --- 4. Train Models ---
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# --- 5. Evaluate Models ---
y_pred_rf = rf_model.predict(X_test)
y_pred_log = log_model.predict(X_test)

print("=== Random Forest Performance ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

print("\n=== Logistic Regression Performance ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.2f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))
print("Classification Report:\n", classification_report(y_test, y_pred_log))

# --- 6. Create Semifinal Match Data ---
def create_match_data(team_home, team_away, wteam_avg):
    home = wteam_avg[wteam_avg['Team'] == team_home].iloc[0]
    away = wteam_avg[wteam_avg['Team'] == team_away].iloc[0]
    return pd.DataFrame([[
        home['Shots'], away['Shots'],
        home['ShotsOnTarget'], away['ShotsOnTarget'],
        home['xG'], away['xG'],
        home['BigChances'], away['BigChances'],
        home['Possession(%)'], away['Possession(%)']
    ]], columns=features)

match1 = create_match_data('England', 'Italy', wteam_avg)
match2 = create_match_data('Germany', 'Spain', wteam_avg)

# Scale the Match Data
match1_scaled = scaler.transform(match1)
match2_scaled = scaler.transform(match2)

# --- 7. Predict Win Probabilities ---
eng_ita_rf = rf_model.predict_proba(match1_scaled)[0][1]
ger_spa_rf = rf_model.predict_proba(match2_scaled)[0][1]

eng_ita_log = log_model.predict_proba(match1_scaled)[0][1]
ger_spa_log = log_model.predict_proba(match2_scaled)[0][1]

print("\n--- Semifinal Win Probabilities ---")
print(f"[England vs Italy] RF (England Win): {eng_ita_rf:.2f}, Logistic: {eng_ita_log:.2f}")
print(f"[Germany vs Spain] RF (Germany Win): {ger_spa_rf:.2f}, Logistic: {ger_spa_log:.2f}")


=== Random Forest Performance ===
Accuracy: 0.57
Confusion Matrix:
 [[3 0]
 [3 1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         3
           1       1.00      0.25      0.40         4

    accuracy                           0.57         7
   macro avg       0.75      0.62      0.53         7
weighted avg       0.79      0.57      0.51         7


=== Logistic Regression Performance ===
Accuracy: 0.57
Confusion Matrix:
 [[3 0]
 [3 1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         3
           1       1.00      0.25      0.40         4

    accuracy                           0.57         7
   macro avg       0.75      0.62      0.53         7
weighted avg       0.79      0.57      0.51         7


--- Semifinal Win Probabilities ---
[England vs Italy] RF (England Win): 0.58, Logistic: 0.54
[Germany vs Spain] RF (Germany

In [19]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# CSV 파일 읽기
quater_df = pd.read_csv('quaterfinals.csv')
group_avg_df = pd.read_csv('wteam_avg.csv')

# 4강 진출 팀 리스트(예시: quaterfinals.csv에서 승리한 팀 기준)
# (여기서는 실제 4강 진출팀을 예시로 넣었으니 필요에 맞게 수정 가능)
semi_final_teams = ['England', 'Germany', 'Italy', 'Spain']

# 4강 팀 데이터 추출 (조별리그 평균 데이터에서)
semi_teams_avg = group_avg_df[group_avg_df['Team'].isin(semi_final_teams)].reset_index(drop=True)

# quaterfinals.csv 데이터를 각 팀별로 정리 (home/away 각각)
home = quater_df[['HomeTeam', 'HomeGoals', 'HomeShots', 'HomeShotsOnTarget', 'HomexG', 'HomeBigChances',
                  'HomePossession(%)', 'HomePasses(%)', 'HomeFouls', 'HomeTackles(%)', 'HomeClearances',
                  'HomeInterceptions', 'HomeErrorsLeadingtoShot', 'HomeErrorsLeadingtoGoal', 'HomeConer',
                  'HomeYellow', 'HomeRed', 'HomeSaves', 'HxGOTfaced', 'HGoalsprevented']]
home.columns = ['Team', 'Goals', 'Shots', 'ShotsOnTarget', 'xG', 'BigChances', 'Possession(%)',
                'Passes(%)', 'Fouls', 'Tackles(%)', 'Clearances', 'Interceptions', 'ErrorsLeadingtoShot',
                'ErrorsLeadingtoGoal', 'Coner', 'Yellow', 'Red', 'Saves', 'xGOTFaced', 'GoalsPrevented']

away = quater_df[['AwayTeam', 'AwayGoals', 'AwayShots', 'AwayShotsOnTarget', 'AwayxG', 'AwayBigChances',
                  'AwayPossession(%)', 'AwayPasses(%)', 'AwayFouls', 'AwayTackles(%)', 'AwayClearances',
                  'AwayInterceptions', 'AwayErrorsLeadingtoShot', 'AwayErrorsLeadingtoGoal', 'AwayConer',
                  'AwayYellow', 'AwayRed', 'AwaySaves', 'AxGOTfaced', 'AGoalsprevented']]
away.columns = ['Team', 'Goals', 'Shots', 'ShotsOnTarget', 'xG', 'BigChances', 'Possession(%)',
                'Passes(%)', 'Fouls', 'Tackles(%)', 'Clearances', 'Interceptions', 'ErrorsLeadingtoShot',
                'ErrorsLeadingtoGoal', 'Coner', 'Yellow', 'Red', 'Saves', 'xGOTFaced', 'GoalsPrevented']

quater_team_df = pd.concat([home, away], ignore_index=True)

# quaterfinals에서 승/패 결과 생성 (무승부는 제외, 필요 시 수정)
def get_result(row):
    if row['Goals'] > 1.5:  # 임시 기준, 실제로는 각 경기 결과별 따로 계산 필요
        return 1
    else:
        return 0

# 여기선 quaterfinals 결과 데이터로 'Result' 컬럼 추가 필요 (임시로 승리팀=1, 패배팀=0)
# 실제 경기별 승패 데이터를 만들어야 정확함
# quaterfinals.csv 원본에서 승리팀 기준으로 만드는 게 이상적

# 간단히 임시로 quaterfinals 승리팀 결과 데이터 만들기 (예시)
quater_df['Result'] = quater_df.apply(
    lambda x: 1 if x['HomeGoals'] > x['AwayGoals'] else (0 if x['HomeGoals'] < x['AwayGoals'] else None), axis=1)

home.loc[:, 'Result'] = quater_df['Result'].values
away.loc[:, 'Result'] = quater_df['Result'].apply(lambda x: 1 - x if x is not None else None).values
quater_team_df = pd.concat([home, away], ignore_index=True)

# 그룹 평균 데이터에는 결과가 없으므로 'Result' 컬럼 추가 (NaN)
group_avg_df['Result'] = None

# full data 합치기
full_data = pd.concat([group_avg_df, quater_team_df], ignore_index=True)

# 'Result'가 NaN인 행은 학습에서 제외 (즉, 조별리그 평균 데이터 제외)
train_data = full_data.dropna(subset=['Result'])

X = train_data.drop(columns=['Team', 'Result'])
y = train_data['Result'].astype(int)  # int형으로 변환

# 모델 + 스케일러 파이프라인 생성 및 학습
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X, y)

# 4강 팀의 조별리그 평균 데이터로 승률 예측
X_test = semi_teams_avg.drop(columns=['Team'])
pred_probs = model.predict_proba(X_test)[:, 1]

for team, prob in zip(semi_teams_avg['Team'], pred_probs):
    print(f"{team} 승리 확률: {prob:.2f}")

England 승리 확률: 0.98
Germany 승리 확률: 0.60
Italy 승리 확률: 0.25
Spain 승리 확률: 1.00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home.loc[:, 'Result'] = quater_df['Result'].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  away.loc[:, 'Result'] = quater_df['Result'].apply(lambda x: 1 - x if x is not None else None).values
  full_data = pd.concat([group_avg_df, quater_team_df], ignore_index=True)
