<a href="https://colab.research.google.com/github/kelvin1242/Accident-severity/blob/main/epl_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import requests
import zipfile
import os

# Premier League results CSV hosted on football-data.co.uk.
# Update this URL (and the output file name below) to fetch a different season.
url = "https://www.football-data.co.uk/mmz4281/2223/E0.csv"

# Download the CSV file. The timeout stops the cell hanging forever on a dead
# connection, and raise_for_status() aborts on HTTP errors so that an error
# page is never written to disk and later parsed as match data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Save the raw bytes exactly as received.
with open("premier_league_2223.csv", "wb") as file:
    file.write(response.content)

# If the data is in a zip file, you can use zipfile to extract it:
# with zipfile.ZipFile("file.zip", 'r') as zip_ref:
#     zip_ref.extractall("data_folder")


In [52]:
import pandas as pd

# Load the downloaded season results into a DataFrame.
df = pd.read_csv("premier_league_2223.csv")

# Raw column -> readable name for the columns that get renamed.
# Renaming by name (instead of assigning a positional list to df.columns)
# means a reordered or edited selection can never silently mislabel a column.
readable_names = {
    'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals', 'FTR': 'Result',
    'HS': 'HomeShots', 'AS': 'AwayShots',
    'HST': 'HomeShotsOnTarget', 'AST': 'AwayShotsOnTarget',
    'HF': 'HomeFouls', 'AF': 'AwayFouls',
    'HC': 'HomeCorners', 'AC': 'AwayCorners',
    'HY': 'HomeYellowCards', 'AY': 'AwayYellowCards',
    'HR': 'HomeRedCards', 'AR': 'AwayRedCards',
}

# Keep only the team names plus the columns above, in that order, then rename.
df = df[['HomeTeam', 'AwayTeam', *readable_names]].rename(columns=readable_names)

# Preview the cleaned frame. This must be the last expression of the cell,
# otherwise the notebook discards the value and shows nothing.
df.head()



In [53]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Preprocessing - encode the categorical columns as integers.
# Team names become integer category codes; the result letter becomes
# 1 for home win, 0 for draw, -1 for away win.
df['HomeTeam'] = df['HomeTeam'].astype('category').cat.codes
df['AwayTeam'] = df['AwayTeam'].astype('category').cat.codes
df['Result'] = df['Result'].map({'H': 1, 'D': 0, 'A': -1})

# Any result other than H/D/A maps to NaN. Such rows have no usable label, so
# drop them instead of imputing the target: filling it with the column mean
# would invent a fractional pseudo-class for the classifier to learn.
df = df.dropna(subset=['Result']).astype({'Result': 'int64'})

# Only the numeric match statistics are imputed (with their column mean).
stat_columns = ['HomeShots', 'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget',
                'HomeFouls', 'AwayFouls', 'HomeCorners', 'AwayCorners',
                'HomeYellowCards', 'AwayYellowCards', 'HomeRedCards', 'AwayRedCards']
df[stat_columns] = df[stat_columns].fillna(df[stat_columns].mean())

# Features (X) and target variable (y)
X = df[['HomeTeam', 'AwayTeam'] + stat_columns]
y = df['Result']

# Train-test split: 20% held out, fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predicting results on the test set
y_pred = model.predict(X_test)

# Evaluate the model. The label order is passed explicitly so the matrix
# rows/columns are always away win (-1), draw (0), home win (1), even if one
# class happens to be absent from the test split.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))


Accuracy: 0.5131578947368421
Confusion Matrix:
 [[13  4  5]
 [ 6  0 13]
 [ 8  1 26]]


In [59]:
# Re-read the raw CSV from disk. This rebinds `df`, replacing whatever frame
# the name referred to before this cell ran.
df = pd.read_csv("premier_league_2223.csv")

# Print the column index of the raw file to see which fields are available
print(df.columns)


Index(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG',
       ...
       'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH',
       'MaxCAHA', 'AvgCAHH', 'AvgCAHA'],
      dtype='object', length=106)


In [60]:
# Read the raw season file from disk again
df = pd.read_csv("premier_league_2223.csv")

# Show every column header the raw file provides
print(df.columns)

# Narrow the frame to the two team names plus the 15 score/result/statistic
# columns; everything else in the raw file is dropped.
df = df.loc[:, [
    'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'FTR',
    'HS', 'AS', 'HST', 'AST',
    'HF', 'AF', 'HC', 'AC',
    'HY', 'AY', 'HR', 'AR',
]]


Index(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG',
       ...
       'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH',
       'MaxCAHA', 'AvgCAHH', 'AvgCAHA'],
      dtype='object', length=106)


In [61]:
print(df.shape)  # (rows, columns) of the current frame - a quick sanity check after the column selection


(380, 17)


In [62]:
# Preview the first five rows. Left as the cell's last expression (rather than
# wrapped in print) so the notebook renders the frame as a formatted table
# instead of wrapped plain text.
df.head()


         HomeTeam       AwayTeam  FTHG  FTAG FTR  HS  AS  HST  AST  HF  AF  \
0  Crystal Palace        Arsenal     0     2   A  10  10    2    2  16  11   
1          Fulham      Liverpool     2     2   D   9  11    3    4   7   9   
2     Bournemouth    Aston Villa     2     0   H   7  15    3    2  18  16   
3           Leeds         Wolves     2     1   H  12  15    4    6  13   9   
4       Newcastle  Nott'm Forest     2     0   H  23   5   10    0   9  14   

   HC  AC  HY  AY  HR  AR  
0   3   5   1   2   0   0  
1   4   4   2   0   0   0  
2   5   5   3   3   0   0  
3   6   4   2   0   0   0  
4  11   1   0   3   0   0  


In [63]:
import streamlit as st
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Load and preprocess the dataset
def train_model(csv_path="premier_league_2223.csv"):
    """Fit a random forest that predicts the match result from match statistics.

    Parameters
    ----------
    csv_path : str, optional
        CSV file to train on. It must contain the columns HomeTeam, AwayTeam,
        FTHG, FTAG, FTR, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY, HR and AR.
        Defaults to "premier_league_2223.csv".

    Returns
    -------
    RandomForestClassifier
        Model fitted on every labelled row of the file (no hold-out split).
        The target is 1 for a home win, 0 for a draw and -1 for an away win.
        Feature order: HomeTeam, AwayTeam, then the twelve statistic columns
        listed in ``stat_columns`` below.

    Raises
    ------
    KeyError
        If any of the required columns is missing from the file.
    """
    # Raw column -> readable name. Renaming by name cannot mislabel a column
    # the way assigning a positional list to df.columns can.
    readable_names = {
        'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals', 'FTR': 'Result',
        'HS': 'HomeShots', 'AS': 'AwayShots',
        'HST': 'HomeShotsOnTarget', 'AST': 'AwayShotsOnTarget',
        'HF': 'HomeFouls', 'AF': 'AwayFouls',
        'HC': 'HomeCorners', 'AC': 'AwayCorners',
        'HY': 'HomeYellowCards', 'AY': 'AwayYellowCards',
        'HR': 'HomeRedCards', 'AR': 'AwayRedCards',
    }
    stat_columns = ['HomeShots', 'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget',
                    'HomeFouls', 'AwayFouls', 'HomeCorners', 'AwayCorners',
                    'HomeYellowCards', 'AwayYellowCards', 'HomeRedCards', 'AwayRedCards']

    df = pd.read_csv(csv_path)

    # Log the available columns to help diagnose a file with a different layout
    print("Columns in dataset:", df.columns)

    # Filter and rename columns (raises KeyError if one is missing)
    df = df[['HomeTeam', 'AwayTeam', *readable_names]].rename(columns=readable_names)

    # Convert categorical variables (teams) to numeric category codes and the
    # result letter to 1 / 0 / -1
    df['HomeTeam'] = df['HomeTeam'].astype('category').cat.codes
    df['AwayTeam'] = df['AwayTeam'].astype('category').cat.codes
    df['Result'] = df['Result'].map({'H': 1, 'D': 0, 'A': -1})

    # Rows whose result is not H/D/A have no usable label: drop them rather
    # than imputing the target, which would create a fractional pseudo-class.
    df = df.dropna(subset=['Result']).astype({'Result': 'int64'})

    # Fill missing match statistics with the column mean
    df[stat_columns] = df[stat_columns].fillna(df[stat_columns].mean())

    # Features (X) and target variable (y)
    X = df[['HomeTeam', 'AwayTeam'] + stat_columns]
    y = df['Result']

    # Train the model (Random Forest); fixed seed for reproducible fits
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    return model

# Train the model once and reuse it. Streamlit re-executes this script on every
# widget interaction, so without caching the forest would be refit each time a
# slider moves.
model = st.cache_resource(train_model)()

# Streamlit interface
st.title('Premier League Match Outcome Prediction')

# Build the team-name -> code lookups from the training file, using the same
# categorical encoding as train_model (astype('category').cat.codes). A
# hand-written list of names and codes would not line up with the codes the
# model was trained on.
teams = pd.read_csv("premier_league_2223.csv", usecols=['HomeTeam', 'AwayTeam'])
home_team_codes = {name: code
                   for code, name in enumerate(teams['HomeTeam'].astype('category').cat.categories)}
away_team_codes = {name: code
                   for code, name in enumerate(teams['AwayTeam'].astype('category').cat.categories)}

# User input for home team and away team (only teams the model has seen)
home_team = st.selectbox('Select Home Team', list(home_team_codes))
away_team = st.selectbox('Select Away Team', list(away_team_codes))

# Collecting match stats (can be simplified, or more stats can be added)
home_shots = st.slider('Home Team Shots', 0, 50, 15)
away_shots = st.slider('Away Team Shots', 0, 50, 15)
home_shots_on_target = st.slider('Home Team Shots on Target', 0, 20, 5)
away_shots_on_target = st.slider('Away Team Shots on Target', 0, 20, 5)
home_fouls = st.slider('Home Team Fouls', 0, 30, 5)
away_fouls = st.slider('Away Team Fouls', 0, 30, 5)
home_corners = st.slider('Home Team Corners', 0, 20, 5)
away_corners = st.slider('Away Team Corners', 0, 20, 5)
home_yellow_cards = st.slider('Home Team Yellow Cards', 0, 5, 1)
away_yellow_cards = st.slider('Away Team Yellow Cards', 0, 5, 1)
home_red_cards = st.slider('Home Team Red Cards', 0, 5, 0)
away_red_cards = st.slider('Away Team Red Cards', 0, 5, 0)

# Translate the selected names into the codes used during training
home_team_code = home_team_codes[home_team]
away_team_code = away_team_codes[away_team]

# Prepare the input features as a single-row frame. The column names and order
# mirror the training features so scikit-learn can match them by name.
feature_columns = ['HomeTeam', 'AwayTeam', 'HomeShots', 'AwayShots',
                   'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeFouls', 'AwayFouls',
                   'HomeCorners', 'AwayCorners', 'HomeYellowCards', 'AwayYellowCards',
                   'HomeRedCards', 'AwayRedCards']
input_features = pd.DataFrame(
    [[home_team_code, away_team_code, home_shots, away_shots,
      home_shots_on_target, away_shots_on_target, home_fouls, away_fouls,
      home_corners, away_corners, home_yellow_cards, away_yellow_cards,
      home_red_cards, away_red_cards]],
    columns=feature_columns,
)

# Make prediction; predict() returns an array, so take its single element
# before comparing instead of relying on the truthiness of a 1-element array.
prediction = model.predict(input_features)
outcome = prediction[0]

# Show prediction result (1 = home win, 0 = draw, otherwise away win)
if outcome == 1:
    st.write("Prediction: Home Team Wins!")
elif outcome == 0:
    st.write("Prediction: Draw!")
else:
    st.write("Prediction: Away Team Wins!")


Columns in dataset: Index(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG',
       ...
       'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH',
       'MaxCAHA', 'AvgCAHH', 'AvgCAHA'],
      dtype='object', length=106)


2024-11-13 11:59:41.690 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-11-13 11:59:41.745 Session state does not function when running a script without `streamlit run`


In [65]:
# Launch the Streamlit app from a script file in the working directory.
# NOTE: this fails with "File does not exist: app.py" unless the app code has
# first been saved as app.py (for example by starting the app cell with
# `%%writefile app.py`).
!streamlit run app.py


Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py
