# Step 1: Setup

Import libraries and initialize MLflow experiment.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn

# Make the baseline-regression experiment the active MLflow experiment so
# that runs started later in the notebook are recorded under it.
experiment_name = "FPL_Baseline_Regression"
mlflow.set_experiment(experiment_name)

# Step 2: Data Loading & Inspection

Load the dataset and inspect its structure.

In [None]:
# Location of the raw CSV export used throughout the notebook.
dataset_path = '/mnt/data/dataset_ver3.2.5.csv'

# Read the whole file into a DataFrame.
df = pd.read_csv(dataset_path)

# Print column dtypes / non-null counts, then preview the first rows
# (the preview is the cell's displayed output).
df.info()
df.head()

# Step 3: Preprocessing

Parse list-like columns, convert numeric columns to float, and drop rows that are missing the target or key features.

In [None]:
from ast import literal_eval

# Columns whose cells are expected to hold list-like values (for example the
# string "['MID']"); only the first element of each list is kept.
list_cols = ['Pos', 'Opposition', 'Start']


def _first_item(value):
    """Return the first element of a list-like cell, or the cell unchanged.

    Args:
        value: A raw cell value. Strings are parsed with ``literal_eval``;
            lists and tuples are reduced to their first element; anything
            else (numbers, NaN, ...) is passed through untouched.

    Returns:
        The first element of the (parsed) list, ``np.nan`` for an empty
        list, or the original value when it is not list-like.
    """
    if isinstance(value, str):
        try:
            value = literal_eval(value)
        except (ValueError, SyntaxError):
            # Plain text such as "MID" is not a Python literal; keep it as is
            # instead of aborting the whole column conversion.
            return value
    if isinstance(value, (list, tuple)):
        # An empty list has no first element; treat it as a missing value.
        return value[0] if value else np.nan
    # Scalars (including strings that parsed to a scalar) pass through.
    return value


# Collapse each list-like column to the first element of its list.
for col in list_cols:
    df[col] = df[col].apply(_first_item)

# Convert percentage and numeric columns to float.
ownership = df['Ownership, %']
if not pd.api.types.is_numeric_dtype(ownership):
    # The ``.str`` accessor only exists on text columns; if read_csv already
    # inferred a numeric dtype there is no '%' suffix to strip.
    ownership = ownership.str.rstrip('%')
df['Ownership, %'] = ownership.astype(float)
df['Price'] = df['Price'].astype(float)
df['Min'] = df['Min'].astype(float)

# Drop rows with missing target or essential features.
# NOTE(review): 'Price', 'Ownership, %' and 'Selected' are also used as model
# features in Step 4 but are not checked here — confirm they never hold NaN.
df = df.dropna(subset=['Points', 'Min', 'GlsPrev', 'AstPrev', 'CSPrev', 'PrevPoints'])

# Confirm preprocessing
df.info()

# Step 4: Feature Engineering (Baseline)

Select the baseline features and the target variable, then one-hot encode the categorical features.

In [None]:
# Baseline inputs, split by kind: numeric columns are used as-is, categorical
# columns are expanded into indicator columns below.
numeric_features = ['Min', 'GlsPrev', 'AstPrev', 'CSPrev', 'PrevPoints', 'Price', 'Ownership, %', 'Selected']
categorical_features = ['Pos', 'Team', 'Opposition']
baseline_features = numeric_features + categorical_features

# Target: the 'Points' column.
y = df['Points']

# Feature matrix: an independent copy of the selected columns, with each
# categorical column one-hot encoded (first level dropped).
X = pd.get_dummies(
    df[baseline_features].copy(),
    columns=categorical_features,
    drop_first=True,
)

# Preview the resulting feature set.
X.head()