In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Name of the CSV column used as the single model feature.
chess_rating = "chess_blitz.last.rating"

players_df = pd.read_csv(
    "./csv/players.csv", usecols=["title", chess_rating], low_memory=False
)

# Position in this list is the integer class label assigned to each title.
TITLES: list[str] = ["GM", "IM", "FM", "CM", "NM", "WGM", "WIM", "WFM", "WCM", "WNM"]
title_to_index = {title: index for index, title in enumerate(TITLES)}

# Keep only rows with a positive rating and a recognised title.  Missing
# ratings compare False against 0 and missing/unknown titles fail `isin`, so
# both are dropped here instead of raising ValueError during label encoding.
# (The previous `sk_df.fillna(0)` discarded its result and had no effect.)
# The explicit copy makes the column assignments below operate on our own
# frame rather than on a slice of `players_df`.
valid_rows = (players_df[chess_rating] > 0) & players_df["title"].isin(TITLES)
sk_df = players_df.loc[valid_rows, ["title", chess_rating]].copy()

# Encode the title as its index in TITLES.
sk_df["title_index"] = sk_df["title"].map(title_to_index).astype(int)

# Median rating of each title group, broadcast back onto every row of that
# group in a single pass (replaces a per-row re-filter of the whole frame).
sk_df["median"] = sk_df.groupby("title_index")[chess_rating].transform("median")

# Retain only players rated strictly above the median of their own title.
sk_df = sk_df[sk_df[chess_rating] > sk_df["median"]]
print(sk_df[["title_index", "median"]])

# Target is the encoded title; the only feature is the rating column.
y = sk_df["title_index"]
X = sk_df[[chess_rating]]  # list selector keeps X as a one-column DataFrame

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same
# transformation to both the training and the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X_train_scaled)
print(X_test_scaled)

# Build the logistic-regression classifier and fit it on the scaled
# training data.
model = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict on the held-out rows and report the fraction predicted correctly.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")