In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score, f1_score,
    mean_squared_error, mean_absolute_error, r2_score
)

In [16]:
def time_to_seconds(t):
    if pd.isna(t):
        return np.nan
    parts = t.split(':')
    try:
        if len(parts) == 2:
            minutes, seconds = parts
            return int(minutes) * 60 + float(seconds)
        elif len(parts) == 3:
            hours, minutes, seconds = parts
            return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
        else:
            return float(t)
    except:
        return np.nan

In [17]:
df = pd.read_csv('Formual_E_Raceresults.csv')


# 3. Конвертация строковых временных колонок
for col in ['Best', 'Started', 'Time']:
    df[f'{col}_sec'] = df[col].apply(time_to_seconds)

# 4. Извлечение признаков и целей
# 4.1 Год сезона — ищем подряд 4 цифры и преобразуем в число
df['SeasonYear'] = (
    df['SeasonName']
      .str.extract(r'(\d{4})', expand=False)   # вытащить первую группу из 4 цифр
      .astype(float)                           # сначала в float, чтобы NaN появились
)
# Откинуть строки без года
df = df[df['SeasonYear'].notna()].copy()
df['SeasonYear'] = df['SeasonYear'].astype(int)

# 4.2 Полное имя гонщика
df['Driver'] = df['DriverFirstName'] + ' ' + df['DriverLastName']

# 4.3 Бинарная цель: попадание на подиум (Pos ≤ 3)
df['podium'] = (df['Pos'] <= 3).astype(int)

# 5. Список признаков
features = [
    'SeasonYear',
    'Team',
    'DriverNumber',
    'Driver',
    'Best_sec',
    'Started_sec',
    'Time_sec'
]

In [18]:
df.head(10)

Unnamed: 0,SeasonName,RaceName,Pos,DriverNumber,DriverFirstName,DriverLastName,Team,Started,Best,Time,PtsPoints,Best_sec,Started_sec,Time_sec,SeasonYear,Driver,podium
0,Season 1 2014/15,Beijing E-Prix,1,#11,Lucas,Di Grassi,Audi Sport ABT Formula E Team,2,1:46.718,52:23.413,25,106.718,2.0,3143.413,2014,Lucas Di Grassi,1
1,Season 1 2014/15,Beijing E-Prix,2,#27,Franck,Montagny,Andretti Autosport Formula E Team,8,1:46.640,+ 2.867,18,106.64,8.0,,2014,Franck Montagny,1
2,Season 1 2014/15,Beijing E-Prix,3,#2,Sam,Bird,Virgin Racing Formula E Team,11,1:46.563,+ 6.559,15,106.563,11.0,,2014,Sam Bird,1
3,Season 1 2014/15,Beijing E-Prix,4,#28,Charles,Pic,Andretti Autosport Formula E Team,7,1:46.730,+ 19.301,12,106.73,7.0,,2014,Charles Pic,0
4,Season 1 2014/15,Beijing E-Prix,5,#5,Karun,Chandhok,Mahindra Racing Formula E Team,4,1:45.892,+ 23.952,10,105.892,4.0,,2014,Karun Chandhok,0
5,Season 1 2014/15,Beijing E-Prix,6,#7,Jérôme,D'Ambrosio,Dragon Racing Formula E Team,12,1:47.313,+ 31.664,8,107.313,12.0,,2014,Jérôme D'Ambrosio,0
6,Season 1 2014/15,Beijing E-Prix,7,#6,Oriol,Servià,Dragon Racing Formula E Team,10,1:47.361,+ 41.968,6,107.361,10.0,,2014,Oriol Servià,0
7,Season 1 2014/15,Beijing E-Prix,8,#99,Nelson,Piquet Jr.,China Racing Formula E Team,9,1:47.819,+ 43.896,4,107.819,9.0,,2014,Nelson Piquet Jr.,0
8,Season 1 2014/15,Beijing E-Prix,9,#30,Stéphane,Sarrazin,Venturi Formula E Team,19,1:47.832,+ 43.975,2,107.832,19.0,,2014,Stéphane Sarrazin,0
9,Season 1 2014/15,Beijing E-Prix,10,#66,Daniel,Abt,Audi Sport ABT Formula E Team,3,1:46.551,+ 1:02.507,1,106.551,3.0,,2014,Daniel Abt,0


In [19]:
num_feats = ['SeasonYear', 'Best_sec', 'Started_sec', 'Time_sec']
cat_feats = ['Team', 'DriverNumber', 'Driver']

numeric_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_feats),
    ('cat', categorical_transformer, cat_feats),
])

# Классификация “подиум/не-подиум”

In [20]:
X_clf = df[features]
y_clf = df['podium']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf, y_clf,
    test_size=0.2,
    stratify=y_clf,
    random_state=42
)

pipe_lr_clf = Pipeline([
    ('prep', preprocessor),
    ('scale', StandardScaler(with_mean=False)),
    ('clf',   LogisticRegression(max_iter=1000))
])

pipe_gb_clf = Pipeline([
    ('prep', preprocessor),
    ('clf',   GradientBoostingClassifier(n_estimators=200, random_state=42))
])

pipe_lr_clf.fit(X_train_c, y_train_c)
pipe_gb_clf.fit(X_train_c, y_train_c)

y_pred_lr = pipe_lr_clf.predict(X_test_c)
y_proba_lr = pipe_lr_clf.predict_proba(X_test_c)[:,1]
y_pred_gb = pipe_gb_clf.predict(X_test_c)
y_proba_gb = pipe_gb_clf.predict_proba(X_test_c)[:,1]

metrics_clf = pd.DataFrame({
    'Model': ['LogisticRegression', 'GradientBoosting'],
    'Accuracy': [
        accuracy_score(y_test_c, y_pred_lr),
        accuracy_score(y_test_c, y_pred_gb)
    ],
    'ROC AUC': [
        roc_auc_score(y_test_c, y_proba_lr),
        roc_auc_score(y_test_c, y_proba_gb)
    ],
    'Precision': [
        precision_score(y_test_c, y_pred_lr),
        precision_score(y_test_c, y_pred_gb)
    ],
    'Recall': [
        recall_score(y_test_c, y_pred_lr),
        recall_score(y_test_c, y_pred_gb)
    ],
    'F1-score': [
        f1_score(y_test_c, y_pred_lr),
        f1_score(y_test_c, y_pred_gb)
    ]
}).set_index('Model')

print("Classification metrics:\n", metrics_clf)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification metrics:
                     Accuracy   ROC AUC  Precision    Recall  F1-score
Model                                                                
LogisticRegression  0.657807  0.645965   0.475610  0.393939  0.430939
GradientBoosting    0.727575  0.720672   0.630769  0.414141  0.500000


In [22]:
X_reg = df[features]
y_reg = df['Pos']

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

pipe_lr_reg = Pipeline([
    ('prep', preprocessor),
    ('scale', StandardScaler(with_mean=False)),
    ('reg',   LinearRegression())
])
pipe_gb_reg = Pipeline([
    ('prep', preprocessor),
    ('reg',   GradientBoostingRegressor(n_estimators=200, random_state=42))
])

pipe_lr_reg.fit(X_train_r, y_train_r)
pipe_gb_reg.fit(X_train_r, y_train_r)

y_pred_lr_r = pipe_lr_reg.predict(X_test_r)
y_pred_gb_r = pipe_gb_reg.predict(X_test_r)

# RMSE вручную через sqrt(MSE), чтобы избежать TypeError
rmse_lr = np.sqrt(mean_squared_error(y_test_r, y_pred_lr_r))
rmse_gb = np.sqrt(mean_squared_error(y_test_r, y_pred_gb_r))

metrics_reg = pd.DataFrame({
    'Model': ['LinearRegression', 'GradientBoosting'],
    'RMSE': [rmse_lr, rmse_gb],
    'MAE': [
        mean_absolute_error(y_test_r, y_pred_lr_r),
        mean_absolute_error(y_test_r, y_pred_gb_r)
    ],
    'R2': [
        r2_score(y_test_r, y_pred_lr_r),
        r2_score(y_test_r, y_pred_gb_r)
    ]
}).set_index('Model')

print("\nRegression metrics:\n", metrics_reg)


Regression metrics:
                       RMSE       MAE        R2
Model                                         
LinearRegression  6.129629  4.929065 -0.082406
GradientBoosting  5.778500  4.637041  0.038050
