<a href="https://colab.research.google.com/github/kasa10/Digital_Team_Gazpromneft_MIPT/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
df=pd.read_csv('/content/drive/MyDrive/train.csv')
train, test = train_test_split(df, test_size=0.33, random_state=42)

def explore_data():
    """ Make some visualizations and collect statistics about data in files """

    print(f'Размер train таблицы {train.shape}')
    print(f'Размер test таблицы {test.shape}')

    print(f'Количество уникальных скважин в train таблице: {len(train["Номер скважины"].unique())}')
    print(f'Количество уникальных скважин в test таблице: {len(test["Номер скважины"].unique())}')

    wells = list(train["Номер скважины"].unique())
    well_df = train[train["Номер скважины"] == wells[0]]
    print(well_df)
    print(f'Размер таблицы с данными для одной скважины {well_df.shape}')


if __name__ == '__main__':
    explore_data()

Размер train таблицы (44981, 20)
Размер test таблицы (22155, 20)
Количество уникальных скважин в train таблице: 106
Количество уникальных скважин в test таблице: 106
         datetime  Номер скважины  Дебит нефти  Давление забойное         x  \
39375  1992-03-18              61    12.608500          76.999421  15133.22   
38877  1990-11-06              61     8.382000          34.534516  15133.22   
39100  1991-06-17              61    15.605000          33.829684  15133.22   
39154  1991-08-10              61    11.530000          32.084234  15133.22   
39175  1991-08-31              61    10.479000          32.225702  15133.22   
...           ...             ...          ...                ...       ...   
39298  1992-01-01              61     7.908000          31.461979  15133.22   
39081  1991-05-29              61    13.315000          34.222931  15133.22   
39384  1992-03-27              61    11.324800          53.463181  15133.22   
39353  1992-02-25              61     8.1940

In [2]:
pip install fedot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas<1.3.0,>=1.1.0
  Using cached pandas-1.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (9.9 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.3.0
    Uninstalling pandas-1.3.0:
      Successfully uninstalled pandas-1.3.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xarray-einstats 0.2.2 requires numpy>=1.21, but you have numpy 1.19.5 which is incompatible.
plotnine 0.6.0 requires matplotlib>=3.1.1, but you have matplotlib 3.0.2 which is incompatible.
mizani 0.6.0 requires matplotlib>=3.1.1, but you have matplotlib 3.0.2 which is incompatible.[0m
Successfully installed pandas-1.2.5


In [4]:
import pandas as pd
import numpy as np

from fedot.api.main import Fedot
from fedot.core.repository.tasks import TsForecastingParams
from tqdm import tqdm

FORECAST_HORIZON = 90


def make_fit_predict(historical_values: pd.DataFrame,
                     forecast_horizon: int = FORECAST_HORIZON):
    """
    Используется одномерный временной ряд для обучения FEDOT модели
    Затем используется временной ряд как предыстория для формирования прогноза
    в будущее
    """
    time_series = np.array(historical_values["Дебит нефти"])

    model = Fedot(problem='ts_forecasting',
                  task_params=TsForecastingParams(forecast_length=forecast_horizon),
                  timeout=0.5, preset='fast_train', n_jobs=-1)

    # run AutoML model design in the same way
    pipeline = model.fit(features=time_series, target=time_series,
                         predefined_model='auto')
    forecast = model.predict(time_series)

    # Generate pipeline with datetime and predicted column
    date_range = pd.date_range(start='1992-04-11', freq='1D', periods=FORECAST_HORIZON)
    forecast_df = pd.DataFrame({'datetime': date_range, 'forecast': forecast})
    return forecast_df


def launch_baseline():
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    wells = list(train_df["Номер скважины"].unique())

    all_forecasts = []
    with tqdm(total=len(wells)) as pbar:
        for well in wells:
            well_df = train_df[train_df["Номер скважины"] == well]

            # Make predictions with FEDOT framework
            forecats_df = make_fit_predict(well_df)
            forecats_df["Номер скважины"] = [well] * len(forecats_df)
            all_forecasts.append(forecats_df)

            pbar.update(1)

    all_forecasts = pd.concat(all_forecasts)
    all_forecasts.to_csv('baseline_forecast.csv', index=False)


if __name__ == '__main__':
    launch_baseline()

100%|██████████| 106/106 [01:16<00:00,  1.39it/s]


In [9]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from tqdm import tqdm


def get_project_path() -> str:
    return Path(__file__).parent


def calculate_final_score(path_to_csv_file: str, vis: bool = False):
    """ Расчет метрики по таблице с предсказаниями алгоритма """
    df_with_metrics = pd.read_csv(path_to_csv_file, parse_dates=['datetime'])
    wells = list(df_with_metrics["Номер скважины"].unique())

    train_df = pd.read_csv(os.path.join(get_project_path(), 'data', 'train.csv'), parse_dates=['datetime'])
    test_df = pd.read_csv(os.path.join(get_project_path(), 'data', 'test.csv'), parse_dates=['datetime'])
    metrics = []
    with tqdm(total=len(wells)) as pbar:
        for well in wells:
            well_forecast_df = df_with_metrics[df_with_metrics["Номер скважины"] == well]
            well_forecast_df = well_forecast_df.sort_values(by='datetime')

            well_actual_df = test_df[test_df["Номер скважины"] == well]
            well_actual_df = well_actual_df.sort_values(by='datetime')

            rmse_metric = mean_squared_error(np.array(well_actual_df['Дебит нефти']),
                                             np.array(well_forecast_df['forecast']),
                                             squared=False)
            metrics.append(rmse_metric)

            if vis:
                # Create plot
                historical_df = train_df[train_df["Номер скважины"] == well]

                plt.plot(historical_df['datetime'], historical_df['Дебит нефти'], label='Train')
                plt.plot(well_actual_df['datetime'], well_actual_df['Дебит нефти'], label='Test')
                plt.plot(well_forecast_df['datetime'], well_forecast_df['forecast'], label='Forecast')
                plt.grid()
                plt.legend()
                plt.xlabel('Дата')
                plt.ylabel('Дебит нефти')
                plt.show()
            pbar.update(1)

    metrics = np.array(metrics)
    return np.mean(metrics)



metric = calculate_final_score(path_to_csv_file='/content/baseline_forecast.csv', vis=True)
print(f'Метрика RMSE {metric:.2f}')


NameError: ignored