In [206]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import VotingRegressor
from joblib import dump


# TRAIN (don't open plz)

In [207]:
data = pd.read_csv('/content/my_data.csv')
df = pd.DataFrame(data)
df = df.drop(columns=['log_transformed'])

In [208]:
static_columns = []
dynamic_columns = []

for col in df.columns:
    if col in ['blend_id', 'component_name']:
        continue
    is_static = df.groupby('blend_id')[col].apply(lambda x: x.nunique() == 1).all()
    if is_static:
        static_columns.append(col)
    else:
        dynamic_columns.append(col)

print("Статические параметры:", static_columns)
print("Динамические параметры:", dynamic_columns)

duplicates = df.duplicated(subset=['blend_id', 'component_name'], keep=False)
if duplicates.any():
    print("\nПредупреждение: Найдены дубликаты. Используется агрегация (first):")
    print(df[duplicates])
    df = df.drop_duplicates(subset=['blend_id', 'component_name'], keep='first')

static_df = df.groupby('blend_id')[static_columns].first().reset_index()

pivot_dfs = []
for param in dynamic_columns:
    pivot_df = df.pivot_table(
        index='blend_id',
        columns='component_name',
        values=param,
        aggfunc='first'
    ).add_prefix(f'{param}_')
    pivot_dfs.append(pivot_df)

result_df = static_df.merge(pd.concat(pivot_dfs, axis=1), on='blend_id')

print("\nИтоговая таблица:")
result_df

Статические параметры: ['oil_property_value']
Динамические параметры: ['mass_fraction', 'component_type_title']

Предупреждение: Найдены дубликаты. Используется агрегация (first):
                                   blend_id  \
0      c090b033-87c0-4342-b737-fa7289b6f976   
1      c090b033-87c0-4342-b737-fa7289b6f976   
2      c090b033-87c0-4342-b737-fa7289b6f976   
3      c090b033-87c0-4342-b737-fa7289b6f976   
4      c090b033-87c0-4342-b737-fa7289b6f976   
...                                     ...   
94870  3826f353-92cb-4759-9625-8de04fd31f92   
94871  3826f353-92cb-4759-9625-8de04fd31f92   
94872  3826f353-92cb-4759-9625-8de04fd31f92   
94873  3826f353-92cb-4759-9625-8de04fd31f92   
94874  3826f353-92cb-4759-9625-8de04fd31f92   

                             component_name  mass_fraction  \
0      3a03d432-d849-417d-92e7-9a604187a096          67.00   
1      c33ebc5c-0935-4c6b-a489-f1404f88be22          33.00   
2      3a03d432-d849-417d-92e7-9a604187a096          67.00   
3      

Unnamed: 0,blend_id,oil_property_value,mass_fraction_0962b43d-14b3-4bc5-b7b4-34500629ff48,mass_fraction_0a3f8afa-864c-4d53-83fb-714ef27dc481,mass_fraction_0c047bc0-0041-45f7-9222-d2ec5fd2df38,mass_fraction_12c7dfaa-3b44-445e-9dcd-9986cd64588e,mass_fraction_1864d2ec-c58d-498d-98e8-4cf2cbff06d0,mass_fraction_1ec8ea9b-f26c-4ff9-bd23-3f69ba63151a,mass_fraction_22c43eee-8a3a-4bdb-a18d-6e474b6624de,mass_fraction_240187a5-a150-4a92-9e53-8d964dedf3fb,...,component_type_title_96666028-8268-41c0-92e4-cb08346a232d,component_type_title_9a8b4758-a712-4114-a28f-775e5f7f519c,component_type_title_9c948f48-0165-4291-8db6-bfe73a534785,component_type_title_a07715b4-8762-491c-817c-170633ec220e,component_type_title_a9c1305e-3832-4ddc-a6c4-24e3a649d2e6,component_type_title_c33ebc5c-0935-4c6b-a489-f1404f88be22,component_type_title_cb3bbf9b-6a34-46c1-bce8-9c7b5e2cbb50,component_type_title_cf40823a-4b86-415c-b841-a9ead84d9fcf,component_type_title_e206e647-919b-4d8e-b840-e79dbd46eb27,component_type_title_e468b13a-e145-4e03-a68e-1bdbecec3aae
0,00000d27-5cbc-45d1-8f32-169f158435b3,6.750,,,,,,,,,...,,,,,,5360a311-e081-4972-9215-26b5d9072f65,,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,,
1,016d3413-ba63-4ee4-8925-b6b9a631e33c,6.885,30.000,,,,,,48.46,,...,,,,,,,,,,
2,021bc9d5-473f-49b2-8bd4-cd40221af7bc,7.440,78.233,,,,,,,,...,,,,,,,,,,
3,030d200c-7abe-4d1f-a43f-2043bb127ed2,7.000,,,,,,,55.20,,...,,,,,,,,,,
4,038a4242-87e4-4971-a2a4-534da0a39a95,7.870,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,fda95f96-2fac-419a-b1ec-a9d3fc300940,6.560,,,,,,,,,...,,,,,,5360a311-e081-4972-9215-26b5d9072f65,,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,,
464,fde9a9fd-d996-4d17-b8c6-dd4d24c4a73b,10.450,63.910,,,,,,,,...,,,,,,,,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,,
465,fe0e1171-0c63-41f4-93af-7ccc7d6a0aad,3.810,,,,,,,,,...,,,,,,,,,,
466,ff02ed11-db18-4dc8-9548-321e6714170c,6.710,,,,,,,,,...,,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,,,,5360a311-e081-4972-9215-26b5d9072f65,,,,


In [210]:
data = result_df.fillna(0)

In [211]:
numeric_cols = data.select_dtypes(include=['float64']).columns.tolist()

In [212]:
categorical_features = [
    col for col in data.columns
    if data[col].dtype == 'object' or (data[col].dtype in ['int64', 'int32'] and data[col].nunique() / len(data) < 0.1)
]


In [213]:
X = data.drop('oil_property_value', axis=1)
y = data['oil_property_value']

In [214]:
for col in categorical_features:
    X[col] = X[col].astype(str)

In [215]:
encoder = OrdinalEncoder()
scaler = StandardScaler()

In [217]:
X[categorical_features] = encoder.fit_transform(X[categorical_features])
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [262]:
ensemble_model = VotingRegressor(
    estimators=[
        ('random_forest', RandomForestRegressor(
            max_depth=12,
            random_state=42,
            n_estimators=2000
            )),
        ('gradient_boosting', GradientBoostingRegressor(
            n_estimators=2000,
            max_depth=12,
            learning_rate=0.05
        )),
        ('hist_gradient_boosting', HistGradientBoostingRegressor(random_state=42, max_iter=2000, max_depth=12, learning_rate=0.005))
    ],
    weights=[1, 2, 1.5]
)

ensemble_model.fit(X_train, y_train)


In [263]:
y_pred = ensemble_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")

MAE: 0.36501915056786516


In [272]:
dump(ensemble_model, 'model.joblib')

['model.joblib']

In [273]:
y_pred

array([ 6.52614419,  6.49194477,  4.80435991,  8.59165071,  8.2375767 ,
       10.32640839,  7.08998802,  6.89120364,  4.81708341,  6.50554185,
        5.54605904,  6.54829506,  6.14059429,  6.68802201,  8.04888672,
        6.06920619,  5.20578339,  6.46511926,  8.76743613,  8.44301245,
        6.76384693,  6.06930198,  7.52777316,  7.48459618,  8.02756532,
        8.44769696,  5.84299048,  8.30010744,  5.7312975 ,  6.16922033,
        6.29208686,  8.33902507,  6.15206817,  6.95359624, 10.65756031,
        6.83944256,  6.52369171,  8.07497278,  6.83138032,  8.37750017,
        6.28610775,  6.44757744,  8.49828714,  5.69930265,  6.28187352,
        6.1756806 ,  8.29376927,  5.54644212,  6.68453936,  7.96970088,
        5.6848182 ,  6.13078847,  5.78061083,  6.62461218,  5.48722687,
        8.36203254,  7.07591687,  5.41132038,  5.82699429,  8.64646113,
       10.31541544,  7.98256511,  5.99532905,  9.25059883,  6.01285634,
       10.34486909,  9.05157453,  6.81113734,  8.80873978,  5.47

In [274]:
np.array(y_test)

array([ 6.54 ,  6.94 ,  4.41 ,  8.25 ,  8.34 , 10.85 ,  7.03 ,  6.78 ,
        4.51 ,  5.982,  5.68 ,  7.01 ,  6.51 ,  6.727,  7.674,  6.53 ,
        4.64 ,  6.51 , 10.175,  8.39 ,  6.75 ,  6.24 ,  7.41 ,  7.46 ,
        8.16 ,  8.389,  5.69 ,  8.946,  5.64 ,  6.16 ,  6.51 ,  8.32 ,
        6.043,  6.54 , 10.34 ,  7.441,  6.62 ,  7.89 ,  6.66 ,  8.29 ,
        6.09 ,  6.89 ,  8.62 ,  5.61 ,  6.17 ,  5.72 ,  8.458,  5.5  ,
        6.89 ,  8.3  ,  5.47 ,  6.328,  5.67 ,  6.2  ,  5.56 ,  8.247,
       11.09 ,  5.47 ,  5.63 ,  8.72 , 10.57 ,  8.4  ,  5.4  ,  9.23 ,
        7.67 , 11.36 ,  9.1  ,  6.49 ,  8.92 ,  5.49 ,  5.842,  4.966,
        6.88 ,  8.29 ,  7.23 ,  8.92 , 10.57 , 10.64 ,  6.437,  6.23 ,
        6.45 ,  6.56 , 10.52 ,  9.38 ,  4.46 ,  5.82 , 10.9  ,  5.5  ,
        7.87 ,  4.965,  5.78 ,  9.768,  5.36 ,  6.68 ])

# GO

In [276]:
# 3a03d432-d849-417d-92e7-9a604187a096 ——— 67.0 ——— 5360a311-e081-4972-9215-26b5d9072f65
# c33ebc5c-0935-4c6b-a489-f1404f88be22 ——— 33.0 ——— 5360a311-e081-4972-9215-26b5d9072f65

user_input = {
    'blend_id': ['blend1', 'blend1'],
    'component_name': ['3a03d432-d849-417d-92e7-9a604187a096', 'c33ebc5c-0935-4c6b-a489-f1404f88be22'],
    'mass_fraction': [67.0, 33.0],
    'component_type_title': ['5360a311-e081-4972-9215-26b5d9072f65', '5360a311-e081-4972-9215-26b5d9072f65'],
}
model_go = joblib.load('model.joblib')

In [277]:
df_test = pd.DataFrame(user_input)

result = (
    df_test.groupby('blend_id')
    .apply(lambda x: pd.Series({
        f'mass_fraction_{uuid}': x.loc[x['component_name'] == uuid, 'mass_fraction'].sum()
        for uuid in df_test['component_name']
    }))
    .reset_index()
    .fillna(0)
)
if 'oil_property_value' in result.columns:
    result = result.drop(columns=['oil_property_value'])

for col in target_columns:
    if col not in result.columns:
        result[col] = 0

result = result[target_columns]
# print(result)

res = result.drop(columns=['oil_property_value'])
result = result.apply(lambda x: x.astype('float64', errors='ignore') if x.dtype.kind in 'iufc' else x)
res[categorical_features] = encoder.fit_transform(res[categorical_features])

  .apply(lambda x: pd.Series({


In [278]:
model_go.predict(res)

array([5.4439497])