In [1]:
import pandas as pd

In [12]:
# Read the raw table, then build the working frame without the
# 'log_transformed' column, which is not used further in this notebook.
data = pd.read_csv('data/my_data.csv')
df = pd.DataFrame(data).drop(columns=['log_transformed'])

In [None]:
# Split the non-key columns into two groups:
#   * static  - exactly one distinct non-null value inside every blend;
#   * dynamic - at least one blend where the value varies (or is entirely missing).
key_columns = ('blend_id', 'component_name')
per_blend = df.groupby('blend_id')

static_columns = []
dynamic_columns = []
for name in df.columns:
    if name in key_columns:
        continue
    # nunique() ignores NaN, so a blend whose values are all missing counts
    # as 0 distinct values and makes the column dynamic.
    is_static = per_blend[name].nunique().eq(1).all()
    (static_columns if is_static else dynamic_columns).append(name)

print("Статические параметры:", static_columns)
print("Динамические параметры:", dynamic_columns)

# A (blend_id, component_name) pair should identify one row. Flag every row
# that shares its pair with another row, report them, and keep only the first
# occurrence of each pair.
pair_keys = ['blend_id', 'component_name']
duplicates = df.duplicated(subset=pair_keys, keep=False)
if duplicates.any():
    print("\nПредупреждение: Найдены дубликаты. Используется агрегация (first):")
    print(df.loc[duplicates])
    # Rows marked by duplicated(keep='first') are the repeats to discard.
    repeated_rows = df.duplicated(subset=pair_keys, keep='first')
    df = df[~repeated_rows]

# One row per blend with the blend-level (static) values; groupby().first()
# takes the first non-null value of each static column within the blend.
static_df = df.groupby('blend_id')[static_columns].first().reset_index()

# Spread every per-component (dynamic) parameter into one column per
# component, named '<param>_<component_name>'.
pivot_dfs = []
for param in dynamic_columns:
    pivot_df = df.pivot_table(
        index='blend_id',
        columns='component_name',
        values=param,
        aggfunc='first'
    ).add_prefix(f'{param}_')
    pivot_dfs.append(pivot_df)

if pivot_dfs:
    # The concatenated pivots are indexed by 'blend_id'; merge() accepts an
    # index level name in `on`, so this joins them onto the static columns.
    result_df = static_df.merge(pd.concat(pivot_dfs, axis=1), on='blend_id')
else:
    # pd.concat() raises ValueError on an empty list; with no dynamic columns
    # the wide table is just the static part.
    result_df = static_df

print("\nИтоговая таблица:")
print(result_df)

Статические параметры: ['oil_property_value']
Динамические параметры: ['mass_fraction', 'component_type_title']

Предупреждение: Найдены дубликаты. Используется агрегация (first):
                                   blend_id  \
0      c090b033-87c0-4342-b737-fa7289b6f976   
1      c090b033-87c0-4342-b737-fa7289b6f976   
2      c090b033-87c0-4342-b737-fa7289b6f976   
3      c090b033-87c0-4342-b737-fa7289b6f976   
4      c090b033-87c0-4342-b737-fa7289b6f976   
...                                     ...   
94870  3826f353-92cb-4759-9625-8de04fd31f92   
94871  3826f353-92cb-4759-9625-8de04fd31f92   
94872  3826f353-92cb-4759-9625-8de04fd31f92   
94873  3826f353-92cb-4759-9625-8de04fd31f92   
94874  3826f353-92cb-4759-9625-8de04fd31f92   

                             component_name  mass_fraction  \
0      3a03d432-d849-417d-92e7-9a604187a096          67.00   
1      c33ebc5c-0935-4c6b-a489-f1404f88be22          33.00   
2      3a03d432-d849-417d-92e7-9a604187a096          67.00   
3      

In [14]:
# Replace every missing cell of the wide table with 0. This applies to all
# columns, including the non-numeric 'component_type_title_*' ones, which
# therefore end up holding a mix of strings and the integer 0.
result_df = result_df.fillna(0)
# Bare expression: the notebook renders the frame as the cell output.
result_df

Unnamed: 0,blend_id,oil_property_value,mass_fraction_0962b43d-14b3-4bc5-b7b4-34500629ff48,mass_fraction_0a3f8afa-864c-4d53-83fb-714ef27dc481,mass_fraction_0c047bc0-0041-45f7-9222-d2ec5fd2df38,mass_fraction_12c7dfaa-3b44-445e-9dcd-9986cd64588e,mass_fraction_1864d2ec-c58d-498d-98e8-4cf2cbff06d0,mass_fraction_1ec8ea9b-f26c-4ff9-bd23-3f69ba63151a,mass_fraction_22c43eee-8a3a-4bdb-a18d-6e474b6624de,mass_fraction_240187a5-a150-4a92-9e53-8d964dedf3fb,...,component_type_title_96666028-8268-41c0-92e4-cb08346a232d,component_type_title_9a8b4758-a712-4114-a28f-775e5f7f519c,component_type_title_9c948f48-0165-4291-8db6-bfe73a534785,component_type_title_a07715b4-8762-491c-817c-170633ec220e,component_type_title_a9c1305e-3832-4ddc-a6c4-24e3a649d2e6,component_type_title_c33ebc5c-0935-4c6b-a489-f1404f88be22,component_type_title_cb3bbf9b-6a34-46c1-bce8-9c7b5e2cbb50,component_type_title_cf40823a-4b86-415c-b841-a9ead84d9fcf,component_type_title_e206e647-919b-4d8e-b840-e79dbd46eb27,component_type_title_e468b13a-e145-4e03-a68e-1bdbecec3aae
0,00000d27-5cbc-45d1-8f32-169f158435b3,6.750,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,5360a311-e081-4972-9215-26b5d9072f65,0,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,0,0
1,016d3413-ba63-4ee4-8925-b6b9a631e33c,6.885,30.000,0.0,0.0,0.0,0.0,0.0,48.46,0.0,...,0,0,0,0,0,0,0,0,0,0
2,021bc9d5-473f-49b2-8bd4-cd40221af7bc,7.440,78.233,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
3,030d200c-7abe-4d1f-a43f-2043bb127ed2,7.000,0.000,0.0,0.0,0.0,0.0,0.0,55.20,0.0,...,0,0,0,0,0,0,0,0,0,0
4,038a4242-87e4-4971-a2a4-534da0a39a95,7.870,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,fda95f96-2fac-419a-b1ec-a9d3fc300940,6.560,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,5360a311-e081-4972-9215-26b5d9072f65,0,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,0,0
464,fde9a9fd-d996-4d17-b8c6-dd4d24c4a73b,10.450,63.910,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,0,0
465,fe0e1171-0c63-41f4-93af-7ccc7d6a0aad,3.810,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,0,0,0,0,0,0,0,0,0
466,ff02ed11-db18-4dc8-9548-321e6714170c,6.710,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,...,0,1df5bd53-c7d2-422f-899c-0a6e1d638c6a,0,0,0,5360a311-e081-4972-9215-26b5d9072f65,0,0,0,0


In [16]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Categorical features must be named after columns of the matrix the model is
# trained on (the wide `result_df` without the id and the target), not after
# the long-format `df` columns, which do not exist in that matrix.
feature_frame = result_df.drop(columns=['blend_id', 'oil_property_value'])
n_rows = len(feature_frame)

categorical_features = [
    col for col in feature_frame.columns
    # Text columns are always categorical.
    if feature_frame[col].dtype == 'object'
    # Integer columns count as categorical when they have few distinct values
    # relative to the number of rows (n_rows > 0 avoids a division by zero).
    or (
        feature_frame[col].dtype in ['int64', 'int32']
        and n_rows > 0
        and feature_frame[col].nunique() / n_rows < 0.1
    )
]

In [17]:
# Target is the blend-level property; features are every other column except
# the blend identifier.
y = result_df['oil_property_value']
X = result_df.drop(columns=['blend_id', 'oil_property_value'])

# Hold out 10% of the blends for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

NameError: name 'train_test_split' is not defined

In [None]:
# Hyper-parameters are collected in one mapping so they can be inspected or
# logged alongside the model.
catboost_params = {
    'iterations': 4000,
    'learning_rate': 0.005,
    'depth': 8,
    'l2_leaf_reg': 3,
    # MAE is both the optimised loss and the reported metric.
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    # Each tree is built on a 70% Bernoulli sample of the rows.
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'random_seed': 42,
    # Overfitting detector settings.
    'od_type': 'Iter',
    'od_wait': 100,
    # Print progress every 100 iterations.
    'verbose': 100,
}
model = CatBoostRegressor(**catboost_params)

In [None]:
# No validation set is defined earlier in the notebook, so carve one out of the
# training data; the test split stays untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# CatBoost needs categorical columns named explicitly and they must exist in
# the training matrix: keep the requested names that are present and add any
# text column, which CatBoost cannot treat as numeric.
cat_feature_names = [
    col for col in X_fit.columns
    if col in categorical_features or X_fit[col].dtype == 'object'
]

model.fit(
    X_fit, y_fit,
    eval_set=(X_val, y_val),
    early_stopping_rounds=100,
    # The fit() keyword is `cat_features`; `categorical_features` is not accepted.
    cat_features=cat_feature_names,
)

In [None]:
model.save_model('catboost_model.cbm')

# The inference cell rebuilds its input from 'feature_columns.pkl', so persist
# the training column names (in order) next to the model. A plain list is
# stored so that `name in feature_columns` tests values, not an index.
pd.to_pickle(list(X_train.columns), 'feature_columns.pkl')

инференс

In [None]:
# Restore the trained model from the file written by the training part of
# this notebook.
model = CatBoostRegressor()
model.load_model('catboost_model.cbm')

# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code - only load a 'feature_columns.pkl' that this project produced.
feature_columns = pd.read_pickle('feature_columns.pkl')  # Saved list of columns

def prepare_input(user_components, columns=None):
    """
    Convert the user's component list into a single-row DataFrame whose
    columns match the model's feature columns.

    Parameters
    ----------
    user_components : iterable of dict
        Each item must have a 'name' key (the component identifier) and a
        'params' key mapping parameter names to values. A value is written to
        the column '<param>_<name>'; pairs with no matching feature column
        are ignored.
    columns : sequence of str, optional
        Feature column names, in model order. Defaults to the notebook-level
        `feature_columns`.

    Returns
    -------
    pandas.DataFrame
        Exactly one row, with the columns in the given order; features not
        supplied by the user (or supplied as missing) are 0.

    Raises
    ------
    KeyError
        If a component lacks the 'name' or 'params' key.
    """
    if columns is None:
        columns = feature_columns
    columns = list(columns)

    # Start from an explicit row of zeros: assigning scalars to the columns
    # of a zero-row DataFrame would leave it with zero rows.
    row = dict.fromkeys(columns, 0)
    for component in user_components:
        for param, value in component['params'].items():
            col_name = f"{param}_{component['name']}"
            if col_name in row:
                row[col_name] = value

    # fillna(0) turns user-supplied missing values (None/NaN) into 0.
    return pd.DataFrame([row], columns=columns).fillna(0)

# Components of the blend to score. Each item is a dict with a 'name' key
# (component identifier) and a 'params' dict of per-component values, which is
# the shape prepare_input() reads.
user_input = [
]

# Parameters that describe the blend as a whole and therefore must not be
# supplied per component.
static_params = ['density', 'temperature']

try:
    if not user_input:
        raise ValueError("Список компонентов пуст")

    # Validate what the user actually supplied: the feature column names are
    # the same for every request, so checking them cannot detect a bad input.
    for component in user_input:
        for param in static_params:
            if param in component['params']:
                raise ValueError(f"Параметр {param} должен быть статическим для всей смеси")

    input_data = prepare_input(user_input)

    prediction = model.predict(input_data)

    print(f"Предсказанное значение target: {prediction[0]:.2f}")

# Deliberately broad: any failure is reported as a message instead of a
# traceback in the notebook output.
except Exception as e:
    print(f"Ошибка: {str(e)}")