In [None]:
import json
import logging
import time

import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')


In [None]:
def add_engineered_features(df):
    """Return a copy of ``df`` extended with consumption-derived features.

    The input frame must contain ``consumption_1`` .. ``consumption_12``,
    ``residentsCount``, ``roomsCount`` and ``buildingType``. The caller's
    frame is left untouched, so re-running a cell never stacks side effects.

    Added columns:
        total_consumption, mean_consumption, std_consumption,
        max_consumption, min_consumption -- row-wise statistics over the
            twelve monthly columns.
        monthly_delta -- mean absolute month-over-month change.
        cons_per_resident, cons_per_room -- total divided by the respective
            count, with a count of 0 treated as 1 to avoid division by zero.
        seasonality_coef -- std / (mean + 1e-6).
        building_scale, scaled_cons_per_resident -- per-building-type factor
            and ``cons_per_resident`` multiplied by it.

    Returns:
        pandas.DataFrame: a new frame with the columns above appended.
    """
    # Work on a copy: the original mutated the caller's frame in place.
    df = df.copy()

    consumption_cols = [f'consumption_{i}' for i in range(1, 13)]
    monthly = df[consumption_cols]

    # Row-wise summary statistics of the twelve monthly readings.
    df['total_consumption'] = monthly.sum(axis=1)
    df['mean_consumption'] = monthly.mean(axis=1)
    df['std_consumption'] = monthly.std(axis=1)
    df['max_consumption'] = monthly.max(axis=1)
    df['min_consumption'] = monthly.min(axis=1)

    # Monthly deltas: difference between each month and the previous one,
    # summarised as the mean absolute change per row.
    deltas = [
        df[f'consumption_{i}'] - df[f'consumption_{i - 1}']
        for i in range(2, 13)
    ]
    df['monthly_delta'] = pd.concat(deltas, axis=1).abs().mean(axis=1)

    # Consumption per resident and per room; zero counts are replaced by 1
    # so the division stays finite.
    df['cons_per_resident'] = df['total_consumption'] / df['residentsCount'].replace(0, 1)
    df['cons_per_room'] = df['total_consumption'] / df['roomsCount'].replace(0, 1)

    # Seasonality coefficient: std / mean (epsilon guards against mean == 0).
    df['seasonality_coef'] = df['std_consumption'] / (df['mean_consumption'] + 1e-6)

    # Example per-building-type factors; unmapped or missing types fall back
    # to 1.0.
    # NOTE(review): the original comment described these as divisors, but the
    # code multiplies by them -- confirm which was intended.
    scale_map = {'Многоквартирный': 1.0, 'Частный': 0.8, 'Прочий': 0.9}
    df['building_scale'] = df['buildingType'].map(scale_map).fillna(1.0)
    df['scaled_cons_per_resident'] = df['cons_per_resident'] * df['building_scale']

    return df


def remove_outliers(df):
    """Keep only rows whose ``total_consumption`` lies within its 1%-99% quantile band.

    Both bounds are inclusive. The number of dropped rows is logged at INFO
    level. The input frame is not modified; a filtered frame (original index
    labels preserved) is returned.
    """
    totals = df['total_consumption']
    low = totals.quantile(0.01)
    high = totals.quantile(0.99)

    # Series.between is inclusive on both ends, matching `>= low` and `<= high`.
    kept = df[totals.between(low, high)]

    logging.info(f'Outliers removed: {len(df) - len(kept)}')
    return kept


In [None]:
def flatten_consumption(consumption_dict):
    """Turn a month-keyed consumption mapping into a 12-element list.

    Months are looked up by the string keys ``'1'`` .. ``'12'``; a missing
    month yields ``np.nan``. If the argument is not a ``dict`` at all, the
    result is twelve ``np.nan`` values.
    """
    month_keys = [str(month) for month in range(1, 13)]

    if not isinstance(consumption_dict, dict):
        return [np.nan] * len(month_keys)

    return [consumption_dict.get(key, np.nan) for key in month_keys]

def load_and_preprocess(json_path, drop_outliers=True):
    """Load account records from a JSON file into a feature-engineered DataFrame.

    The file is expected to hold a list of objects. From each one the keys
    ``accountId``, ``roomsCount``, ``residentsCount``, ``buildingType``,
    ``isCommercial`` and ``consumption`` are read; the consumption mapping is
    spread into ``consumption_1`` .. ``consumption_12`` columns.

    Args:
        json_path: path of the UTF-8 encoded JSON file.
        drop_outliers: when True (the default, and the original behaviour),
            rows outside the 1%-99% ``total_consumption`` quantile band are
            removed via ``remove_outliers``. Pass False when every input row
            must be kept, e.g. when scoring new accounts.

    Returns:
        pandas.DataFrame: one row per retained record, including the
        engineered feature columns.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    records = []
    for entry in raw_data:
        flat = {
            "accountId": entry.get("accountId"),
            "roomsCount": entry.get("roomsCount", np.nan),
            "residentsCount": entry.get("residentsCount", 0),
            "buildingType": entry.get("buildingType", "Прочий"),
            # Class label; None when the record carries no label.
            "isCommercial": entry.get("isCommercial"),
        }
        monthly_values = flatten_consumption(entry.get("consumption", {}))
        flat.update(
            {f"consumption_{i}": val for i, val in enumerate(monthly_values, 1)}
        )
        records.append(flat)

    df = pd.DataFrame(records)
    df = add_engineered_features(df)
    # Quantile trimming is optional so callers can keep all rows.
    if drop_outliers:
        df = remove_outliers(df)
    return df



In [None]:
# Load the training frame here so this inspection cell does not depend on
# `df_train` being defined by a cell further down (keeps Restart & Run All working).
df_train = load_and_preprocess('dataset_train.json')

# Quick look at the first rows, the available columns and the class balance.
print(df_train.head())
print(df_train.columns)
print(df_train['isCommercial'].value_counts())


   accountId  roomsCount  residentsCount buildingType  isCommercial  \
0       1497         1.0               1      Частный          True   
1       1509         1.0               1      Частный          True   
2       1674         3.0               2      Частный          True   
3       1955         5.0               1      Частный          True   
4       1960         3.0               3      Частный          True   

   consumption_1  consumption_2  consumption_3  consumption_4  consumption_5  \
0         3484.0         2824.0         3035.0         3597.0         2664.0   
1         3756.0         1580.0         3191.0         2931.0          793.0   
2         1543.0         1075.0         2344.0         1125.0         1045.0   
3         5564.0         6201.0         5364.0         4031.0         5452.0   
4          631.0          616.0          439.0          562.0         4723.0   

   ...  mean_consumption  std_consumption  max_consumption  min_consumption  \
0  ...       

In [None]:
def create_preprocessor():
    """Build the unfitted ColumnTransformer that prepares model inputs.

    Numeric columns (room/resident counts, the twelve monthly readings and
    the engineered consumption features) are median-imputed and standardised.
    ``buildingType`` is imputed with the constant ``'missing'`` and one-hot
    encoded into a dense array, ignoring categories unseen during fit.
    Only the columns listed here are passed to a transformer.
    """
    monthly_cols = [f'consumption_{month}' for month in range(1, 13)]
    engineered_cols = [
        'total_consumption', 'mean_consumption', 'std_consumption',
        'max_consumption', 'min_consumption', 'monthly_delta',
        'cons_per_resident', 'cons_per_room', 'seasonality_coef',
        'scaled_cons_per_resident',
    ]
    # Column order defines the order of the transformed feature matrix.
    numeric_features = ['roomsCount', 'residentsCount', *monthly_cols, *engineered_cols]
    categorical_features = ['buildingType']

    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    return ColumnTransformer(
        transformers=[
            ('num', numeric_branch, numeric_features),
            ('cat', categorical_branch, categorical_features),
        ]
    )


def prepare_train_data(df, target_column='isCommercial'):
    """Split ``df`` into a transformed feature matrix and a label array.

    ``accountId`` and the target column are removed from the features, a
    fresh preprocessor from ``create_preprocessor`` is fitted on the rest,
    and the fitted preprocessor is returned for reuse at prediction time.

    Returns:
        tuple: ``(X_proc, y, preprocessor)``.
    """
    fitted_preprocessor = create_preprocessor()
    labels = df[target_column].values
    feature_frame = df.drop(columns=['accountId', target_column])
    transformed = fitted_preprocessor.fit_transform(feature_frame)
    return transformed, labels, fitted_preprocessor


In [None]:
def build_keras_model(input_dim):
    """Create and compile the binary classifier network.

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dropout(0.3) -> Dense(1, sigmoid), compiled with the Adam optimizer,
    binary cross-entropy loss and an accuracy metric.

    Args:
        input_dim: number of features in each input row.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input with an explicit Input object: passing
        # `input_shape` to the first Dense layer is deprecated and makes
        # Keras emit a UserWarning when the model is built.
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def train_model(X, y):
    """Train the Keras classifier and report validation quality.

    Steps:
      1. Hold out 20% of the real samples (stratified) for validation.
      2. Balance only the training portion with SMOTE.
      3. Fit the network with early stopping on validation loss.
      4. Log balanced accuracy and a classification report for the hold-out.

    Args:
        X: preprocessed feature matrix, shape ``(n_samples, n_features)``.
        y: binary labels (bool or 0/1), one per row of ``X``.

    Returns:
        tuple: ``(model, bal_acc)`` -- the fitted model and its balanced
        accuracy on the validation split.
    """
    # Integer labels keep np.bincount, SMOTE and the class-weight keys consistent.
    y = np.asarray(y).astype(int)

    # Split BEFORE resampling. Oversampling first would put synthetic points
    # interpolated from validation samples into the training set (and synthetic
    # rows into the validation set), inflating the reported metrics.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Class balancing with SMOTE, applied to the training portion only.
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    logging.info(f"Original dataset shape: {np.bincount(y)}")
    logging.info(f"Resampled dataset shape: {np.bincount(y_train_res)}")

    # Class weights as an extra safeguard, keyed by the actual label values
    # rather than by position.
    classes = np.unique(y_train_res)
    weights = compute_class_weight('balanced', classes=classes, y=y_train_res)
    class_weights_dict = {int(label): float(w) for label, w in zip(classes, weights)}

    model = build_keras_model(X.shape[1])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    start = time.time()
    model.fit(
        X_train_res, y_train_res,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        class_weight=class_weights_dict,
        callbacks=[early_stopping],
        verbose=2
    )
    end = time.time()
    logging.info(f"Training time: {end - start:.2f} seconds")

    # Evaluation on the untouched validation split; flatten the (n, 1) sigmoid
    # output so the metrics receive 1-D label arrays.
    y_pred = (model.predict(X_val) > 0.5).astype(int).ravel()
    bal_acc = balanced_accuracy_score(y_val, y_pred)
    logging.info(f"Balanced accuracy on validation: {bal_acc:.4f}")
    logging.info("\n" + classification_report(y_val, y_pred))

    return model, bal_acc


In [None]:
def predict_and_save(model, preprocessor, input_json_path, output_json_path):
    """Score the accounts in a JSON file and write the predictions as JSON.

    Args:
        model: fitted classifier whose ``predict`` returns one probability per row.
        preprocessor: fitted transformer applied to the feature columns.
        input_json_path: path of the JSON file with the accounts to score.
        output_json_path: destination path; receives a list of objects with
            ``accountId``, ``isCommercial`` (probability > 0.5) and
            ``probability`` (rounded to 4 decimals).
    """
    # NOTE(review): load_and_preprocess also applies remove_outliers, so accounts
    # with extreme total_consumption are absent from the output -- confirm that
    # this is intended for inference.
    df = load_and_preprocess(input_json_path)

    X_pred = df.drop(columns=['accountId'])
    X_pred_proc = preprocessor.transform(X_pred)

    probs = model.predict(X_pred_proc).flatten()
    threshold = 0.5
    results = [
        {
            "accountId": int(account_id),
            "isCommercial": bool(prob > threshold),
            # Convert to a Python float BEFORE rounding: rounding a NumPy
            # float32 and then widening it writes values such as
            # 0.1234000027179718 instead of 0.1234.
            "probability": round(float(prob), 4),
        }
        for account_id, prob in zip(df['accountId'], probs)
    ]

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    logging.info(f"Predictions saved to {output_json_path}")


In [None]:
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Load and prepare the training data
df_train = load_and_preprocess('dataset_train.json')

X_train, y_train, preprocessor = prepare_train_data(df_train, target_column='isCommercial')

# Train the model
model, bal_acc = train_model(X_train, y_train)

# Save the model and the fitted preprocessor for reuse.
# NOTE(review): '.h5' is Keras' legacy HDF5 format; the native '.keras' format is
# recommended, but the path is kept so existing consumers keep working.
model.save('keras_model.h5')
joblib.dump(preprocessor, 'preprocessor.pkl')

# Predict for new data
predict_and_save(model, preprocessor, 'new_data.json', 'prediction_results.json')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


144/144 - 3s - 19ms/step - accuracy: 0.5864 - loss: 0.6775 - val_accuracy: 0.6316 - val_loss: 0.6530
Epoch 2/100
144/144 - 1s - 7ms/step - accuracy: 0.6322 - loss: 0.6449 - val_accuracy: 0.6429 - val_loss: 0.6339
Epoch 3/100
144/144 - 1s - 7ms/step - accuracy: 0.6413 - loss: 0.6329 - val_accuracy: 0.6429 - val_loss: 0.6396
Epoch 4/100
144/144 - 1s - 4ms/step - accuracy: 0.6483 - loss: 0.6238 - val_accuracy: 0.6455 - val_loss: 0.6221
Epoch 5/100
144/144 - 1s - 4ms/step - accuracy: 0.6541 - loss: 0.6194 - val_accuracy: 0.6603 - val_loss: 0.6211
Epoch 6/100
144/144 - 1s - 4ms/step - accuracy: 0.6485 - loss: 0.6218 - val_accuracy: 0.6568 - val_loss: 0.6259
Epoch 7/100
144/144 - 1s - 4ms/step - accuracy: 0.6635 - loss: 0.6103 - val_accuracy: 0.6551 - val_loss: 0.6114
Epoch 8/100
144/144 - 1s - 4ms/step - accuracy: 0.6657 - loss: 0.6102 - val_accuracy: 0.6646 - val_loss: 0.6100
Epoch 9/100
144/144 - 0s - 3ms/step - accuracy: 0.6752 - loss: 0.5994 - val_accuracy: 0.6646 - val_loss: 0.6074
Epo



[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
