In [3]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import tensorflow as tf
import joblib

# === 1. Загрузка и подготовка данных ===
with open('dataset_train.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

data = [d for d in raw_data if 'consumption' in d and isinstance(d['consumption'], dict)]
df = pd.DataFrame(data)

months = [str(i) for i in range(1, 13)]
for m in months:
    df[f'consumption_{m}'] = df['consumption'].apply(lambda d: d.get(m, 0))

df.drop(columns=['consumption', 'address'], inplace=True, errors='ignore')
df['roomsCount'] = pd.to_numeric(df['roomsCount'], errors='coerce')
df['residentsCount'] = pd.to_numeric(df['residentsCount'], errors='coerce')
df['isCommercial'] = df['isCommercial'].astype(int)

# === 2. Новые признаки ===
cons_cols = [f'consumption_{m}' for m in months]
df['total_consumption'] = df[cons_cols].sum(axis=1)
df['mean_consumption'] = df[cons_cols].mean(axis=1)
df['std_consumption'] = df[cons_cols].std(axis=1)
df['max_consumption'] = df[cons_cols].max(axis=1)
df['min_consumption'] = df[cons_cols].min(axis=1)
df['monthly_delta'] = df[cons_cols].apply(lambda row: row.max() - row.min(), axis=1)
df['cons_per_resident'] = df['total_consumption'] / (df['residentsCount'] + 1)
df['cons_per_room'] = df['total_consumption'] / (df['roomsCount'] + 1)

# === 3. Признаки и метки ===
numeric_features = [
    'roomsCount', 'residentsCount'
] + cons_cols + [
    'total_consumption', 'mean_consumption', 'std_consumption',
    'max_consumption', 'min_consumption', 'monthly_delta',
    'cons_per_resident', 'cons_per_room'
]
categorical_features = ['buildingType']

X = df[numeric_features + categorical_features]
y = df['isCommercial']

# === 4. Тренировочный и валидационный набор ===
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# === 5. Препроцессинг ===
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# === 6. Модель нейросети с улучшениями ===
model = Sequential([
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dense(1, activation='sigmoid')
])

# Learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=5000,
    decay_rate=0.9,
    staircase=True
)

model.compile(
    optimizer=Adam(learning_rate=lr_schedule),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()]
)

# === 7. Обучение ===
class_weights = {0: 1, 1: 2}

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)

history = model.fit(
    X_train_processed, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stopping],
    verbose=1
)

# === 8. Сохранение модели и препроцессора ===
model.save('commercial_model.keras')
joblib.dump(preprocessor, 'preprocessor.pkl')

print("\n\u2705 Обучение завершено. Модель и препроцессор сохранены.")


ModuleNotFoundError: No module named 'numpy'

In [None]:
def add_engineered_features(df):
    # Переименуем consumption столбцы
    for i in range(1, 13):
        df.rename(columns={f"month_{i}": f"consumption_{i}"}, inplace=True)

    month_cols = [f"consumption_{i}" for i in range(1, 13)]
    consumption = df[month_cols]

    df["total_consumption"] = consumption.sum(axis=1)
    df["mean_consumption"] = consumption.mean(axis=1)
    df["std_consumption"] = consumption.std(axis=1)
    df["min_consumption"] = consumption.min(axis=1)
    df["max_consumption"] = consumption.max(axis=1)
    df["monthly_delta"] = df["max_consumption"] - df["min_consumption"]
    df["cons_per_resident"] = df["total_consumption"] / df["residentsCount"].replace(0, 1)
    df["cons_per_room"] = df["total_consumption"] / df["roomsCount"].replace(0, 1)

    return df


In [None]:
import json
import joblib
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model



def flatten_consumption(consumption_dict):
    return [consumption_dict.get(str(month), 0) for month in range(1, 13)]

def prepare_dataframe(data):
    records = []
    for entry in data:
        flat = {
            "accountId": entry["accountId"],
            "roomsCount": entry["roomsCount"],
            "residentsCount": entry["residentsCount"],
            "buildingType": entry["buildingType"]
        }
        consumption_values = flatten_consumption(entry["consumption"])
        for i, val in enumerate(consumption_values, 1):
            flat[f"month_{i}"] = val
        records.append(flat)
    df = pd.DataFrame(records)
    df = add_engineered_features(df)  # Важно!
    return df

def predict_commercial(input_json_path, output_json_path):
    model = load_model("commercial_model.keras")
    preprocessor = joblib.load("preprocessor.pkl")

    with open(input_json_path, 'r', encoding='utf-8') as f:
        new_data = json.load(f)

    df = prepare_dataframe(new_data)
    account_ids = df["accountId"].tolist()
    X = df.drop(columns=["accountId"])

    X_processed = preprocessor.transform(X)
    probabilities = model.predict(X_processed).flatten()
    predictions = probabilities >= 0.5

    results = [
        {
            "accountId": acc_id,
            "isCommercial": bool(pred),
            "probability": round(float(prob), 4)
        }
        for acc_id, pred, prob in sorted(zip(account_ids, predictions, probabilities), key=lambda x: x[2], reverse=True)
    ]

    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"Saved predictions to {output_json_path}")


In [None]:
import json
import joblib
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model

def flatten_consumption(consumption_dict):
    return [consumption_dict.get(str(month), 0) for month in range(1, 13)]

def prepare_dataframe(data):
    records = []
    for entry in data:
        flat = {
            "accountId": entry.get("accountId"),
            "roomsCount": entry.get("roomsCount"),
            "residentsCount": entry.get("residentsCount", 0),
            "buildingType": entry.get("buildingType")
        }
        consumption_values = flatten_consumption(entry.get("consumption", []))
        # month → consumption
        for i, val in enumerate(consumption_values, 1):
            flat[f"consumption_{i}"] = val
        records.append(flat)
    df = pd.DataFrame(records)
    # Добавляем  признаки
    df = add_engineered_features(df)
    return df

def predict_commercial(input_json_path, output_json_path):
    # Load model and preprocessor
    model = load_model("commercial_model.keras")
    preprocessor = joblib.load("preprocessor.pkl")

    # Load new data
    with open(input_json_path, 'r', encoding='utf-8') as f:
        new_data = json.load(f)

    # Prepare dataframe
    df = prepare_dataframe(new_data)
    account_ids = df["accountId"].tolist()
    X = df.drop(columns=["accountId"])

    print(" Фактические признаки:", X.columns.tolist())
    print(" Ожидаемые признаки:", preprocessor.transformers_[0][2])


    # Preprocess features
    X_processed = preprocessor.transform(X)

    # Predict probabilities
    probabilities = model.predict(X_processed).flatten()
    predictions = probabilities >= 0.5

    # Prepare output
    results = [
        {
            "accountId": acc_id,
            "isCommercial": bool(pred),
            "probability": round(float(prob), 4)
        }
        for acc_id, pred, prob in sorted(zip(account_ids, predictions, probabilities), key=lambda x: x[2], reverse=True)
    ]

    # Save to JSON
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"Saved predictions to {output_json_path}")


In [None]:
predict_commercial('new_data.json', 'prediction_results.json')


 Фактические признаки: ['roomsCount', 'residentsCount', 'buildingType', 'consumption_1', 'consumption_2', 'consumption_3', 'consumption_4', 'consumption_5', 'consumption_6', 'consumption_7', 'consumption_8', 'consumption_9', 'consumption_10', 'consumption_11', 'consumption_12', 'total_consumption', 'mean_consumption', 'std_consumption', 'min_consumption', 'max_consumption', 'monthly_delta', 'cons_per_resident', 'cons_per_room']
 Ожидаемые признаки: ['roomsCount', 'residentsCount', 'consumption_1', 'consumption_2', 'consumption_3', 'consumption_4', 'consumption_5', 'consumption_6', 'consumption_7', 'consumption_8', 'consumption_9', 'consumption_10', 'consumption_11', 'consumption_12', 'total_consumption', 'mean_consumption', 'std_consumption', 'max_consumption', 'min_consumption', 'monthly_delta', 'cons_per_resident', 'cons_per_room']
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Saved predictions to prediction_results.json
