In [None]:
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import os\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "import json\n",
        "import logging\n",
        "import warnings\n",
        "warnings.filterwarnings('ignore')\n",
        "\n",
        "# Send every log record (DEBUG and above) to a file in the working directory\n",
        "logging.basicConfig(\n",
        "    filename='data_analysis.log',\n",
        "    level=logging.DEBUG,\n",
        "    format='%(asctime)s - %(levelname)s - %(message)s',\n",
        ")\n",
        "\n",
        "# Shared settings read by the analysis cells below\n",
        "_MODEL_DIR = 'models'\n",
        "_TARGETS = ['Zone_1_Power_Consumption', 'Zone_2_Power_Consumption', 'Zone_3_Power_Consumption']\n",
        "CONFIG = {\n",
        "    'RAW_DATA_PATH': 'data/raw/Tetuan City power consumption.csv',\n",
        "    'MODEL_DIR': _MODEL_DIR,\n",
        "    'RESULTS_DIR': 'results',\n",
        "    # One model file per target: <MODEL_DIR>/best_model_<target>.pkl\n",
        "    'MODEL_PATHS': {target: f'{_MODEL_DIR}/best_model_{target}.pkl' for target in _TARGETS},\n",
        "    'FEATURES': ['Temperature', 'Humidity', 'Wind_Speed', 'general_diffuse_flows', 'diffuse_flows'],\n",
        "    'TARGETS': list(_TARGETS),\n",
        "}\n",
        "\n",
        "# Create the output folders up front (no error if they already exist)\n",
        "for _dir_key in ('RESULTS_DIR', 'MODEL_DIR'):\n",
        "    os.makedirs(CONFIG[_dir_key], exist_ok=True)\n",
        "\n",
        "def json_serializable(obj):\n",
        "    \"\"\"Convert a value the ``json`` encoder cannot handle into a JSON-friendly one.\n",
        "\n",
        "    Meant to be passed as ``default=`` to ``json.dump``.\n",
        "\n",
        "    Args:\n",
        "        obj: The object the encoder could not serialize on its own.\n",
        "\n",
        "    Returns:\n",
        "        A list for arrays, Series and Index objects; ``None`` for scalar\n",
        "        missing values (NaN, NaT, ``pd.NA``); a builtin scalar for NumPy\n",
        "        integer/float/bool scalars; a ``'%Y-%m-%d %H:%M:%S'`` string for\n",
        "        ``pd.Timestamp``; ``str(obj)`` for everything else.\n",
        "    \"\"\"\n",
        "    # Array-likes go first: pd.isna() on them returns an array, and using that\n",
        "    # array in an `if` raises \"truth value of an array is ambiguous\".\n",
        "    if isinstance(obj, (np.ndarray, pd.Series, pd.Index)):\n",
        "        return obj.tolist()\n",
        "    # Only scalars are tested for missingness, so containers never reach pd.isna().\n",
        "    if pd.api.types.is_scalar(obj) and pd.isna(obj):\n",
        "        return None\n",
        "    # np.bool_ is included so flags are written as true/false, not \"True\"/\"False\".\n",
        "    if isinstance(obj, (np.integer, np.floating, np.bool_)):\n",
        "        return obj.item()\n",
        "    if isinstance(obj, pd.Timestamp):\n",
        "        return obj.strftime('%Y-%m-%d %H:%M:%S')\n",
        "    return str(obj)\n",
        "\n",
        "def _log_frame_summary(df):\n",
        "    \"\"\"Log missing-value counts per column and the number of duplicated index labels.\"\"\"\n",
        "    logging.info(f\"Missing values:\\n{df.isnull().sum().to_dict()}\")\n",
        "    logging.info(f\"Duplicate timestamps: {df.index.duplicated().sum()}\")\n",
        "\n",
        "\n",
        "def _build_sample_data():\n",
        "    \"\"\"Create the seeded synthetic frame used when the CSV is unavailable.\"\"\"\n",
        "    dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='10min')\n",
        "    n_rows = len(dates)\n",
        "    # A private RandomState(42) yields the same draws as np.random.seed(42)\n",
        "    # followed by np.random.* calls, but leaves the global RNG untouched.\n",
        "    rng = np.random.RandomState(42)\n",
        "    # Dict entries are evaluated top to bottom, so the draw order is fixed.\n",
        "    return pd.DataFrame({\n",
        "        'Temperature': rng.normal(20, 5, n_rows),\n",
        "        'Humidity': rng.uniform(30, 80, n_rows),\n",
        "        'Wind_Speed': rng.exponential(2, n_rows),\n",
        "        'general_diffuse_flows': rng.uniform(0, 500, n_rows),\n",
        "        'diffuse_flows': rng.uniform(0, 300, n_rows),\n",
        "        'Zone_1_Power_Consumption': rng.normal(100, 20, n_rows),\n",
        "        'Zone_2_Power_Consumption': rng.normal(80, 15, n_rows),\n",
        "        'Zone_3_Power_Consumption': rng.normal(60, 10, n_rows)\n",
        "    }, index=dates)\n",
        "\n",
        "\n",
        "def load_data(file_path):\n",
        "    \"\"\"Load the dataset from a CSV file, or fall back to synthetic sample data.\n",
        "\n",
        "    Args:\n",
        "        file_path: Path to a CSV with a ``DateTime`` column, which is parsed\n",
        "            as dates and used as the index.\n",
        "\n",
        "    Returns:\n",
        "        The loaded DataFrame; a seeded synthetic DataFrame (10-minute rows)\n",
        "        when ``file_path`` does not exist; ``None`` when the file exists but\n",
        "        reading it fails (the error is logged).\n",
        "    \"\"\"\n",
        "    logging.info(f\"Attempting to load dataset from {file_path}\")\n",
        "    if not os.path.exists(file_path):\n",
        "        logging.warning(f\"Dataset not found at {file_path}. Using sample data.\")\n",
        "        df = _build_sample_data()\n",
        "        logging.info(f\"Sample data created. Columns: {df.columns.tolist()}\")\n",
        "        _log_frame_summary(df)\n",
        "        return df\n",
        "    try:\n",
        "        df = pd.read_csv(file_path, parse_dates=['DateTime'], index_col='DateTime')\n",
        "        logging.info(f\"Loaded dataset with shape: {df.shape}\")\n",
        "        logging.info(f\"Columns: {df.columns.tolist()}\")\n",
        "        _log_frame_summary(df)\n",
        "        return df\n",
        "    except Exception as e:\n",
        "        # Best effort: callers treat None as \"no data\" instead of crashing.\n",
        "        logging.error(f\"Error loading data: {e}\")\n",
        "        return None\n",
        "\n",
        "def clean_data(df):\n",
        "    \"\"\"Standardize column names, coerce types, fill gaps and collapse duplicate timestamps.\n",
        "\n",
        "    Args:\n",
        "        df: Raw DataFrame (index holds the timestamps), or ``None``.\n",
        "\n",
        "    Returns:\n",
        "        tuple: ``(cleaned_df, duplicate_info)`` where ``duplicate_info`` is\n",
        "        ``{'duplicate_count': int, 'duplicates': [up to 5 label strings]}``.\n",
        "        An empty DataFrame and a zeroed ``duplicate_info`` are returned when\n",
        "        ``df`` is ``None`` or empty.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If a column listed in ``CONFIG['FEATURES']`` or\n",
        "            ``CONFIG['TARGETS']`` is missing after renaming.\n",
        "    \"\"\"\n",
        "    if df is None or df.empty:\n",
        "        logging.error(\"Input DataFrame is None or empty. Returning empty DataFrame.\")\n",
        "        return pd.DataFrame(), {'duplicate_count': 0, 'duplicates': []}\n",
        "\n",
        "    # Standardize column names (some mapped headers contain a double space)\n",
        "    column_mapping = {\n",
        "        'Zone 1 Power Consumption': 'Zone_1_Power_Consumption',\n",
        "        'Zone 2  Power Consumption': 'Zone_2_Power_Consumption',\n",
        "        'Zone 3  Power Consumption': 'Zone_3_Power_Consumption',\n",
        "        'Wind Speed': 'Wind_Speed',\n",
        "        'general diffuse flows': 'general_diffuse_flows',\n",
        "        'diffuse flows': 'diffuse_flows'\n",
        "    }\n",
        "    # rename() returns a new frame, so the caller's DataFrame is not modified below\n",
        "    df = df.rename(columns=column_mapping)\n",
        "    df.columns = [col.replace(' ', '_') for col in df.columns]\n",
        "    logging.info(f\"Columns after renaming: {df.columns.tolist()}\")\n",
        "\n",
        "    # Verify required columns\n",
        "    required_cols = CONFIG['FEATURES'] + CONFIG['TARGETS']\n",
        "    missing_cols = [col for col in required_cols if col not in df.columns]\n",
        "    if missing_cols:\n",
        "        logging.error(f\"Missing required columns: {missing_cols}\")\n",
        "        raise ValueError(f\"Missing required columns: {missing_cols}\")\n",
        "\n",
        "    # Ensure numeric data types; unparseable entries become NaN and are filled below\n",
        "    for col in required_cols:\n",
        "        df[col] = pd.to_numeric(df[col], errors='coerce')\n",
        "\n",
        "    # Handle missing values\n",
        "    if df.isnull().any().any():\n",
        "        logging.info(f\"Found {df.isnull().sum().sum()} missing values. Filling with mean.\")\n",
        "        df = df.fillna(df.mean(numeric_only=True))\n",
        "\n",
        "    # Handle negative values in power consumption\n",
        "    for zone in CONFIG['TARGETS']:\n",
        "        if (df[zone] < 0).any():\n",
        "            logging.info(f\"Negative values found in {zone}. Replacing with 0.\")\n",
        "            df[zone] = df[zone].clip(lower=0)\n",
        "\n",
        "    # Handle duplicate timestamps (mask computed once and reused)\n",
        "    duplicate_mask = df.index.duplicated()\n",
        "    if not duplicate_mask.any():\n",
        "        logging.info(\"No duplicate timestamps found.\")\n",
        "        return df, {'duplicate_count': 0, 'duplicates': []}\n",
        "\n",
        "    # int() keeps the count a builtin so it is written to JSON as a plain number\n",
        "    duplicate_count = int(duplicate_mask.sum())\n",
        "    logging.info(f\"Found {duplicate_count} duplicate timestamps. Aggregating by mean.\")\n",
        "    duplicate_labels = df.index[duplicate_mask]\n",
        "    if isinstance(duplicate_labels, pd.DatetimeIndex):\n",
        "        duplicate_indices = duplicate_labels.strftime('%Y-%m-%d %H:%M:%S').tolist()\n",
        "    else:\n",
        "        # Index was not parsed as datetimes: report the labels as plain strings\n",
        "        duplicate_indices = [str(label) for label in duplicate_labels]\n",
        "    df = df.groupby(df.index).mean()\n",
        "    logging.info(f\"After aggregation, dataset shape: {df.shape}\")\n",
        "    return df, {'duplicate_count': duplicate_count, 'duplicates': duplicate_indices[:5]}\n",
        "\n",
        "# Load and clean in one step; on any failure fall back to an empty frame so the\n",
        "# analysis cells below (which all check for an empty frame) still run.\n",
        "try:\n",
        "    df, duplicate_info = clean_data(load_data(CONFIG['RAW_DATA_PATH']))\n",
        "    logging.info(f\"Cleaned data head:\\n{df.head().to_dict()}\")\n",
        "except Exception as exc:\n",
        "    logging.error(f\"Failed to load or clean data: {exc}\")\n",
        "    df, duplicate_info = pd.DataFrame(), {'duplicate_count': 0, 'duplicates': []}\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Time Consistency & Structure\n",
        "def check_time_consistency(df, duplicate_info, expected_freq_minutes=10):\n",
        "    \"\"\"Check timestamp ordering and sampling regularity, then save the findings as JSON.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame whose index holds the timestamps (may be ``None`` or empty).\n",
        "        duplicate_info: Duplicate summary; stored unchanged under ``'duplicates'``.\n",
        "        expected_freq_minutes: Gap, in minutes, that every pair of consecutive\n",
        "            rows must have for the sampling to count as consistent. Defaults to 10.\n",
        "\n",
        "    Returns:\n",
        "        dict: Keys ``'timestamp_consistency'``, ``'sampling_frequency'`` and\n",
        "        ``'duplicates'``. Fields keep their placeholders (``False`` / ``'N/A'``)\n",
        "        when ``df`` is unusable or the analysis raises.\n",
        "    \"\"\"\n",
        "    results = {\n",
        "        'timestamp_consistency': {'is_monotonic': False, 'irregular_timestamps': 'N/A'},\n",
        "        'sampling_frequency': {'frequency_minutes': 'N/A', 'is_consistent': False},\n",
        "        'duplicates': duplicate_info\n",
        "    }\n",
        "\n",
        "    if df is not None and not df.empty:\n",
        "        try:\n",
        "            consistency = results['timestamp_consistency']\n",
        "            sampling = results['sampling_frequency']\n",
        "\n",
        "            # Check for monotonic timestamps\n",
        "            is_monotonic = bool(df.index.is_monotonic_increasing)\n",
        "            consistency['is_monotonic'] = is_monotonic\n",
        "            if is_monotonic:\n",
        "                consistency['irregular_timestamps'] = 0\n",
        "            else:\n",
        "                # Count rows whose timestamp does not move forward (gap <= 0)\n",
        "                backward = df.index.to_series().diff().dt.total_seconds() <= 0\n",
        "                consistency['irregular_timestamps'] = int(backward.sum())\n",
        "\n",
        "            # Check sampling frequency (gaps between consecutive rows, in minutes)\n",
        "            time_diffs = df.index.to_series().diff().dropna().dt.total_seconds() / 60\n",
        "            if not time_diffs.empty:\n",
        "                # Cast NumPy scalars to builtins: a np.bool_ would otherwise reach the\n",
        "                # json `default` hook and be written as the string \"True\"/\"False\".\n",
        "                sampling['frequency_minutes'] = float(time_diffs.median())\n",
        "                sampling['is_consistent'] = bool((time_diffs == expected_freq_minutes).all())\n",
        "        except Exception as e:\n",
        "            logging.error(f\"Error in time consistency analysis: {e}\")\n",
        "\n",
        "    # Save results\n",
        "    try:\n",
        "        file_path = os.path.join(CONFIG['RESULTS_DIR'], 'time_results.json')\n",
        "        with open(file_path, 'w') as f:\n",
        "            json.dump(results, f, indent=2, default=json_serializable)\n",
        "        logging.info(f\"Time consistency results saved to {file_path}\")\n",
        "    except Exception as e:\n",
        "        logging.error(f\"Error saving time_results.json: {e}\")\n",
        "\n",
        "    return results\n",
        "\n",
        "# Check the cleaned frame; time_results is passed to generate_eda_questions in a later cell.\n",
        "time_results = check_time_consistency(df, duplicate_info)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Temporal Patterns\n",
        "def _with_json_safe_keys(node):\n",
        "    \"\"\"Return a copy of a nested dict whose ``pd.Timestamp`` keys are formatted as strings.\n",
        "\n",
        "    ``json.dump`` only accepts str/int/float/bool/None keys and never passes\n",
        "    keys to its ``default`` hook, so timestamp keys must be converted first.\n",
        "    \"\"\"\n",
        "    if not isinstance(node, dict):\n",
        "        return node\n",
        "    return {\n",
        "        (key.strftime('%Y-%m-%d %H:%M:%S') if isinstance(key, pd.Timestamp) else key): _with_json_safe_keys(value)\n",
        "        for key, value in node.items()\n",
        "    }\n",
        "\n",
        "\n",
        "def analyze_temporal_patterns(df):\n",
        "    \"\"\"Compute mean/std of each target by hour, by day of week and by calendar week.\n",
        "\n",
        "    The results are also written to ``temporal_results.json`` in\n",
        "    ``CONFIG['RESULTS_DIR']``.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame with a datetime index and the target columns (may be\n",
        "            ``None`` or empty).\n",
        "\n",
        "    Returns:\n",
        "        dict: ``results[period][target] = {'mean': {...}, 'std': {...}}`` for\n",
        "        ``period`` in ``'hourly'`` (keyed by hour), ``'daily'`` (keyed by day of\n",
        "        week) and ``'weekly'`` (keyed by the ``resample('W')`` bin label, a\n",
        "        ``pd.Timestamp``). Entries stay empty for targets that were skipped.\n",
        "    \"\"\"\n",
        "    results = {\n",
        "        'hourly': {target: {'mean': {}, 'std': {}} for target in CONFIG['TARGETS']},\n",
        "        'daily': {target: {'mean': {}, 'std': {}} for target in CONFIG['TARGETS']},\n",
        "        'weekly': {target: {'mean': {}, 'std': {}} for target in CONFIG['TARGETS']}\n",
        "    }\n",
        "\n",
        "    if df is not None and not df.empty:\n",
        "        try:\n",
        "            for target in CONFIG['TARGETS']:\n",
        "                if target not in df.columns:\n",
        "                    logging.warning(f\"{target} not in DataFrame columns. Skipping.\")\n",
        "                    continue\n",
        "                series = df[target]\n",
        "                groupings = {\n",
        "                    'hourly': series.groupby(df.index.hour),\n",
        "                    'daily': series.groupby(df.index.dayofweek),\n",
        "                    'weekly': series.resample('W'),\n",
        "                }\n",
        "                for period, grouped in groupings.items():\n",
        "                    stats = grouped.agg(['mean', 'std']).to_dict()\n",
        "                    results[period][target] = {'mean': stats['mean'], 'std': stats['std']}\n",
        "        except Exception as e:\n",
        "            logging.error(f\"Error in temporal patterns analysis: {e}\")\n",
        "\n",
        "    # Save results\n",
        "    try:\n",
        "        file_path = os.path.join(CONFIG['RESULTS_DIR'], 'temporal_results.json')\n",
        "        # The weekly dicts are keyed by Timestamp; dumping them unchanged raises\n",
        "        # TypeError part-way through and leaves a truncated file, so convert a copy.\n",
        "        serializable = _with_json_safe_keys(results)\n",
        "        with open(file_path, 'w') as f:\n",
        "            json.dump(serializable, f, indent=2, default=json_serializable)\n",
        "        logging.info(f\"Temporal patterns results saved to {file_path}\")\n",
        "    except Exception as e:\n",
        "        logging.error(f\"Error saving temporal_results.json: {e}\")\n",
        "\n",
        "    return results\n",
        "\n",
        "# Hourly / day-of-week / weekly statistics of the cleaned frame (the function also writes them to JSON).\n",
        "temporal_results = analyze_temporal_patterns(df)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Correlation Analysis\n",
        "def correlation_analysis(df):\n",
        "    \"\"\"Correlate every configured feature with every target and save the table as JSON.\n",
        "\n",
        "    Uses ``Series.corr`` with its default method. A cell stays ``'N/A'`` when the\n",
        "    column is missing, the correlation is NaN, or the analysis raised earlier.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame holding feature and target columns (may be ``None`` or empty).\n",
        "\n",
        "    Returns:\n",
        "        dict: ``results[target][feature]`` -> correlation value or ``'N/A'``.\n",
        "    \"\"\"\n",
        "    targets, features = CONFIG['TARGETS'], CONFIG['FEATURES']\n",
        "    results = {target: dict.fromkeys(features, 'N/A') for target in targets}\n",
        "\n",
        "    has_data = df is not None and not df.empty\n",
        "    if has_data:\n",
        "        try:\n",
        "            for target in targets:\n",
        "                if target not in df.columns:\n",
        "                    logging.warning(f\"{target} not in DataFrame columns. Skipping.\")\n",
        "                    continue\n",
        "                target_series = df[target]\n",
        "                for feature in features:\n",
        "                    if feature not in df.columns:\n",
        "                        logging.warning(f\"{feature} not in DataFrame columns. Skipping.\")\n",
        "                        continue\n",
        "                    value = target_series.corr(df[feature])\n",
        "                    # NaN (e.g. undefined correlation) is reported as the 'N/A' marker\n",
        "                    results[target][feature] = 'N/A' if pd.isna(value) else value\n",
        "        except Exception as err:\n",
        "            logging.error(f\"Error in correlation analysis: {err}\")\n",
        "\n",
        "    # Persist the table next to the other result files\n",
        "    try:\n",
        "        file_path = os.path.join(CONFIG['RESULTS_DIR'], 'correlation_results.json')\n",
        "        with open(file_path, 'w') as handle:\n",
        "            json.dump(results, handle, indent=2, default=json_serializable)\n",
        "        logging.info(f\"Correlation results saved to {file_path}\")\n",
        "    except Exception as err:\n",
        "        logging.error(f\"Error saving correlation_results.json: {err}\")\n",
        "\n",
        "    return results\n",
        "\n",
        "# Feature-vs-target correlations on the cleaned frame; the result is passed to generate_eda_questions.\n",
        "correlation_results = correlation_analysis(df)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Lag Effects\n",
        "def analyze_lag_effects(df, max_lag=24):\n",
        "    \"\"\"Correlate each target with itself shifted by 1..max_lag rows and save the result as JSON.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame holding the target columns (may be ``None`` or empty).\n",
        "        max_lag: Largest shift to evaluate, counted in rows (not in time units).\n",
        "\n",
        "    Returns:\n",
        "        dict: ``results[target][str(lag)]`` -> correlation value, or ``'N/A'``\n",
        "        when it is NaN or was never computed.\n",
        "    \"\"\"\n",
        "    lag_keys = [str(lag) for lag in range(1, max_lag + 1)]\n",
        "    results = {target: dict.fromkeys(lag_keys, 'N/A') for target in CONFIG['TARGETS']}\n",
        "\n",
        "    has_data = df is not None and not df.empty\n",
        "    if has_data:\n",
        "        try:\n",
        "            for target in CONFIG['TARGETS']:\n",
        "                if target not in df.columns:\n",
        "                    logging.warning(f\"{target} not in DataFrame columns. Skipping.\")\n",
        "                    continue\n",
        "                series = df[target]\n",
        "                for lag, key in enumerate(lag_keys, start=1):\n",
        "                    value = series.corr(series.shift(lag))\n",
        "                    results[target][key] = 'N/A' if pd.isna(value) else value\n",
        "        except Exception as err:\n",
        "            logging.error(f\"Error in lag effects analysis: {err}\")\n",
        "\n",
        "    # Persist the table next to the other result files\n",
        "    try:\n",
        "        file_path = os.path.join(CONFIG['RESULTS_DIR'], 'lagged_results.json')\n",
        "        with open(file_path, 'w') as handle:\n",
        "            json.dump(results, handle, indent=2, default=json_serializable)\n",
        "        logging.info(f\"Lagged results saved to {file_path}\")\n",
        "    except Exception as err:\n",
        "        logging.error(f\"Error saving lagged_results.json: {err}\")\n",
        "\n",
        "    return results\n",
        "\n",
        "# Self-correlation at lags 1..24 rows (the default max_lag); the result is passed to generate_eda_questions.\n",
        "lagged_results = analyze_lag_effects(df)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Outlier Detection\n",
        "def _save_outlier_boxplot(df, columns):\n",
        "    \"\"\"Draw one boxplot of ``columns`` and save it under ``CONFIG['RESULTS_DIR']``.\n",
        "\n",
        "    Returns the image path, or ``None`` when none of ``columns`` is in ``df``.\n",
        "    \"\"\"\n",
        "    available_cols = [col for col in columns if col in df.columns]\n",
        "    if not available_cols:\n",
        "        logging.warning(\"No valid columns available for boxplot.\")\n",
        "        return None\n",
        "    fig, ax = plt.subplots(figsize=(10, 6))\n",
        "    try:\n",
        "        df[available_cols].boxplot(ax=ax)\n",
        "        ax.set_title(\"Boxplot for Outlier Detection\", fontsize=14, fontweight='bold')\n",
        "        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')\n",
        "        ax.set_ylabel(\"Values\")\n",
        "        boxplot_path = os.path.join(CONFIG['RESULTS_DIR'], 'outlier_boxplot.png')\n",
        "        fig.savefig(boxplot_path)\n",
        "    finally:\n",
        "        # Release the figure even when plotting or saving raises\n",
        "        plt.close(fig)\n",
        "    logging.info(f\"Boxplot saved to {boxplot_path}\")\n",
        "    return boxplot_path\n",
        "\n",
        "\n",
        "def _iqr_outlier_report(series, col):\n",
        "    \"\"\"Build the outlier record for one numeric column using the 1.5 * IQR rule.\"\"\"\n",
        "    Q1 = series.quantile(0.25)\n",
        "    Q3 = series.quantile(0.75)\n",
        "    IQR = Q3 - Q1\n",
        "    lower_bound = Q1 - 1.5 * IQR\n",
        "    upper_bound = Q3 + 1.5 * IQR\n",
        "    outliers = series[(series < lower_bound) | (series > upper_bound)]\n",
        "    # Only the first five labels are reported, so format just those\n",
        "    labels = outliers.index[:5]\n",
        "    if isinstance(labels, pd.DatetimeIndex):\n",
        "        sample = labels.strftime('%Y-%m-%d %H:%M:%S').tolist()\n",
        "    else:\n",
        "        # Non-datetime index: fall back to the plain string form of each label\n",
        "        sample = [str(label) for label in labels]\n",
        "    return {\n",
        "        'outlier_count': len(outliers),\n",
        "        'outlier_indices': sample,\n",
        "        'summary': f\"Detected {len(outliers)} outliers in {col}. Q1={Q1:.2f}, Q3={Q3:.2f}, IQR={IQR:.2f}, Bounds=[{lower_bound:.2f}, {upper_bound:.2f}].\"\n",
        "    }\n",
        "\n",
        "\n",
        "def detect_outliers(df):\n",
        "    \"\"\"Flag IQR outliers per configured column, save a boxplot and a JSON summary.\n",
        "\n",
        "    A value counts as an outlier when it lies outside\n",
        "    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of its column.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame holding the feature and target columns (may be ``None``\n",
        "            or empty).\n",
        "\n",
        "    Returns:\n",
        "        dict: One entry per column in ``CONFIG['FEATURES'] + CONFIG['TARGETS']``\n",
        "        with ``'outlier_count'``, ``'outlier_indices'`` (first five labels as\n",
        "        strings) and ``'summary'``; plus ``'boxplot'`` (image path) when a plot\n",
        "        was written.\n",
        "    \"\"\"\n",
        "    columns = CONFIG['FEATURES'] + CONFIG['TARGETS']\n",
        "    results = {col: {'outlier_count': 0, 'outlier_indices': [], 'summary': ''} for col in columns}\n",
        "\n",
        "    if df is not None and not df.empty:\n",
        "        # The plot is a visual aid only; a plotting failure must not block the numbers\n",
        "        try:\n",
        "            boxplot_path = _save_outlier_boxplot(df, columns)\n",
        "            if boxplot_path is not None:\n",
        "                results['boxplot'] = boxplot_path\n",
        "        except Exception as e:\n",
        "            logging.error(f\"Error in outlier detection: {e}\")\n",
        "\n",
        "        for col in columns:\n",
        "            # Errors are recorded per column so one bad column cannot overwrite\n",
        "            # the results already computed for the others\n",
        "            try:\n",
        "                if col not in df.columns:\n",
        "                    results[col]['summary'] = f\"Column {col} not found in dataset.\"\n",
        "                    logging.warning(f\"{col} not in DataFrame columns. Skipping outlier detection.\")\n",
        "                    continue\n",
        "                if not pd.api.types.is_numeric_dtype(df[col]):\n",
        "                    results[col]['summary'] = f\"Column {col} is non-numeric, skipping outlier detection.\"\n",
        "                    logging.warning(f\"{col} is non-numeric, skipping outlier detection.\")\n",
        "                    continue\n",
        "                results[col] = _iqr_outlier_report(df[col], col)\n",
        "            except Exception as e:\n",
        "                logging.error(f\"Error in outlier detection: {e}\")\n",
        "                results[col]['summary'] = f\"Error detecting outliers for {col}: {e}\"\n",
        "\n",
        "    # Save results\n",
        "    try:\n",
        "        file_path = os.path.join(CONFIG['RESULTS_DIR'], 'outlier_results.json')\n",
        "        with open(file_path, 'w') as f:\n",
        "            json.dump(results, f, indent=2, default=json_serializable)\n",
        "        logging.info(f\"Outlier results saved to {file_path}\")\n",
        "    except Exception as e:\n",
        "        logging.error(f\"Error saving outlier_results.json: {e}\")\n",
        "\n",
        "    return results\n",
        "\n",
        "# IQR-based outlier scan over all configured columns; the result is passed to generate_eda_questions.\n",
        "outlier_results = detect_outliers(df)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Generate EDA Questions\n",
        "def _format_stat(value, spec='.3f'):\n",
        "    \"\"\"Format a numeric value with ``spec``; return ``str(value)`` for non-numeric input such as 'N/A'.\"\"\"\n",
        "    try:\n",
        "        return format(float(value), spec)\n",
        "    except (TypeError, ValueError):\n",
        "        return str(value)\n",
        "\n",
        "\n",
        "def _lag_steps_for_hours(hours, frequency_minutes):\n",
        "    \"\"\"Return how many rows span ``hours`` at the given sampling interval, or ``None`` if unknown.\"\"\"\n",
        "    try:\n",
        "        steps = round(hours * 60 / float(frequency_minutes))\n",
        "    except (TypeError, ValueError, ZeroDivisionError, OverflowError):\n",
        "        # 'N/A', None, NaN or a zero interval: the conversion is not possible\n",
        "        return None\n",
        "    return steps if steps >= 1 else None\n",
        "\n",
        "\n",
        "def generate_eda_questions(time_results, correlation_results, lagged_results, outlier_results):\n",
        "    \"\"\"Build the EDA question/answer summary from the analysis results and save it as JSON.\n",
        "\n",
        "    Args:\n",
        "        time_results: Output of ``check_time_consistency``.\n",
        "        correlation_results: Output of ``correlation_analysis``\n",
        "            (``[target][feature]`` -> value or ``'N/A'``).\n",
        "        lagged_results: Output of ``analyze_lag_effects``\n",
        "            (``[target][str(lag_in_rows)]`` -> value or ``'N/A'``).\n",
        "        outlier_results: Output of ``detect_outliers``; the ``'boxplot'`` entry\n",
        "            is ignored.\n",
        "\n",
        "    Returns:\n",
        "        dict: Topic -> ``{'questions': [{'question': str, 'answer': str}]}``.\n",
        "        Missing inputs are rendered as ``'N/A'`` instead of raising.\n",
        "    \"\"\"\n",
        "    # --- Time consistency ---\n",
        "    consistency = time_results.get('timestamp_consistency', {})\n",
        "    sampling = time_results.get('sampling_frequency', {})\n",
        "    duplicates = time_results.get('duplicates', {})\n",
        "    frequency_minutes = sampling.get('frequency_minutes', 'N/A')\n",
        "    duplicate_examples = duplicates.get('duplicates', [])\n",
        "    duplicate_details = f\" (e.g. {', '.join(map(str, duplicate_examples))})\" if duplicate_examples else ''\n",
        "    time_answer = (\n",
        "        'The dataset has {irregular_timestamps} irregular timestamps, with a median sampling frequency of {frequency_minutes} minutes, which is {consistency_status} consistent. Found {duplicate_count} duplicate timestamps{duplicate_details}.'\n",
        "    ).format(\n",
        "        irregular_timestamps=consistency.get('irregular_timestamps', 'N/A'),\n",
        "        frequency_minutes=_format_stat(frequency_minutes, 'g'),\n",
        "        consistency_status='fully' if sampling.get('is_consistent') else 'not',\n",
        "        duplicate_count=duplicates.get('duplicate_count', 0),\n",
        "        duplicate_details=duplicate_details,\n",
        "    )\n",
        "\n",
        "    # --- Environmental relationships ---\n",
        "    correlation_answer = (\n",
        "        'Zone 1 power consumption has a correlation of {zone_1_temp_corr} with Temperature, and Zone 2 has a correlation of {zone_2_humidity_corr} with Humidity.'\n",
        "    ).format(\n",
        "        zone_1_temp_corr=_format_stat(correlation_results.get('Zone_1_Power_Consumption', {}).get('Temperature', 'N/A')),\n",
        "        zone_2_humidity_corr=_format_stat(correlation_results.get('Zone_2_Power_Consumption', {}).get('Humidity', 'N/A')),\n",
        "    )\n",
        "\n",
        "    # --- Lag effects ---\n",
        "    # Lag results are keyed by row shift, so the 1-hour and 3-hour lags are located\n",
        "    # through the median sampling interval (e.g. 10-minute rows -> lags 6 and 18).\n",
        "    one_hour_key = str(_lag_steps_for_hours(1, frequency_minutes))\n",
        "    three_hour_key = str(_lag_steps_for_hours(3, frequency_minutes))\n",
        "    lag_template = 'For {zone}, the 1-hour lag correlation is {lag_1_temp_corr}, and the 3-hour lag correlation is {lag_3_temp_corr}.'\n",
        "    lag_sentences = [\n",
        "        lag_template.format(\n",
        "            zone=zone,\n",
        "            lag_1_temp_corr=_format_stat(lags.get(one_hour_key, 'N/A')),\n",
        "            lag_3_temp_corr=_format_stat(lags.get(three_hour_key, 'N/A')),\n",
        "        )\n",
        "        for zone, lags in lagged_results.items()\n",
        "    ]\n",
        "    lag_answer = ' '.join(lag_sentences) if lag_sentences else 'No lag correlations were computed.'\n",
        "\n",
        "    # --- Data quality ---\n",
        "    outlier_cols = {\n",
        "        k: v for k, v in outlier_results.items()\n",
        "        if k != 'boxplot' and v.get('outlier_count', 0) > 0\n",
        "    }\n",
        "    if outlier_cols:\n",
        "        details = ', '.join(f\"{k}: {v['outlier_count']} outliers ({v['summary']})\" for k, v in outlier_cols.items())\n",
        "        outlier_answer = f\"{len(outlier_cols)} columns have outliers: {details}\"\n",
        "    else:\n",
        "        outlier_answer = \"No outliers detected in any columns. All features and targets were analyzed using the IQR method.\"\n",
        "\n",
        "    questions = {\n",
        "        'time_consistency': {\n",
        "            'questions': [\n",
        "                {\n",
        "                    'question': 'Are the timestamps consistent and properly spaced?',\n",
        "                    'answer': time_answer\n",
        "                }\n",
        "            ]\n",
        "        },\n",
        "        'temporal_trends': {\n",
        "            'questions': [\n",
        "                {\n",
        "                    'question': 'What temporal patterns exist in power consumption?',\n",
        "                    'answer': 'Hourly, daily, and weekly patterns were analyzed for each zone, showing variations in mean and standard deviation.'\n",
        "                }\n",
        "            ]\n",
        "        },\n",
        "        'environmental_relationships': {\n",
        "            'questions': [\n",
        "                {\n",
        "                    'question': 'How do environmental features correlate with power consumption?',\n",
        "                    'answer': correlation_answer\n",
        "                }\n",
        "            ]\n",
        "        },\n",
        "        'lag_effects': {\n",
        "            'questions': [\n",
        "                {\n",
        "                    'question': 'Are there significant lag effects in power consumption?',\n",
        "                    'answer': lag_answer\n",
        "                }\n",
        "            ]\n",
        "        },\n",
        "        'data_quality': {\n",
        "            'questions': [\n",
        "                {\n",
        "                    'question': 'Are there outliers or anomalies in the data?',\n",
        "                    'answer': outlier_answer\n",
        "                }\n",
        "            ]\n",
        "        }\n",
        "    }\n",
        "\n",
        "    # Save results\n",
        "    try:\n",
        "        file_path = os.path.join(CONFIG['RESULTS_DIR'], 'eda_questions.json')\n",
        "        with open(file_path, 'w') as f:\n",
        "            json.dump(questions, f, indent=2, default=json_serializable)\n",
        "        logging.info(f\"EDA questions saved to {file_path}\")\n",
        "    except Exception as e:\n",
        "        logging.error(f\"Error saving eda_questions.json: {e}\")\n",
        "\n",
        "    return questions\n",
        "\n",
        "# Assemble the question/answer summary from the results computed in the cells above.\n",
        "eda_questions = generate_eda_questions(time_results, correlation_results, lagged_results, outlier_results)\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.10"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}