In [1]:
import pandas as pd
import os
import numpy as np
# Specify the directory where your CSV files are stored
csv_directory = '/Downloads/UNPROCESSED_combined_When2heat_bolnabohotkuch'

# List all CSV files in the directory
csv_files = [file for file in os.listdir(csv_directory) if file.endswith('.csv')]

# Create an empty dictionary to store country-wise DataFrames
country_dfs = {}

# Iterate through CSV files and create DataFrames
for csv_file in csv_files:
    # Extract the country code from the filename (adjust this based on your file naming convention)
    country_code = csv_file.split('_')[0]  # Assuming the country code is at the beginning of the filename

    # Read the CSV file into a DataFrame
    file_path = os.path.join(csv_directory, csv_file)
    country_df = pd.read_csv(file_path)

    # Add the DataFrame to the dictionary
    country_dfs[country_code] = country_df


In [2]:
country_df.head()

Unnamed: 0,cet_cest_timestamp,year,month,day,hour,COP_ASHP_floor,COP_ASHP_radiator,COP_ASHP_water,COP_GSHP_floor,COP_GSHP_radiator,...,heat_demand_water,heat_demand_water_COM,heat_demand_water_MFH,heat_demand_water_SFH,heat_profile_space_COM,heat_profile_space_MFH,heat_profile_space_SFH,heat_profile_water_COM,heat_profile_water_MFH,heat_profile_water_SFH
0,2008-01-01T03:30:00.000+05:30,2008,1,1,3,3.0,2.833333,2.0,4.0,,...,45.0,16.0,20.0,9.0,176.0,128.0,143.0,66.0,68.0,12.0
1,2008-01-01T04:30:00.000+05:30,2008,1,1,4,3.0,2.833333,2.0,4.0,,...,52.0,20.0,21.0,10.0,179.0,132.0,145.0,83.0,71.0,15.0
2,2008-01-01T05:30:00.000+05:30,2008,1,1,5,3.0,2.833333,2.0,4.0,,...,50.0,21.0,20.0,9.0,183.0,135.0,150.0,86.0,68.0,12.0
3,2008-01-01T06:30:00.000+05:30,2008,1,1,6,3.0,2.833333,2.0,4.0,,...,57.0,22.0,20.0,16.0,196.0,148.0,157.0,88.0,66.0,22.0
4,2008-01-01T07:30:00.000+05:30,2008,1,1,7,3.0,2.833333,2.0,4.0,,...,98.0,30.0,31.0,37.0,197.0,173.0,177.0,121.0,105.0,53.0


In [3]:
generic_column_names = ['COP_ASHP_floor', 'COP_ASHP_radiator', 'COP_ASHP_water',
                        'COP_GSHP_floor', 'COP_GSHP_radiator', 'COP_GSHP_water',
                        'COP_WSHP_floor', 'COP_WSHP_radiator', 'COP_WSHP_water',
                        'heat_demand_space', 'heat_demand_space_COM', 'heat_demand_space_MFH',
                        'heat_demand_space_SFH', 'heat_demand_total', 'heat_demand_water',
                        'heat_demand_water_COM', 'heat_demand_water_MFH', 'heat_demand_water_SFH',
                        'heat_profile_space_COM', 'heat_profile_space_MFH', 'heat_profile_space_SFH',
                        'heat_profile_water_COM', 'heat_profile_water_MFH', 'heat_profile_water_SFH']

country_codes = ['SE', 'EE', 'NO', 'DK', 'GR', 'IE', 'SK', 'PT', 'BE', 'SI', 'LU', 'IT', 'NL', 'HR', 'ES', 'CZ', 'AT', 'DE', 'HU', 'RO', 'LT', 'FR', 'BG', 'CH', 'FI', 'GB', 'LV', 'PL']

In [4]:
# Iterate through country DataFrames
for country_code, country_df in country_dfs.items():
    # Skip DataFrames with no 'heat_demand' columns
    if not any('heat_demand' in col for col in country_df.columns):
        print(f"Skipping {country_code} DataFrame as it has no 'heat_demand' columns.")
        continue

    print(f"Processing {country_code} DataFrame...")

    # Get the timestamp range for the current country DataFrame
    from_timestamp = "2016-02-01T00:30:00.000+05:30"
    till_timestamp = "2023-01-01T04:30:00.000+05:30"

    # Filter rows within the specified timestamp range
    selected_rows = country_df[(country_df['cet_cest_timestamp'] >= from_timestamp) & (country_df['cet_cest_timestamp'] <= till_timestamp)]

    # Identify rows with null or 0.0 values in the specified columns
    columns_to_check = ['heat_demand_space', 'heat_demand_space_COM', 'heat_demand_space_MFH',
                        'heat_demand_space_SFH', 'heat_demand_total', 'heat_demand_water',
                        'heat_demand_water_COM', 'heat_demand_water_MFH', 'heat_demand_water_SFH']

    rows_to_remove = selected_rows[selected_rows[columns_to_check].isnull().any(axis=1) | selected_rows[columns_to_check].eq(0.0).any(axis=1) | selected_rows[columns_to_check].eq('').any(axis=1)]

    # Remove identified rows from the country DataFrame
    country_df = country_df.drop(rows_to_remove.index)

    print(f"Finished processing {country_code} DataFrame.\n")

    # Update the country DataFrame in the dictionary
    country_dfs[country_code] = country_df


Processing HR DataFrame...
Finished processing HR DataFrame.

Processing DE DataFrame...
Finished processing DE DataFrame.

Skipping CH DataFrame as it has no 'heat_demand' columns.
Processing GB DataFrame...
Finished processing GB DataFrame.

Processing BG DataFrame...
Finished processing BG DataFrame.

Processing DK DataFrame...
Finished processing DK DataFrame.

Processing FR DataFrame...
Finished processing FR DataFrame.

Processing BE DataFrame...
Finished processing BE DataFrame.

Processing ES DataFrame...
Finished processing ES DataFrame.

Skipping NO DataFrame as it has no 'heat_demand' columns.
Processing FI DataFrame...
Finished processing FI DataFrame.

Processing LV DataFrame...
Finished processing LV DataFrame.

Processing RO DataFrame...
Finished processing RO DataFrame.

Processing PT DataFrame...
Finished processing PT DataFrame.

Processing LT DataFrame...
Finished processing LT DataFrame.

Processing CZ DataFrame...
Finished processing CZ DataFrame.

Processing PL Da

In [5]:
# Iterate through country DataFrames
for country_code, country_df in country_dfs.items():
    # Skip DataFrames with no 'heat_demand' columns
    if not any('heat_demand' in col for col in country_df.columns):
        print(f"Skipping {country_code} DataFrame as it has no 'heat_demand' columns.")
        continue

    print(f"Processing {country_code} DataFrame...")

    # Get the timestamp range for the current country DataFrame
    from_timestamp = "2016-02-01T00:30:00.000+05:30"
    till_timestamp = "2023-01-01T04:30:00.000+05:30"

    # Filter rows within the specified timestamp range
    selected_rows = country_df[(country_df['cet_cest_timestamp'] >= from_timestamp) & (country_df['cet_cest_timestamp'] <= till_timestamp)]

    # Identify rows with null or 0.0 values in the specified columns
    columns_to_check = ['COP_ASHP_floor', 'COP_ASHP_radiator', 'COP_ASHP_water',
                        'COP_GSHP_floor', 'COP_GSHP_radiator', 'COP_GSHP_water',
                        'COP_WSHP_floor', 'COP_WSHP_radiator', 'COP_WSHP_water',
                        'heat_demand_space', 'heat_demand_space_COM', 'heat_demand_space_MFH',
                        'heat_demand_space_SFH', 'heat_demand_total', 'heat_demand_water',
                        'heat_demand_water_COM', 'heat_demand_water_MFH', 'heat_demand_water_SFH',
                        'heat_profile_space_COM', 'heat_profile_space_MFH', 'heat_profile_space_SFH',
                        'heat_profile_water_COM', 'heat_profile_water_MFH', 'heat_profile_water_SFH']

    rows_to_remove = selected_rows[selected_rows[columns_to_check].isnull().any(axis=1) | selected_rows[columns_to_check].eq(0.0).any(axis=1) | selected_rows[columns_to_check].eq('').any(axis=1)]

    # Remove identified rows from the country DataFrame
    country_df = country_df.drop(rows_to_remove.index)

    # Replace NaN values with the mean of each column
    country_df[columns_to_check] = country_df[columns_to_check].apply(lambda col: col.fillna(col.mean()))

    print(f"Finished processing {country_code} DataFrame.\n")

    # Update the country DataFrame in the dictionary
    country_dfs[country_code] = country_df


Processing HR DataFrame...
Finished processing HR DataFrame.

Processing DE DataFrame...
Finished processing DE DataFrame.

Skipping CH DataFrame as it has no 'heat_demand' columns.
Processing GB DataFrame...
Finished processing GB DataFrame.

Processing BG DataFrame...
Finished processing BG DataFrame.

Processing DK DataFrame...
Finished processing DK DataFrame.

Processing FR DataFrame...
Finished processing FR DataFrame.

Processing BE DataFrame...
Finished processing BE DataFrame.

Processing ES DataFrame...
Finished processing ES DataFrame.

Skipping NO DataFrame as it has no 'heat_demand' columns.
Processing FI DataFrame...
Finished processing FI DataFrame.

Processing LV DataFrame...
Finished processing LV DataFrame.

Processing RO DataFrame...
Finished processing RO DataFrame.

Processing PT DataFrame...
Finished processing PT DataFrame.

Processing LT DataFrame...
Finished processing LT DataFrame.

Processing CZ DataFrame...
Finished processing CZ DataFrame.

Processing PL Da

In [6]:
# Iterate through country DataFrames
for country_code, country_df in country_dfs.items():
    print(f"Checking NaN values in {country_code} DataFrame...")

    # Check for NaN values in each column
    nan_values = country_df.isnull().sum()

    # Print the result
    print(f"NaN values in {country_code} DataFrame:")
    print(nan_values)

    print(f"Finished checking NaN values for {country_code} DataFrame.\n")


Checking NaN values in HR DataFrame...
NaN values in HR DataFrame:
cet_cest_timestamp            0
year                          0
month                         0
day                           0
hour                          0
COP_ASHP_floor                0
COP_ASHP_radiator             0
COP_ASHP_water                0
COP_GSHP_floor                0
COP_GSHP_radiator             0
COP_GSHP_water                0
COP_WSHP_floor                0
COP_WSHP_radiator             0
COP_WSHP_water            70861
heat_demand_space             0
heat_demand_space_COM         0
heat_demand_space_MFH         0
heat_demand_space_SFH         0
heat_demand_total             0
heat_demand_water             0
heat_demand_water_COM         0
heat_demand_water_MFH         0
heat_demand_water_SFH         0
heat_profile_space_COM        0
heat_profile_space_MFH        0
heat_profile_space_SFH        0
heat_profile_water_COM        0
heat_profile_water_MFH        0
heat_profile_water_SFH        0
dtype

In [9]:
import pandas as pd

# Assuming df is your DataFrame and 'timestamp' is the column representing timestamps

# Step 1: Identify rows with NaN values
nan_rows = df[df.isnull().any(axis=1)]

# Step 2: Check timestamps
nan_rows['cet_cest_timestamp'] = pd.to_datetime(nan_rows['cet_cest_timestamp'])
current_year = pd.to_datetime('today').year

nan_rows = nan_rows[(nan_rows['cet_cest_timestamp'].dt.year >= (current_year - 2)) & (nan_rows['cet_cest_timestamp'].dt.year <= (current_year + 2))]

# Step 3: Replace NaN with mean of last 2 years and future 2 years
for col in nan_rows.columns:
    nan_rows[col].fillna(nan_rows[col].mean(), inplace=True)

# Step 4: Impute NaN values in previous years if needed
# Check and replace NaN values in the previous years

# Update the original DataFrame with the changes
df.update(nan_rows)

# Display the updated DataFrame
print(df)


                  cet_cest_timestamp  year  month  day  hour  COP_ASHP_floor  \
0      2008-01-01T03:30:00.000+05:30  2008      1    1     3             3.0   
1      2008-01-01T04:30:00.000+05:30  2008      1    1     4             3.0   
2      2008-01-01T05:30:00.000+05:30  2008      1    1     5             3.0   
3      2008-01-01T06:30:00.000+05:30  2008      1    1     6             3.0   
4      2008-01-01T07:30:00.000+05:30  2008      1    1     7             3.0   
...                              ...   ...    ...  ...   ...             ...   
77431  2016-10-31T19:30:00.000+05:30  2016     10   31    19             4.0   
77432  2016-10-31T20:30:00.000+05:30  2016     10   31    20             4.0   
77433  2016-10-31T21:30:00.000+05:30  2016     10   31    21             4.0   
77434  2016-10-31T22:30:00.000+05:30  2016     10   31    22             4.0   
77435  2016-10-31T23:30:00.000+05:30  2016     10   31    23             4.0   

       COP_ASHP_radiator  COP_ASHP_wate

In [10]:
# Iterate through country DataFrames
for country_code, country_df in country_dfs.items():
    print(f"Checking NaN values in {country_code} DataFrame...")

    # Check for NaN values in each column
    nan_values = country_df.isnull().sum()

    # Print the result
    print(f"NaN values in {country_code} DataFrame:")
    print(nan_values)

    print(f"Finished checking NaN values for {country_code} DataFrame.\n")


Checking NaN values in HR DataFrame...
NaN values in HR DataFrame:
cet_cest_timestamp            0
year                          0
month                         0
day                           0
hour                          0
COP_ASHP_floor            16158
COP_ASHP_radiator          5904
COP_ASHP_water             7949
COP_GSHP_floor            16152
COP_GSHP_radiator         11014
COP_GSHP_water             5708
COP_WSHP_floor                0
COP_WSHP_radiator             0
COP_WSHP_water            72327
heat_demand_space             0
heat_demand_space_COM         0
heat_demand_space_MFH         0
heat_demand_space_SFH         0
heat_demand_total             0
heat_demand_water             0
heat_demand_water_COM         0
heat_demand_water_MFH         0
heat_demand_water_SFH         0
heat_profile_space_COM        0
heat_profile_space_MFH        0
heat_profile_space_SFH        0
heat_profile_water_COM        0
heat_profile_water_MFH        0
heat_profile_water_SFH        0
dtype

In [6]:
import os
import pandas as pd

output_folder = "/Downloads/godsplan"

for country, df in country_dfs.items():
    # Save Pandas DataFrame to CSV in the specified folder
    csv_file_path = os.path.join(output_folder, f"{country}_preprocessed_data.csv")
    df.to_csv(csv_file_path, index=False)
