In [45]:
# Import necessary libraries
import pandas as pd

# Function to read a Feather file
def read_feather_file(file_path):
    """Load a Feather file into a pandas DataFrame.

    Args:
        file_path: Location of the Feather file; handed unchanged to
            ``pd.read_feather``.

    Returns:
        The loaded DataFrame, or ``None`` if reading raised any
        ``Exception`` (the error text is printed instead of re-raised).
    """
    try:
        frame = pd.read_feather(file_path)
    except Exception as e:
        # Report the problem and signal failure with None rather than raising.
        print(f"An error occurred: {e}")
        return None
    else:
        return frame

# File paths (bare file names, resolved against the current working directory)
telemetry_large_consumers_file = 'telemetry_large_consumers_DCW.feather'
standard_profiles_file = 'standard_profiles_DCW.feather'
customer_attributes_file = 'customer_attributes_DCW.feather'

# Read the files.
# The telemetry frame is loaded here because later cells call
# `telemetry_large_consumers_df.head(...)`; with this load commented out
# those cells raise NameError on a fresh kernel.
# read_feather_file prints the error and returns None if a file cannot be read.
telemetry_large_consumers_df = read_feather_file(telemetry_large_consumers_file)
standard_profiles_df = read_feather_file(standard_profiles_file)
customer_attributes_df = read_feather_file(customer_attributes_file)

In [6]:
# Initial exploration: preview the first five rows of the telemetry frame
# to get a feel for its basic structure.
print("Telemetry Large Consumers:")
telemetry_large_consumers_df.head(n=5)

Telemetry Large Consumers:


Unnamed: 0,RND_ID,2023-01-01 00:00,2023-01-01 00:15,2023-01-01 00:30,2023-01-01 00:45,2023-01-01 01:00,2023-01-01 01:15,2023-01-01 01:30,2023-01-01 01:45,2023-01-01 02:00,...,2023-12-31 21:30,2023-12-31 21:45,2023-12-31 22:00,2023-12-31 22:15,2023-12-31 22:30,2023-12-31 22:45,2023-12-31 23:00,2023-12-31 23:15,2023-12-31 23:30,2023-12-31 23:45
0,8423,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6756,36.0,32.0,32.0,36.0,32.0,36.0,32.0,36.0,32.0,...,35.89,35.43,34.09,33.31,30.57,29.65,32.06,29.13,28.25,27.85
2,1077,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,...,2.0,1.5,1.5,2.0,1.5,1.5,2.0,1.5,2.0,1.5
3,8061,16.0,8.0,12.0,8.0,12.0,16.0,12.0,12.0,12.0,...,8.4,8.8,8.8,11.0,10.19,9.19,8.8,9.0,11.0,12.4
4,10575,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,...,6.67,6.66,3.54,1.15,1.1,1.1,1.1,1.1,1.1,1.12


In [11]:
import pandas as pd

# Requires `telemetry_large_consumers_df` to already be loaded as a pandas DataFrame.

# Keep only the leading 100 rows as a small sample ...
df_first_100 = telemetry_large_consumers_df.head(n=100)

# ... and write that sample out as its own Feather file.
df_first_100.to_feather(path='telemetry_first_100.feather')

In [7]:
# Preview the first five rows of the standard profiles frame.
print("\nStandard Profiles:")
standard_profiles_df.head(n=5)


Standard Profiles:


Unnamed: 0,DATUM_TIJDSTIP,PROFIEL,WAARDE
0,2023-10-06 07:30,KVKSEGMENT_7,4.1e-05
1,2023-08-27 14:15,KVKSEGMENT_9,1.9e-05
2,2023-02-25 18:00,KVKSEGMENT_18,3.2e-05
3,2023-11-29 00:15,KVKSEGMENT_12,1.9e-05
4,2023-04-26 17:45,KVKSEGMENT_7,2.4e-05


In [8]:
# Preview the first five rows of the customer attributes frame.
print("\nCustomer Attributes:")
customer_attributes_df.head(n=5)


Customer Attributes:


Unnamed: 0,RND_ID,BASELOAD_PROFILE,AANSLUITCATEGORIE
0,8423,010,AC4A
1,6756,E3B,AC4B
2,1077,008,AC4A
3,8061,001,AC4B
4,10575,008,AC4B


In [8]:
# List the distinct profile names. At this point in the notebook the frame
# still has the column names from the file (the Dutch-to-English rename
# happens in a later cell), so the column is 'PROFIEL', not 'PROFILE'.
standard_profiles_df['PROFIEL'].unique()

array(['KVKSEGMENT_7', 'KVKSEGMENT_9', 'KVKSEGMENT_18', 'KVKSEGMENT_12',
       'KVKSEGMENT_8', 'KVKSEGMENT_1', 'KVKSEGMENT_15', 'KVKSEGMENT_3',
       'KVKSEGMENT_10', 'KVKSEGMENT_11', 'KVKSEGMENT_20', 'KVKSEGMENT_4',
       'KVKSEGMENT_19', 'KVKSEGMENT_14', 'KVKSEGMENT_17', 'KVKSEGMENT_13',
       'KVKSEGMENT_16', 'PV', 'KVKSEGMENT_5', 'KVKSEGMENT_6',
       'KVKSEGMENT_2', 'WIND_KLEIN'], dtype=object)

In [57]:
from googletrans import Translator
# Create the translation client that translate_column_names relies on
translator = Translator()

# Reload the standard profiles from disk, so the frame starts from the
# column names stored in the file
standard_profiles_df = read_feather_file(standard_profiles_file)

# Function to translate column names
def translate_column_names(dataframe):
    """Rename a DataFrame's columns to their Dutch-to-English translations.

    Every column label is passed to the module-level ``translator`` with
    ``src='nl'`` and ``dest='en'``; the ``.text`` of each result becomes the
    new label, so the final names are whatever the translator returns.
    The rename is applied in place, i.e. the caller's frame is modified.

    Args:
        dataframe: DataFrame whose column labels should be translated.

    Returns:
        The same ``dataframe`` object, now carrying the translated labels.
    """
    translations = {}
    for col in dataframe.columns:
        translated = translator.translate(col, src='nl', dest='en')
        translations[col] = translated.text
    dataframe.rename(columns=translations, inplace=True)
    return dataframe


# Translate the column names of both frames from Dutch to English
standard_profiles_df, customer_attributes_df = (
    translate_column_names(frame)
    for frame in (standard_profiles_df, customer_attributes_df)
)


In [58]:
# Show the first five rows of the standard profiles frame with its translated column names
print(standard_profiles_df.head(n=5))

       Date of time        PROFILE     VALUE
0  2023-10-06 07:30   KVKSEGMENT_7  0.000041
1  2023-08-27 14:15   KVKSEGMENT_9  0.000019
2  2023-02-25 18:00  KVKSEGMENT_18  0.000032
3  2023-11-29 00:15  KVKSEGMENT_12  0.000019
4  2023-04-26 17:45   KVKSEGMENT_7  0.000024


In [60]:
# Work on an independent copy: the next cell overwrites df['Date of time'] in
# place, and a plain alias would make that overwrite the timestamps of the
# loaded standard_profiles_df as well.
df = standard_profiles_df.copy()

In [61]:
# Parse 'Date of time' into datetimes and collapse every timestamp to the
# start of its calendar year (yearly period -> period start timestamp)
df['Date of time'] = (
    pd.to_datetime(df['Date of time'])
    .dt.to_period('Y')
    .dt.to_timestamp()
)

# Sum the remaining columns for each (PROFILE, year) combination
grouped_df = (
    df.groupby(['PROFILE', 'Date of time'])
    .sum()
    .reset_index()
)

print(grouped_df)

          PROFILE Date of time         VALUE
0    KVKSEGMENT_1   2023-01-01      1.000000
1   KVKSEGMENT_10   2023-01-01      1.000000
2   KVKSEGMENT_11   2023-01-01      1.000000
3   KVKSEGMENT_12   2023-01-01      1.000000
4   KVKSEGMENT_13   2023-01-01      1.000000
5   KVKSEGMENT_14   2023-01-01      1.000000
6   KVKSEGMENT_15   2023-01-01      1.000000
7   KVKSEGMENT_16   2023-01-01      1.000000
8   KVKSEGMENT_17   2023-01-01      1.000000
9   KVKSEGMENT_18   2023-01-01      1.000000
10  KVKSEGMENT_19   2023-01-01      1.000000
11   KVKSEGMENT_2   2023-01-01      1.000000
12  KVKSEGMENT_20   2023-01-01      1.000000
13   KVKSEGMENT_3   2023-01-01      1.000000
14   KVKSEGMENT_4   2023-01-01      1.000000
15   KVKSEGMENT_5   2023-01-01      1.000000
16   KVKSEGMENT_6   2023-01-01      1.000000
17   KVKSEGMENT_7   2023-01-01      1.000000
18   KVKSEGMENT_8   2023-01-01      1.000000
19   KVKSEGMENT_9   2023-01-01      1.000000
20             PV   2023-01-01  -5235.721599
21     WIN

### The KVKSEGMENT profiles are segments defined by the Dutch Chamber of Commerce (Kamer van Koophandel, KvK)

In [5]:
# Show the first five rows of the customer attributes frame with its translated column names
print(customer_attributes_df.head(n=5))

   Rnd_id Baseload_profile Connection category
0    8423              010                AC4A
1    6756              E3B                AC4B
2    1077              008                AC4A
3    8061              001                AC4B
4   10575              008                AC4B


In [6]:
# List the distinct values of the 'Baseload_profile' column.
# NOTE(review): this column label is produced by the translation step, so it
# depends on what the translator returned — confirm it is stable.
customer_attributes_df['Baseload_profile'].unique()

array(['010', 'E3B', '008', '001', '013', 'PV', '004', 'E3A', 'E3C',
       '014', '018', '002', '017', '006', 'E3D', '009', '012', '019',
       '016', '007', '015', '020', '005', '011', 'WIND_KLEIN'],
      dtype=object)

In [7]:
# Summarize the customer attributes frame: columns, non-null counts, dtypes and memory usage
customer_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17102 entries, 0 to 17101
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Rnd_id               17102 non-null  int32 
 1   Baseload_profile     17102 non-null  object
 2   Connection category  17102 non-null  object
dtypes: int32(1), object(2)
memory usage: 334.2+ KB
