In [1]:
import pandas as pd
import numpy as np

In [2]:

# Read the Verbraucher Preisindex CSV
# Read the CSV with semicolon separator and specify encoding if needed
df_vpi = pd.read_csv("Verbraucherpreisindex.csv", sep=';', header=None, 
                        names=['Jahr', 'Monat', 'VPI'], 
                        encoding='utf-8')  # Adjust encoding if needed



df_vpi

Unnamed: 0,Jahr,Monat,VPI
0,2013,Januar,974
1,2013,Februar,980
2,2013,März,984
3,2013,April,980
4,2013,Mai,984
...,...,...,...
91,2020,August,1060
92,2020,September,1058
93,2020,Oktober,1059
94,2020,November,1050


In [3]:
print(df_vpi.head())

   Jahr    Monat   VPI
0  2013   Januar  97,4
1  2013  Februar  98,0
2  2013     März  98,4
3  2013    April  98,0
4  2013      Mai  98,4


In [4]:
# Create a mapping of German month names to numbers
month_mapping = {
    'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 
    'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 
    'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12
}

df_vpi['Monat'] = df_vpi['Monat'].map(month_mapping)

df_vpi.head()


# # Convert month names to numbers
# df_vpi['Monat_Nummer'] = df_vpi['Monat'].map(month_mapping)

# # Create a datetime column
# df_vpi['Datum'] = pd.to_datetime(
#     df_vpi['Jahr'].astype(str) + '-' + 
#     df_vpi['Monat_Nummer'].astype(str) + 
#     '-01'
# )

Unnamed: 0,Jahr,Monat,VPI
0,2013,1,974
1,2013,2,980
2,2013,3,984
3,2013,4,980
4,2013,5,984


In [5]:
df_vpi['VPI'] = df_vpi['VPI'].str.replace(',', '.')
df_vpi['VPI'] = df_vpi['VPI'].astype(float)
df_vpi.head()

Unnamed: 0,Jahr,Monat,VPI
0,2013,1,97.4
1,2013,2,98.0
2,2013,3,98.4
3,2013,4,98.0
4,2013,5,98.4


In [6]:
df_vpi['Datum'] = pd.to_datetime(df_vpi['Jahr'].astype(str) + '-' + df_vpi['Monat'].astype(str) + '-01')
df_vpi.head()

Unnamed: 0,Jahr,Monat,VPI,Datum
0,2013,1,97.4,2013-01-01
1,2013,2,98.0,2013-02-01
2,2013,3,98.4,2013-03-01
3,2013,4,98.0,2013-04-01
4,2013,5,98.4,2013-05-01


In [7]:
# Keep only Datum and VPI
df_vpi = df_vpi[['Datum', 'VPI']]
df_vpi

Unnamed: 0,Datum,VPI
0,2013-01-01,97.4
1,2013-02-01,98.0
2,2013-03-01,98.4
3,2013-04-01,98.0
4,2013-05-01,98.4
...,...,...
91,2020-08-01,106.0
92,2020-09-01,105.8
93,2020-10-01,105.9
94,2020-11-01,105.0


In [8]:
# Create a date range for all days
date_range = pd.date_range(start='2013-07-01', end='2019-07-31', freq='D')

# Create a new DataFrame with this date range
df_vpi_daily = pd.DataFrame({'Datum': date_range})

# Merge with original VPI data, using the month start as the key
df_vpi_daily['year_month'] = df_vpi_daily['Datum'].dt.to_period('M')
df_vpi['year_month'] = df_vpi['Datum'].dt.to_period('M')

# Merge and cleanup
df_vpi_daily = df_vpi_daily.merge(df_vpi[['year_month', 'VPI']], on='year_month')
df_vpi_daily = df_vpi_daily[['Datum', 'VPI']]

#df_vpi_daily['VPI'] = df_vpi_daily['VPI'] / 106.2
df_vpi_daily

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_vpi['year_month'] = df_vpi['Datum'].dt.to_period('M')


Unnamed: 0,Datum,VPI
0,2013-07-01,98.9
1,2013-07-02,98.9
2,2013-07-03,98.9
3,2013-07-04,98.9
4,2013-07-05,98.9
...,...,...
2217,2019-07-27,106.2
2218,2019-07-28,106.2
2219,2019-07-29,106.2
2220,2019-07-30,106.2


In [9]:
df = df_vpi_daily

df['VPI'] = (df['VPI'] - df['VPI'].min()) / (df['VPI'].max() - df['VPI'].min())
df['VPI'] = df['VPI'].round(3)

df
    

Unnamed: 0,Datum,VPI
0,2013-07-01,0.052
1,2013-07-02,0.052
2,2013-07-03,0.052
3,2013-07-04,0.052
4,2013-07-05,0.052
...,...,...
2217,2019-07-27,1.000
2218,2019-07-28,1.000
2219,2019-07-29,1.000
2220,2019-07-30,1.000


In [10]:
df_vpi_daily.to_csv('/workspaces/bakery_sales_prediction/0_DataPreparation/02_Verbraucherpreisindex/vpi_daily.csv', index=False)