# Eurostat Data Analysis for Short-Stay Accommodations


This notebook provides an initial analysis of three Eurostat datasets relevant to short-stay accommodations:

- `ei_cphi_m`: Harmonised Index of Consumer Prices (HICP), monthly data.
- `ei_bsse_m_r2`: Services confidence indicator, monthly data.
- `ei_bsco_m`: Consumer confidence indicator, monthly data.

The analysis includes data loading, summary statistics, and initial visualizations.


In [2]:
#!pip install eurostat

import eurostat
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the datasets from Eurostat
hicp_data = eurostat.get_data_df('ei_cphi_m')
services_confidence_data = eurostat.get_data_df('ei_bsse_m_r2')
consumer_confidence_data = eurostat.get_data_df('ei_bsco_m')

In [3]:
hicp_data.head()

Unnamed: 0,freq,unit,s_adj,indic,geo\TIME_PERIOD,1996-01,1996-02,1996-03,1996-04,1996-05,...,2024-01,2024-02,2024-03,2024-04,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10
0,M,HICP2015,NSA,CP-HI00,AL,,,,,,...,128.98,129.39,130.36,130.37,130.32,130.07,129.83,130.22,,
1,M,HICP2015,NSA,CP-HI00,AT,71.34,71.55,71.77,71.69,71.62,...,132.6,133.26,134.12,134.27,134.39,134.44,133.95,133.77,134.13,134.66
2,M,HICP2015,NSA,CP-HI00,BE,69.91,69.98,70.12,70.47,70.61,...,127.51,130.54,131.05,130.72,130.98,131.56,130.76,132.73,132.21,133.13
3,M,HICP2015,NSA,CP-HI00,BG,,,,,,...,136.29,136.72,137.05,136.93,136.93,137.15,138.75,138.83,137.25,
4,M,HICP2015,NSA,CP-HI00,CH,,,,,,...,106.57,106.8,106.74,107.3,107.58,107.44,107.89,107.86,107.5,


In [4]:
services_confidence_data.head() 

Unnamed: 0,freq,indic,s_adj,unit,geo\TIME_PERIOD,1988-01,1988-02,1988-03,1988-04,1988-05,...,2024-01,2024-02,2024-03,2024-04,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10
0,M,BS-PE3M,NSA,BAL,AL,,,,,,...,10.9,9.5,8.8,13.5,8.2,12.6,8.1,6.9,4.0,8.4
1,M,BS-PE3M,NSA,BAL,AT,,,,,,...,42.4,20.9,18.1,10.4,12.7,8.0,9.8,3.6,8.2,14.5
2,M,BS-PE3M,NSA,BAL,BE,,,,,,...,42.2,16.4,9.0,8.1,6.0,9.5,7.0,7.2,7.1,9.5
3,M,BS-PE3M,NSA,BAL,BG,,,,,,...,13.7,13.2,13.3,14.5,8.2,8.5,4.9,2.7,0.9,3.9
4,M,BS-PE3M,NSA,BAL,CY,,,,,,...,21.4,36.5,43.7,32.1,27.8,40.3,27.7,24.8,28.2,5.7


In [5]:
consumer_confidence_data.head()

Unnamed: 0,freq,indic,s_adj,unit,geo\TIME_PERIOD,1980-01,1980-02,1980-03,1980-04,1980-05,...,2024-01,2024-02,2024-03,2024-04,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10
0,M,BS-CSMCI,NSA,BAL,AL,,,,,,...,-12.8,-14.6,-14.3,-10.2,-13.2,-12.9,-12.5,-11.5,-11.4,-11.8
1,M,BS-CSMCI,NSA,BAL,AT,,,,,,...,-18.1,-15.9,-16.2,-13.2,-14.9,-12.2,-14.4,-13.1,-14.1,-16.1
2,M,BS-CSMCI,NSA,BAL,BE,,,,,,...,-10.7,-11.9,-11.1,-10.5,-10.1,-6.5,-8.5,-7.1,-9.7,-10.2
3,M,BS-CSMCI,NSA,BAL,BG,,,,,,...,-18.7,-20.6,-18.8,-19.8,-18.1,-14.3,-19.1,-18.3,-15.6,-16.3
4,M,BS-CSMCI,NSA,BAL,CY,,,,,,...,-21.8,-26.5,-23.7,-26.5,-25.3,-20.7,-22.1,-27.0,-25.5,-27.3


In [3]:
import eurostat
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load Eurostat datasets 
# For "demo_r_d2jan" dataset: demography
#demo_data = eurostat.get_data_df('demo_r_d2jan')

# For "tour_ce_omn12" dataset: Local information about tourism
tourism_data = eurostat.get_data_df('tour_ce_omn12')


In [4]:
import pandas as pd



# Replace this with your actual dataset
df = pd.DataFrame(tourism_data)

# Step 1: Convert month codes to readable format
month_mapping = {
    'M01': 'January', 'M02': 'February', 'M03': 'March', 'M04': 'April',
    'M05': 'May', 'M06': 'June', 'M07': 'July', 'M08': 'August',
    'M09': 'September', 'M10': 'October', 'M11': 'November', 'M12': 'December',
    'TOTAL': 'Total'
}
df['month'] = df['month'].map(month_mapping)

# Step 2: Melt year columns into a single "Year" and "Value" format
df_melted = df.melt(id_vars=['freq', 'indic_to', 'c_resid', 'month', 'unit', 'geo\\TIME_PERIOD'], 
                    var_name='Year', 
                    value_name='Value')

# Step 3: Split geo\TIME_PERIOD into geo_code and region_name if needed
# This is hypothetical as you will have to handle different regions differently
df_melted['geo_code'] = df_melted['geo\\TIME_PERIOD'].str.extract(r'([A-Z0-9]+)')
df_melted['region_name'] = df_melted['geo\\TIME_PERIOD'].str.extract(r'\[(.*?)\]')

# Step 4: Ensure categorical columns are consistent (capitalize, standardize as needed)
df_melted['indic_to'] = df_melted['indic_to'].str.upper()
df_melted['c_resid'] = df_melted['c_resid'].str.upper()

# Step 5: Handle missing data if necessary
df_melted['Value'] = df_melted['Value'].fillna(0)  # Replace NaN with 0 or any other strategy

# Now df_melted is ready for visualizations and comparisons
print(df_melted.head())


  freq indic_to c_resid    month unit geo\TIME_PERIOD  Year    Value geo_code  \
0    A     LSTY     DOM  January   NR              AT  2018  23783.0       AT   
1    A     LSTY     DOM  January   NR             AT1  2018   8096.0      AT1   
2    A     LSTY     DOM  January   NR            AT11  2018    239.0     AT11   
3    A     LSTY     DOM  January   NR            AT12  2018    790.0     AT12   
4    A     LSTY     DOM  January   NR            AT13  2018   7067.0     AT13   

  region_name  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  


In [7]:
print(tourism_data)

      freq indic_to c_resid  month unit geo\TIME_PERIOD      2018      2019  \
0        A     LSTY     DOM    M01   NR              AT   23783.0   27923.0   
1        A     LSTY     DOM    M01   NR             AT1    8096.0    9799.0   
2        A     LSTY     DOM    M01   NR            AT11     239.0     374.0   
3        A     LSTY     DOM    M01   NR            AT12     790.0    1323.0   
4        A     LSTY     DOM    M01   NR            AT13    7067.0    8102.0   
...    ...      ...     ...    ...  ...             ...       ...       ...   
45040    A      STY   TOTAL  TOTAL   NR             SK0  201570.0  282091.0   
45041    A      STY   TOTAL  TOTAL   NR            SK01   83713.0  127479.0   
45042    A      STY   TOTAL  TOTAL   NR            SK02   16825.0   22662.0   
45043    A      STY   TOTAL  TOTAL   NR            SK03   57106.0   68598.0   
45044    A      STY   TOTAL  TOTAL   NR            SK04   43926.0   63352.0   

           2020      2021      2022      2023     2

In [8]:
import pandas as pd
pd.DataFrame(tourism_data)
tourism_data.to_excel('tourism_data.xlsx')

## Checking for Missing Values

In [11]:

# Check for missing values in each dataset
print("Missing values in HICP dataset:")
print(hicp_data.isnull().sum())

print("\nMissing values in Services Confidence dataset:")
print(services_confidence_data.isnull().sum())

print("\nMissing values in Consumer Confidence dataset:")
print(consumer_confidence_data.isnull().sum())


Missing values in HICP dataset:
freq                  0
unit                  0
s_adj                 0
indic                 0
geo\TIME_PERIOD       0
                   ... 
2024-06             150
2024-07             150
2024-08             150
2024-09             225
2024-10            2967
Length: 351, dtype: int64

Missing values in Services Confidence dataset:
freq                0
indic               0
s_adj               0
unit                0
geo\TIME_PERIOD     0
                   ..
2024-06            12
2024-07            12
2024-08            12
2024-09            12
2024-10            24
Length: 447, dtype: int64

Missing values in Consumer Confidence dataset:
freq                0
indic               0
s_adj               0
unit                0
geo\TIME_PERIOD     0
                   ..
2024-06            24
2024-07            24
2024-08            24
2024-09            24
2024-10            48
Length: 543, dtype: int64


## Checking for Duplicate Rows

In [12]:

# Check for duplicates in each dataset
print("Duplicate rows in HICP dataset:", hicp_data.duplicated().sum())
print("Duplicate rows in Services Confidence dataset:", services_confidence_data.duplicated().sum())
print("Duplicate rows in Consumer Confidence dataset:", consumer_confidence_data.duplicated().sum())


Duplicate rows in HICP dataset: 0
Duplicate rows in Services Confidence dataset: 0
Duplicate rows in Consumer Confidence dataset: 0


## Summary Statistics

In [13]:

# Descriptive statistics for numerical columns in each dataset
print("Descriptive statistics for HICP dataset:")
print(hicp_data.describe())

print("\nDescriptive statistics for Services Confidence dataset:")
print(services_confidence_data.describe())

print("\nDescriptive statistics for Consumer Confidence dataset:")
print(consumer_confidence_data.describe())


Descriptive statistics for HICP dataset:
          1996-01      1996-02      1996-03      1996-04      1996-05  \
count  588.000000  1176.000000  1176.000000  1176.000000  1176.000000   
mean    61.083214    31.016514    31.461947    31.562543    31.551122   
std     29.874708    36.926418    37.058848    37.267757    37.442016   
min      0.440000   -20.900000    -1.000000    -2.200000    -3.200000   
25%     46.312500     0.400000     0.400000     0.400000     0.300000   
50%     61.160000     3.700000     4.230000     3.885000     3.160000   
75%     71.627500    61.217500    62.237500    62.677500    62.922500   
max    234.400000   236.600000   237.500000   237.300000   236.800000   

           1996-06      1996-07      1996-08      1996-09      1996-10  ...  \
count  1176.000000  1176.000000  1176.000000  1176.000000  1176.000000  ...   
mean     31.396905    31.421964    31.369235    32.016054    31.887696  ...   
std      37.605337    37.501655    37.458444    37.496245    37.

## Initial Visualizations

In [14]:

# Sample visualizations: time series plots for each dataset

# Convert 'time' column to datetime format if necessary
hicp_data['time'] = pd.to_datetime(hicp_data['time'], errors='coerce')
services_confidence_data['time'] = pd.to_datetime(services_confidence_data['time'], errors='coerce')
consumer_confidence_data['time'] = pd.to_datetime(consumer_confidence_data['time'], errors='coerce')

# HICP Plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=hicp_data, x='time', y='values', hue='geo', legend=None)
plt.title('HICP Over Time')
plt.xlabel('Time')
plt.ylabel('HICP Value')
plt.show()

# Services Confidence Indicator Plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=services_confidence_data, x='time', y='values', hue='geo', legend=None)
plt.title('Services Confidence Indicator Over Time')
plt.xlabel('Time')
plt.ylabel('Confidence Indicator')
plt.show()

# Consumer Confidence Indicator Plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=consumer_confidence_data, x='time', y='values', hue='geo', legend=None)
plt.title('Consumer Confidence Indicator Over Time')
plt.xlabel('Time')
plt.ylabel('Confidence Indicator')
plt.show()


KeyError: 'time'