In [122]:
import os
import pandas as pd
from pandas import Series, DataFrame
import country_converter as coco
import warnings
warnings.filterwarnings("ignore")

Data Sources:
- Eurostat

In [123]:
## Report the directory the notebook is being run from
notebook_dir = os.getcwd()
print("Current working directory:", notebook_dir)

## The data folders are siblings of the notebook's folder (one level up)
project_root = os.path.join(notebook_dir, '..')

## Folder holding the raw input files
raw_data_path = os.path.abspath(os.path.join(project_root, 'raw_data'))

## Folder the cleaned csv files are written to
cleaned_data_path = os.path.abspath(os.path.join(project_root, 'cleaned_data'))

Current working directory: /Users/jennagreene/Documents/GitHub/HATCH_data/reading_files


In [124]:
# Locate the sub-folder of raw_data that holds the Eurostat workbooks
# (nothing is read here; the individual files are loaded further down)
eurostat_folder = 'eurostat'
eurostat_folder_path = os.path.join(
    raw_data_path,
    eurostat_folder,
)

In [125]:
# List the Excel workbooks available in the Eurostat folder.
# A new list is built instead of calling .remove() while looping over the same
# list: removing during iteration shifts the remaining items, so the entry right
# after each removed one is never inspected and non-Excel files can slip through.
# endswith() is used so that only the file extension is matched, not any name
# that merely contains '.xlsx' somewhere in the middle.
eurostat = [file for file in os.listdir(eurostat_folder_path) if file.endswith('.xlsx')]
eurostat

['airfleets_europe.xlsx',
 'eurostat_internetdevices.xlsx',
 'Motorcycles_Eurostat.xlsx',
 'FlushToilets_Europe.xlsx',
 'GSHP_Eurostat.xlsx',
 'roads_europe.xlsx']

### Airfleets

Read Airfleets data

In [126]:
# Load 'Sheet 1' of the airfleets workbook: column labels come from row index 7,
# the last 5 rows of the sheet are skipped, and '..' cells are read as missing.
# NOTE(review): every other workbook in this notebook uses ':' as the missing-value
# marker - confirm that '..' is the right marker for this file.
eurostat_file_path = os.path.join(eurostat_folder_path, 'airfleets_europe.xlsx')
airfleets = pd.read_excel(
    eurostat_file_path,
    sheet_name='Sheet 1',
    header=7,
    skipfooter=5,
    na_values='..',
)

In [127]:
# Remove the unlabeled columns 'Unnamed: 2', 'Unnamed: 4', ..., 'Unnamed: 44'
# (presumably Eurostat's per-year flag columns - TODO confirm), drop the row
# labelled 0, and rename the 'TIME' column to 'Country Name'.
airfleets_unnamed_columns = [f'Unnamed: {position}' for position in range(2, 46, 2)]
airfleets = (
    airfleets
    .drop(columns=airfleets_unnamed_columns)
    .drop(index=0)
    .rename(columns={'TIME': 'Country Name'})
)

# Descriptive columns that are constant for every row of this table
airfleets = airfleets.assign(**{
    'Data Source': 'Eurostat',
    'Technology Name': 'Commercial Aircrafts',
    'Spatial Scale': 'National',
    'Metric': 'Total Number',
    'Unit': '-',
})

# ISO2 country code looked up from the country name
airfleets['Country Code'] = coco.convert(names=airfleets['Country Name'], to='iso2')

# Row identifier: <Technology Name>_<Metric>_<Country Code>, used as the index
airfleets['ID'] = (
    airfleets['Technology Name']
    + '_' + airfleets['Metric']
    + '_' + airfleets['Country Code']
)
airfleets = airfleets.set_index('ID')

Save Airfleet Data

In [128]:
# Write the cleaned airfleets table to the cleaned_data folder.
# The frame is indexed by 'ID', so the index has to be written out: with
# index=False the ID built for each row was silently left out of the csv.
output_file = 'airfleets.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

airfleets.to_csv(output_file_path, index=True, index_label='ID')
print("Data saved to:", output_file_path)

Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/airfleets.csv


### Flush Toilets

In [129]:
# Load 'Sheet 1' of the flush toilets workbook: column labels come from row
# index 11, the last 9 rows of the sheet are skipped, and ':' cells are read
# as missing values.
eurostat_file_path = os.path.join(eurostat_folder_path, 'FlushToilets_Europe.xlsx')
flushtoilets = pd.read_excel(
    eurostat_file_path,
    sheet_name='Sheet 1',
    header=11,
    skipfooter=9,
    na_values=':',
)

In [130]:
# Remove the unlabeled columns 'Unnamed: 2', 'Unnamed: 4', ..., 'Unnamed: 22'
# (presumably Eurostat's per-year flag columns - TODO confirm), drop the rows
# labelled 0-5, and rename the 'TIME' column to 'Country Name'.
flushtoilets = flushtoilets.drop(columns=[f'Unnamed: {position}' for position in range(2, 24, 2)])
flushtoilets = flushtoilets.drop(index=[0, 1, 2, 3, 4, 5])
flushtoilets = flushtoilets.rename(columns={'TIME': 'Country Name'})

# flushtoilets contains households without flush toilets, subtract from 100 to get share with flush toilets.
# Every numeric column is converted, not only float64 ones: a column that pandas
# read as integers (whole numbers, no missing cells) would otherwise be left
# holding the "without" share while its neighbours hold the "with" share.
flushtoilets = flushtoilets.apply(
    lambda column: 100 - column if pd.api.types.is_numeric_dtype(column) else column
)

In [131]:
# Descriptive columns that are constant for every row of this table
flushtoilets = flushtoilets.assign(**{
    'Data Source': 'Eurostat',
    'Technology Name': 'Flush toilet',
    'Spatial Scale': 'National',
    'Metric': 'Share of Households',
    'Unit': '%',
})

# ISO2 country code looked up from the country name
flushtoilets['Country Code'] = coco.convert(names=flushtoilets['Country Name'], to='iso2')

# Row identifier: <Technology Name>_<Metric>_<Country Code>, used as the index
flushtoilets['ID'] = (
    flushtoilets['Technology Name']
    + '_' + flushtoilets['Metric']
    + '_' + flushtoilets['Country Code']
)
flushtoilets = flushtoilets.set_index('ID')

Save Flush Toilets Data

In [132]:
# Write the cleaned flush toilets table to the cleaned_data folder.
# The frame is indexed by 'ID', so the index has to be written out: with
# index=False the ID built for each row was silently left out of the csv.
output_file = 'flushtoilets.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

flushtoilets.to_csv(output_file_path, index=True, index_label='ID')
print("Data saved to:", output_file_path)

Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/flushtoilets.csv


### Ground Source Heat Pumps (GSHPs)

In [133]:
# Load 'Sheet 1' of the ground source heat pump workbook: column labels come
# from row index 9, the last 5 rows of the sheet are skipped, and ':' cells are
# read as missing values.
eurostat_file_path = os.path.join(eurostat_folder_path, 'GSHP_Eurostat.xlsx')
gshp = pd.read_excel(
    eurostat_file_path,
    sheet_name='Sheet 1',
    header=9,
    skipfooter=5,
    na_values=':',
)

In [134]:
# Drop the rows labelled 0, 1 and 2, then rename the 'TIME' column to
# 'Country Name'
gshp = (
    gshp
    .drop(index=[0, 1, 2])
    .rename(columns={'TIME': 'Country Name'})
)

In [135]:
# Descriptive columns that are constant for every row of this table
gshp = gshp.assign(**{
    'Data Source': 'Eurostat',
    'Technology Name': 'Ground Source Heat Pumps',
    'Spatial Scale': 'National',
    'Metric': 'Installed thermal capacity',
    'Unit': 'GW',
})

# ISO2 country code looked up from the country name
gshp['Country Code'] = coco.convert(names=gshp['Country Name'], to='iso2')

# Row identifier: <Technology Name>_<Metric>_<Country Code>, used as the index
gshp['ID'] = (
    gshp['Technology Name']
    + '_' + gshp['Metric']
    + '_' + gshp['Country Code']
)
gshp = gshp.set_index('ID')

In [136]:
# Write the cleaned ground source heat pump table to the cleaned_data folder.
# The frame is indexed by 'ID', so the index has to be written out: with
# index=False the ID built for each row was silently left out of the csv.
output_file = 'gshp_europe.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

gshp.to_csv(output_file_path, index=True, index_label='ID')
print("Data saved to:", output_file_path)

Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/gshp_europe.csv


### Motorcycles

In [137]:
# Load 'Sheet 1' of the motorcycles workbook: column labels come from row
# index 8, the last 9 rows of the sheet are skipped, and ':' cells are read as
# missing values.
eurostat_file_path = os.path.join(eurostat_folder_path, 'Motorcycles_Eurostat.xlsx')
motorcycles = pd.read_excel(
    eurostat_file_path,
    sheet_name='Sheet 1',
    header=8,
    skipfooter=9,
    na_values=':',
)

In [138]:
# Remove the unlabeled columns 'Unnamed: 2', 'Unnamed: 4', ..., 'Unnamed: 104'
# (presumably Eurostat's per-year flag columns - TODO confirm), drop the rows
# labelled 0 and 1, and rename the 'TIME' column to 'Country Name'.
motorcycles_unnamed_columns = [f'Unnamed: {position}' for position in range(2, 106, 2)]
motorcycles = (
    motorcycles
    .drop(columns=motorcycles_unnamed_columns)
    .drop(index=[0, 1])
    .rename(columns={'TIME': 'Country Name'})
)


In [139]:
# Descriptive columns that are constant for every row of this table
motorcycles = motorcycles.assign(**{
    'Data Source': 'Eurostat',
    'Technology Name': 'Motorcycles',
    'Spatial Scale': 'National',
    'Metric': 'Total Number',
    'Unit': '-',
})

# ISO2 country code looked up from the country name
motorcycles['Country Code'] = coco.convert(names=motorcycles['Country Name'], to='iso2')

# Row identifier: <Technology Name>_<Metric>_<Country Code>, used as the index
motorcycles['ID'] = (
    motorcycles['Technology Name']
    + '_' + motorcycles['Metric']
    + '_' + motorcycles['Country Code']
)
motorcycles = motorcycles.set_index('ID')

Save motorcycle data

In [140]:
# Write the cleaned motorcycles table to the cleaned_data folder.
# The frame is indexed by 'ID', so the index has to be written out: with
# index=False the ID built for each row was silently left out of the csv.
output_file = 'motorcycles_europe.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

motorcycles.to_csv(output_file_path, index=True, index_label='ID')
print("Data saved to:", output_file_path)

Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/motorcycles_europe.csv


### Paved and Unpaved Roads

In [141]:
# Load 'Sheet 1' of the roads workbook: column labels come from row index 8,
# the last 9 rows of the sheet are skipped, and ':' cells are read as missing
# values.
eurostat_file_path = os.path.join(eurostat_folder_path, 'roads_europe.xlsx')
roads_europe = pd.read_excel(
    eurostat_file_path,
    sheet_name='Sheet 1',
    header=8,
    skipfooter=9,
    na_values=':',
)

In [142]:
# Remove the unlabeled columns 'Unnamed: 2', 'Unnamed: 4', ..., 'Unnamed: 104'
# (presumably Eurostat's per-year flag columns - TODO confirm), drop the row
# labelled 0, and rename the 'TIME' column to 'Country Name'.
roads_unnamed_columns = [f'Unnamed: {position}' for position in range(2, 106, 2)]
roads_europe = (
    roads_europe
    .drop(columns=roads_unnamed_columns)
    .drop(index=0)
    .rename(columns={'TIME': 'Country Name'})
)


In [143]:
## Convert km to miles to be consistent with US Public roads data
## Every numeric column is converted, not only float64 ones: a column that pandas
## read as integers (whole numbers, no missing cells) would otherwise stay in km
## while the table is labelled as miles.
## NOTE: re-running this cell converts the values a second time; re-run the
## read/clean cells above first.
km_to_miles = 0.621371
roads_europe = roads_europe.apply(
    lambda column: column * km_to_miles if pd.api.types.is_numeric_dtype(column) else column
)


In [144]:
# Descriptive columns that are constant for every row of this table
roads_europe = roads_europe.assign(**{
    'Data Source': 'Eurostat',
    'Technology Name': 'Public Roads',
    'Spatial Scale': 'National',
    'Metric': 'Total Length',
    'Unit': 'miles',
})

# ISO2 country code looked up from the country name
roads_europe['Country Code'] = coco.convert(names=roads_europe['Country Name'], to='iso2')

# Row identifier: <Technology Name>_<Metric>_<Country Code>, used as the index
roads_europe['ID'] = (
    roads_europe['Technology Name']
    + '_' + roads_europe['Metric']
    + '_' + roads_europe['Country Code']
)
roads_europe = roads_europe.set_index('ID')

In [145]:
# Write the cleaned roads table to the cleaned_data folder.
# The frame is indexed by 'ID', so the index has to be written out: with
# index=False the ID built for each row was silently left out of the csv.
output_file = 'roads_europe.csv'
output_file_path = os.path.join(cleaned_data_path, output_file)

roads_europe.to_csv(output_file_path, index=True, index_label='ID')
print("Data saved to:", output_file_path)

Data saved to: /Users/jennagreene/Documents/GitHub/HATCH_data/cleaned_data/roads_europe.csv
