# Wyjaśnienie jak wyodrębniłam dane

### Wgranie bibliotek i danych

In [2]:
#Biblioteki
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

#Zmienienie formatu zapisu danych numerycznych na dwie cyfry po przecinku.
pd.options.display.float_format = '{:.2f}'.format

#Wczytanie danych
df = pd.read_csv('all_energy_statistics.csv')

### Podział kolumny 'commodity_transaction'

źródło kodu: https://www.kaggle.com/code/gabrielapiwar/cleaning-of-commodities-column/edit

In [3]:
#Podział na trzy kolumny
split_commodities = df.commodity_transaction.str.split(" - | – ",  expand=True)

#Przypisanie nazw kolumn
split_commodities.columns = ["commodity", "transaction_type", "additional_transaction_info"]

#Oczyszczenie stringów w kolumnie transaction_type
split_commodities.transaction_type = split_commodities.transaction_type.str.lower().str.strip()

#Poprawienie błędów w pisowni i formatowaniu
split_commodities.transaction_type = split_commodities.transaction_type.str.replace("transformatin", "transformation")
split_commodities.transaction_type = split_commodities.transaction_type.str.replace("non energy uses", "consumption for non-energy uses")
split_commodities.transaction_type = split_commodities.transaction_type.str.replace(" /", "/")
split_commodities.transaction_type = split_commodities.transaction_type.str.replace("/ ", "/")

#Dodanie trzech nowych kolumn do orginalnych danych
new_df = df.join(split_commodities)

#Usunięcie kolumny 'commodity_transaction'
new_df = new_df.drop(columns = ['commodity_transaction'])

#Zapisanie nowego pliku z danymi
with open("commodity_split_data.csv", "w+") as file:
    file.write(new_df.to_csv())

In [28]:
new_df['category'].unique()

array(['additives_and_oxygenates', 'animal_waste', 'anthracite',
       'aviation_gasoline', 'bagasse', 'biodiesel', 'biogases',
       'biogasoline', 'bitumen', 'black_liquor', 'blast_furnace_gas',
       'brown_coal_briquettes', 'brown_coal', 'charcoal', 'coal_tar',
       'coke_oven_coke', 'coking_coal', 'conventional_crude_oil',
       'direct_use_of_geothermal_heat',
       'direct_use_of_solar_thermal_heat',
       'electricity_net_installed_capacity_of_electric_power_plants',
       'ethane', 'falling_water', 'fuel_oil', 'fuelwood', 'gas_coke',
       'gas_oil_diesel_oil', 'gasoline_type_jet_fuel', 'gasworks_gas',
       'geothermal', 'hard_coal', 'heat', 'hydro', 'industrial_waste',
       'kerosene_type_jet_fuel', 'lignite', 'liquified_petroleum_gas',
       'lubricants', 'motor_gasoline', 'municipal_wastes', 'naphtha',
       'natural_gas_including_lng', 'natural_gas_liquids',
       'nuclear_electricity', 'of_which_biodiesel',
       'of_which_biogasoline', 'oil_shale_oil_sa

### Podgląd unikatowych wartości dla każdej kolumny

In [4]:
#Stworzenie słownika z unikatowymi wartościami w kolumnach 
unique_column_values = {}
keys = list(new_df.columns)
for col in keys:
    unique_column_values[col] = new_df[col].unique() 

#Posortowanie kolumny year żeby było czytelniej
unique_column_values['year'].sort()
    
#wypisanie nazw kolumn   
print(unique_column_values.keys(),"\n")

#print("Unikatowe wartości w kolumnie 'country_or_area':\n{}\n".format(unique_column_values['country_or_area']))
print("Unikatowe wartości w kolumnie 'year':\n{}\n".format(unique_column_values['year']))
print("Unikatowe wartości w kolumnie 'unit':\n{}\n".format(unique_column_values['unit']))
print("Unikatowe wartości w kolumnie 'category':\n{}\n".format(unique_column_values['category']))
print("Unikatowe wartości w kolumnie 'commodity':\n{}\n".format(unique_column_values['commodity']))
print("Unikatowe wartości w kolumnie 'transaction_type':\n{}\n".format(unique_column_values['transaction_type']))
print("Unikatowe wartości w kolumnie 'additional_transaction_info':\n{}\n".format(unique_column_values['additional_transaction_info']))

dict_keys(['country_or_area', 'year', 'unit', 'quantity', 'quantity_footnotes', 'category', 'commodity', 'transaction_type', 'additional_transaction_info']) 

Unikatowe wartości w kolumnie 'year':
[1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014]

Unikatowe wartości w kolumnie 'unit':
['Metric tons,  thousand' 'Terajoules' 'Kilowatts,  thousand'
 'Kilowatt-hours, million' 'Cubic metres, thousand' 'Metric Tons']

Unikatowe wartości w kolumnie 'category':
['additives_and_oxygenates' 'animal_waste' 'anthracite'
 'aviation_gasoline' 'bagasse' 'biodiesel' 'biogases' 'biogasoline'
 'bitumen' 'black_liquor' 'blast_furnace_gas' 'brown_coal_briquettes'
 'brown_coal' 'charcoal' 'coal_tar' 'coke_oven_coke' 'coking_coal'
 'conventional_crude_oil' 'direct_use_of_geothermal_heat'
 'direct_use_of_solar_thermal_heat'
 'electricity_net_installed_capacity_of_electric_power_plants' 'ethane'
 'falling_water' 'fuel_oil' 'fuelwood

### Wyodrębnienie elektryczności z OZE

Zanim przejdę do działania objaśnię mój tok myślowy jak podzieliłam wszystko. Na początek sprawdziłam jednostki 

In [68]:
#Pracujemy tylko na jednym commodity (dane się powtarzają między różnymi commodity, aby uprościć wybieramy tylko jedną kategorię)
eco_df = new_df.loc[new_df['commodity'] == 'Electricity']

In [69]:
#Wypisanie transaction type które nas interesuje z Electricity
eco_transaction_type = ['total geothermal production', 'total hydro production', 'total solar production', 
                        'total tide, wave production', 'total wind production', 'total production, main activity', 
                        'gross production', 'gross demand','total production, autoproducer', 'imports', 'exports']

In [70]:
#Stworzenie nowego datasetu tylko z wybranymi transaction type
eco_df = eco_df.loc[eco_df['transaction_type'].isin(eco_transaction_type)]

In [116]:
#Podział na regiony/kraje które nas interesują
EU_country = ["Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czech Republic", "Denmark", "Estonia",
              "Finland", "France", "Greece", "Spain", "Netherlands", "Ireland", "Lithuania", "Luxembourg",
              "Latvia", "Malta", "Germany", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Sweden", 
              "Hungary", "Italy", "United Kingdom"]

World = ["United States", "China", "India", "Russian Federation"]

In [78]:
#Stworzenie nowego datasetu tylko z wybranymi regionami/krajami
EU_world_df = eco_df.loc[eco_df['country_or_area'].isin(UE_country + World)]
EU_world_df

Unnamed: 0,country_or_area,year,unit,quantity,quantity_footnotes,category,commodity,transaction_type,additional_transaction_info
490921,Austria,2014,"Kilowatt-hours, million",0.00,,geothermal,Electricity,total geothermal production,
490922,Austria,2013,"Kilowatt-hours, million",0.00,,geothermal,Electricity,total geothermal production,
490923,Austria,2012,"Kilowatt-hours, million",1.00,,geothermal,Electricity,total geothermal production,
490924,Austria,2011,"Kilowatt-hours, million",1.00,,geothermal,Electricity,total geothermal production,
490925,Austria,2010,"Kilowatt-hours, million",1.00,,geothermal,Electricity,total geothermal production,
...,...,...,...,...,...,...,...,...,...
1189456,United States,1994,"Kilowatt-hours, million",3483.00,,wind_electricity,Electricity,total wind production,
1189457,United States,1993,"Kilowatt-hours, million",3053.00,,wind_electricity,Electricity,total wind production,
1189458,United States,1992,"Kilowatt-hours, million",2917.00,,wind_electricity,Electricity,total wind production,
1189459,United States,1991,"Kilowatt-hours, million",3051.00,,wind_electricity,Electricity,total wind production,


In [77]:
#sprawdzenie jednostek - dane tylko w kilowatach na godzinę, czyli jednostce energii elektrycznej
EU_world_df['unit'].unique()

array(['Kilowatt-hours, million'], dtype=object)

In [44]:
#sprawdzenie category - 
EU_world_df['category'].unique()

array(['geothermal', 'hydro', 'solar_electricity',
       'tide_wave_and_ocean_electricity', 'total_electricity',
       'wind_electricity'], dtype=object)

In [172]:
#suma energii wyprodukowanej w danym kraju, z podziałem na jej rodzaje (suma dla wszystkich lat lub wybrany rok)

#Wybranie rocznika (można pominąć lub wybrać np. kilka lat)
#EU_world_df = EU_world_df.loc[EU_world_df['year'] == 2014]

#sumowanie energii osobno dla każdego kraju dla wszystkich roczników
sum_energy_df = EU_world_df[['country_or_area', 'quantity', 'transaction_type']].pivot_table(index='country_or_area',
                                                                                columns='transaction_type',
                                                                               aggfunc = np.sum)
#Podgląd danych
#sum_energy_df

Unnamed: 0_level_0,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity
transaction_type,exports,gross demand,gross production,imports,total geothermal production,total hydro production,"total production, autoproducer","total production, main activity",total solar production,"total tide, wave production",total wind production
country_or_area,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Austria,335813.0,1457467.0,1529690.0,390669.0,22.0,986426.0,211171.0,1318519.0,2173.0,,24285.0
Belgium,186606.0,2027313.0,2033493.0,308631.0,,36094.0,85934.0,1947559.0,9622.0,,17803.0
Bulgaria,154439.0,847937.0,1068924.0,50859.0,,81956.0,44333.0,1024591.0,3546.0,,5900.0
China,252116.0,54131715.0,58804770.6,87305.0,,9912320.0,3016994.0,55787776.6,20753.0,,508206.0
Croatia,75306.0,352818.0,276381.0,163821.0,,149325.0,11752.0,264629.0,48.0,,2076.0
Cyprus,,87886.0,92481.0,,,,680.0,91801.0,183.0,,743.0
Denmark,243075.0,864930.0,948103.0,209373.0,,608.0,50967.0,897136.0,1260.0,,125788.0
Estonia,57110.0,172345.0,234777.0,20539.0,,342.0,2896.0,231881.0,,,2776.0
Finland,52419.0,1962318.0,1776119.0,317815.0,,330300.0,274382.0,1501737.0,88.0,,4751.0
France,1633762.0,11102400.0,13289334.0,187359.0,,1679584.0,540310.0,12749024.0,17674.0,12151.0,92526.0


In [174]:
#wartości Nan zamieniamy na 0.0 i usuwamy MultiIndex'owanie
sum_energy_df.fillna(0, inplace=True)
sum_energy_df.columns = sum_energy_df.columns.droplevel()
sum_energy_df.columns.name = None
sum_energy_df.reset_index(inplace=True)

#Podgląd danych
#sum_energy_df

In [186]:
#sumowanie energii z krajów europejskich - stworzenie jednego wiersza EU countries
eu_energy_df = sum_energy_df.loc[sum_energy_df['country_or_area'].isin(EU_country)].sum(numeric_only = True)
eu_energy_df = eu_energy_df.to_frame().transpose()
eu_energy_df['country_or_area'] = 'EU countries'
eu_energy_df

Unnamed: 0,exports,gross demand,gross production,imports,total geothermal production,total hydro production,"total production, autoproducer","total production, main activity",total solar production,"total tide, wave production",total wind production,country_or_area
0,5850341.0,69480288.0,73812933.0,6442371.0,119581.0,8922281.0,5861643.0,67951290.0,347473.0,12167.0,1771314.0,EU countries


In [188]:
#Połączenie EU i World
world_energy_df = sum_energy_df.loc[sum_energy_df['country_or_area'].isin(World)] 

final_energy_df = pd.concat([world_df, eu_energy_df])
final_energy_df.reset_index(inplace=True, drop = True)

#Podgląd danych
final_energy_df

Unnamed: 0,country_or_area,exports,gross demand,gross production,imports,total geothermal production,total hydro production,"total production, autoproducer","total production, main activity",total solar production,"total tide, wave production",total wind production
0,China,252116.0,54131715.0,58804770.6,87305.0,0.0,9912320.0,3016994.0,55787776.6,20753.0,0.0,508206.0
1,India,4147.0,15603930.0,16539859.0,73831.0,0.0,2280019.0,1868703.0,14671156.0,8453.0,0.0,208900.0
2,Russian Federation,580975.0,19907649.0,21847563.0,219004.0,5955.0,3907008.0,1240599.0,20606964.0,160.0,0.0,172.0
3,United States,377445.0,93789000.0,98746617.0,1129039.0,410355.0,7520138.0,5787864.0,92958753.0,81665.0,0.0,1002367.0
4,EU countries,5850341.0,69480288.0,73812933.0,6442371.0,119581.0,8922281.0,5861643.0,67951290.0,347473.0,12167.0,1771314.0


In [189]:
#dodanie kolumny z sumą całej energi z OZE na kraj
sum_electricity_list = ['total geothermal production', 'total hydro production',
                        'total solar production', 'total tide, wave production']

#Skrót RE oznacza Renewable Energy
final_energy_df['total RE production'] = final_energy_df[sum_electricity_list].sum(axis = 1)

#Podgląd danych
#final_energy_df

In [190]:
#dodanie kolumny z procentem jaki stanowi energia odnawialna w produkcji elektrycznosci
final_energy_df['RE production %'] = (final_energy_df['total RE production'] / final_energy_df['gross production'] * 100)

#dodanie kolumny ile % stanowi produkowana elektryczność w stosunku do zapotrzebowania
final_energy_df['RE production to demand %'] = (final_energy_df['total RE production'] / final_energy_df['gross demand'] * 100)

#dodanie kolumny z różnicą w procentach
final_energy_df['sub'] = final_energy_df['RE production to demand %'] - final_energy_df['RE production %']

#Podgląd danych
#final_energy_df

In [191]:
#Ostateczna wersja tabeli - podgląd danych
final_energy_df

Unnamed: 0,country_or_area,exports,gross demand,gross production,imports,total geothermal production,total hydro production,"total production, autoproducer","total production, main activity",total solar production,"total tide, wave production",total wind production,total RE production,RE production %,RE production to demand %,sub
0,China,252116.0,54131715.0,58804770.6,87305.0,0.0,9912320.0,3016994.0,55787776.6,20753.0,0.0,508206.0,9933073.0,16.89,18.35,1.46
1,India,4147.0,15603930.0,16539859.0,73831.0,0.0,2280019.0,1868703.0,14671156.0,8453.0,0.0,208900.0,2288472.0,13.84,14.67,0.83
2,Russian Federation,580975.0,19907649.0,21847563.0,219004.0,5955.0,3907008.0,1240599.0,20606964.0,160.0,0.0,172.0,3913123.0,17.91,19.66,1.75
3,United States,377445.0,93789000.0,98746617.0,1129039.0,410355.0,7520138.0,5787864.0,92958753.0,81665.0,0.0,1002367.0,8012158.0,8.11,8.54,0.43
4,EU countries,5850341.0,69480288.0,73812933.0,6442371.0,119581.0,8922281.0,5861643.0,67951290.0,347473.0,12167.0,1771314.0,9401502.0,12.74,13.53,0.79


In [192]:
#Zapisanie tabeli do pliku csv
with open("sum_eco_energy_data_eu_world.csv", "w+") as file:
    file.write(final_energy_df.to_csv())

### Wyodrębnienie energii z biomasy

In [16]:
#biomass_commodity_list = ['Bagasse', 'Biodiesel', 'Biogases', 'Biogasoline', 'Other liquid biofuels', 'Peat (for fuel use)']
biomass_commodity_list = ['Biodiesel']