In [1]:
#import libraries
import pandas as pd
import numpy as np
import os

In [2]:
#create path
path = r'C:\Users\jacob\Documents\Data Analytics\Data Immersion\Achievement 6\Energy_GDP_Project'

In [3]:
#import energy from nuclear (nuc) data
nuc = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'energy_from_nuclear.csv'))

In [4]:
#check output
nuc.head()

Unnamed: 0,country,year,combined key,%_energy_from_nuclear
0,Afghanistan,1990,"Afghanistan, 1990",
1,Afghanistan,1991,"Afghanistan, 1991",
2,Afghanistan,1992,"Afghanistan, 1992",
3,Afghanistan,1993,"Afghanistan, 1993",
4,Afghanistan,1994,"Afghanistan, 1994",


In [5]:
nuc.shape

(6916, 4)

In [6]:
#change % energy from nuclear to % energy_nuclear
nuc.rename(columns = {'%_energy_from_nuclear' : '%_energy_nuclear'}, inplace = True)

In [7]:
#check output
nuc.head()

Unnamed: 0,country,year,combined key,%_energy_nuclear
0,Afghanistan,1990,"Afghanistan, 1990",
1,Afghanistan,1991,"Afghanistan, 1991",
2,Afghanistan,1992,"Afghanistan, 1992",
3,Afghanistan,1993,"Afghanistan, 1993",
4,Afghanistan,1994,"Afghanistan, 1994",


In [8]:
#check for missing values
nuc.isnull().sum()

country                0
year                   0
combined key           0
%_energy_nuclear    2282
dtype: int64

In [9]:
#check for duplicates
nuc.duplicated().value_counts()

False    6916
Name: count, dtype: int64

In [10]:
#descriptive analysis
nuc.describe()

Unnamed: 0,year,%_energy_nuclear
count,6916.0,4634.0
mean,2002.5,6.172183
std,7.500542,13.635725
min,1990.0,0.0
25%,1996.0,0.0
50%,2002.5,0.0
75%,2009.0,3.666252
max,2015.0,87.986221


There are no duplicate values, but there are 2282 missing values in the % of energy produced from nuclear column. this could be due to the fact that the 75% of the entries have less than 4% of their energy from nuclear. Because of this I will imput 0 for all of the missing values so that we don't lose that data.

In [11]:
#group data by nuclear data to make it easier to find missing data
nuc_grouped = nuc.groupby('country', as_index = False)['%_energy_nuclear'].sum()

In [12]:
#check output
nuc_grouped

Unnamed: 0,country,%_energy_nuclear
0,Afghanistan,0.000000
1,Africa Eastern and Southern,98.466993
2,Africa Western and Central,0.000000
3,Albania,0.000000
4,Algeria,0.000000
...,...,...
261,West Bank and Gaza,0.000000
262,World,388.379554
263,"Yemen, Rep.",0.000000
264,Zambia,0.000000


In [13]:
#create a flag for whether or not the weighted average or zero needs to be imputed for missing values
nuc_grouped.loc[nuc_grouped['%_energy_nuclear'] > 0, 'imputed_value'] = 'weighted_avg'
nuc_grouped.loc[nuc_grouped['%_energy_nuclear'] == 0, 'imputed_value'] = 'zero'

  nuc_grouped.loc[nuc_grouped['%_energy_nuclear'] > 0, 'imputed_value'] = 'weighted_avg'


In [14]:
#check flag column
nuc_grouped

Unnamed: 0,country,%_energy_nuclear,imputed_value
0,Afghanistan,0.000000,zero
1,Africa Eastern and Southern,98.466993,weighted_avg
2,Africa Western and Central,0.000000,zero
3,Albania,0.000000,zero
4,Algeria,0.000000,zero
...,...,...,...
261,West Bank and Gaza,0.000000,zero
262,World,388.379554,weighted_avg
263,"Yemen, Rep.",0.000000,zero
264,Zambia,0.000000,zero


In [15]:
#create a new data fram that creates a list for either wieghted average or zero imputations
nuc_grouped_2 = nuc_grouped.groupby('imputed_value')['country'].agg(list).reset_index()

In [16]:
nuc_grouped_2

Unnamed: 0,imputed_value,country
0,weighted_avg,"[Africa Eastern and Southern, Argentina, Armen..."
1,zero,"[Afghanistan, Africa Western and Central, Alba..."


In [17]:
group1_countries = nuc_grouped_2.loc[nuc_grouped_2['imputed_value'] == 'weighted_avg', 'country'].values[0]
group2_countries = nuc_grouped_2.loc[nuc_grouped_2['imputed_value'] == 'zero', 'country'].values[0]

In [18]:
#list for countries that need weighted averages imputed
group1_countries

['Africa Eastern and Southern',
 'Argentina',
 'Armenia',
 'Belgium',
 'Brazil',
 'Bulgaria',
 'Canada',
 'Central Europe and the Baltics',
 'China',
 'Czechia',
 'Early-demographic dividend',
 'East Asia & Pacific',
 'East Asia & Pacific (IDA & IBRD countries)',
 'East Asia & Pacific (excluding high income)',
 'Euro area',
 'Europe & Central Asia',
 'Europe & Central Asia (IDA & IBRD countries)',
 'Europe & Central Asia (excluding high income)',
 'European Union',
 'Finland',
 'Fragile and conflict affected situations',
 'France',
 'Germany',
 'High income',
 'Hungary',
 'IBRD only',
 'IDA & IBRD total',
 'IDA blend',
 'IDA total',
 'India',
 'Iran, Islamic Rep.',
 'Japan',
 'Korea, Rep.',
 'Late-demographic dividend',
 'Latin America & Caribbean',
 'Latin America & Caribbean (excluding high income)',
 'Latin America & the Caribbean (IDA & IBRD countries)',
 'Lithuania',
 'Low & middle income',
 'Lower middle income',
 'Mexico',
 'Middle East & North Africa',
 'Middle East & North Afr

In [19]:
#list of countries that need zero imputed for missing values
group2_countries

['Afghanistan',
 'Africa Western and Central',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Arab World',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas, The',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Caribbean small states',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Channel Islands',
 'Chile',
 'Colombia',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt, Arab Rep.',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Faroe Islands',
 'Fiji',
 'French Polynesia',
 'Gabon',
 'Gambia, The',
 'Ge

In [20]:
nuc_weight = nuc[nuc['country'].isin(group1_countries)]

In [21]:
#check number of countries in weighted average list
nuc_weight['country'].nunique()

68

In [22]:
nuc_zero = nuc[nuc['country'].isin(group2_countries)]

In [23]:
#check number of counties in zero list
nuc_zero['country'].nunique()

198

In [24]:
198 + 68

266

In [25]:
#check for missing vlaues
nuc_zero.isnull().sum()

country                0
year                   0
combined key           0
%_energy_nuclear    2255
dtype: int64

In [26]:
#impute zero for missing values in in this list
nuc_zero_imputed = nuc_zero.fillna(0)

In [27]:
nuc_zero_imputed.isnull().sum()

country             0
year                0
combined key        0
%_energy_nuclear    0
dtype: int64

In [28]:
nuc_zero_imputed

Unnamed: 0,country,year,combined key,%_energy_nuclear
0,Afghanistan,1990,"Afghanistan, 1990",0.0
1,Afghanistan,1991,"Afghanistan, 1991",0.0
2,Afghanistan,1992,"Afghanistan, 1992",0.0
3,Afghanistan,1993,"Afghanistan, 1993",0.0
4,Afghanistan,1994,"Afghanistan, 1994",0.0
...,...,...,...,...
6911,Zimbabwe,2011,"Zimbabwe, 2011",0.0
6912,Zimbabwe,2012,"Zimbabwe, 2012",0.0
6913,Zimbabwe,2013,"Zimbabwe, 2013",0.0
6914,Zimbabwe,2014,"Zimbabwe, 2014",0.0


In [29]:
nuc_zero_imputed.head()

Unnamed: 0,country,year,combined key,%_energy_nuclear
0,Afghanistan,1990,"Afghanistan, 1990",0.0
1,Afghanistan,1991,"Afghanistan, 1991",0.0
2,Afghanistan,1992,"Afghanistan, 1992",0.0
3,Afghanistan,1993,"Afghanistan, 1993",0.0
4,Afghanistan,1994,"Afghanistan, 1994",0.0


In [30]:
nuc_weight.head(30)

Unnamed: 0,country,year,combined key,%_energy_nuclear
26,Africa Eastern and Southern,1990,"Africa Eastern and Southern, 1990",3.866791
27,Africa Eastern and Southern,1991,"Africa Eastern and Southern, 1991",4.14083
28,Africa Eastern and Southern,1992,"Africa Eastern and Southern, 1992",4.25065
29,Africa Eastern and Southern,1993,"Africa Eastern and Southern, 1993",3.201404
30,Africa Eastern and Southern,1994,"Africa Eastern and Southern, 1994",4.149434
31,Africa Eastern and Southern,1995,"Africa Eastern and Southern, 1995",4.67902
32,Africa Eastern and Southern,1996,"Africa Eastern and Southern, 1996",4.6039
33,Africa Eastern and Southern,1997,"Africa Eastern and Southern, 1997",4.768023
34,Africa Eastern and Southern,1998,"Africa Eastern and Southern, 1998",5.108086
35,Africa Eastern and Southern,1999,"Africa Eastern and Southern, 1999",4.816205


In [31]:
nuc_weight.isnull().sum()

country              0
year                 0
combined key         0
%_energy_nuclear    27
dtype: int64

In [32]:
#created a weighted average column to use for final missing values
nuc_weight['Weighted_Avg'] = nuc_weight.groupby('country')['%_energy_nuclear'].transform(lambda x: x.fillna(x.mean()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nuc_weight['Weighted_Avg'] = nuc_weight.groupby('country')['%_energy_nuclear'].transform(lambda x: x.fillna(x.mean()))


In [33]:
nuc_weight.head(150)

Unnamed: 0,country,year,combined key,%_energy_nuclear,Weighted_Avg
26,Africa Eastern and Southern,1990,"Africa Eastern and Southern, 1990",3.866791,3.866791
27,Africa Eastern and Southern,1991,"Africa Eastern and Southern, 1991",4.140830,4.140830
28,Africa Eastern and Southern,1992,"Africa Eastern and Southern, 1992",4.250650,4.250650
29,Africa Eastern and Southern,1993,"Africa Eastern and Southern, 1993",3.201404,3.201404
30,Africa Eastern and Southern,1994,"Africa Eastern and Southern, 1994",4.149434,4.149434
...,...,...,...,...,...
847,Bulgaria,2005,"Bulgaria, 2005",42.420176,42.420176
848,Bulgaria,2006,"Bulgaria, 2006",42.839875,42.839875
849,Bulgaria,2007,"Bulgaria, 2007",34.103454,34.103454
850,Bulgaria,2008,"Bulgaria, 2008",35.360219,35.360219


In [34]:
27 + 2255

2282

In [35]:
nuc_weight.tail(20)

Unnamed: 0,country,year,combined key,%_energy_nuclear,Weighted_Avg
6818,World,1996,"World, 1996",17.583602,17.583602
6819,World,1997,"World, 1997",17.046995,17.046995
6820,World,1998,"World, 1998",16.975507,16.975507
6821,World,1999,"World, 1999",17.084883,17.084883
6822,World,2000,"World, 2000",16.703651,16.703651
6823,World,2001,"World, 2001",16.877983,16.877983
6824,World,2002,"World, 2002",16.384221,16.384221
6825,World,2003,"World, 2003",15.656159,15.656159
6826,World,2004,"World, 2004",15.551745,15.551745
6827,World,2005,"World, 2005",15.071328,15.071328


In [36]:
#drop %_energy_nuclear column with missing values
nuc_weight_imputed = nuc_weight.drop(columns = ['%_energy_nuclear'])

In [37]:
nuc_weight_imputed.head()

Unnamed: 0,country,year,combined key,Weighted_Avg
26,Africa Eastern and Southern,1990,"Africa Eastern and Southern, 1990",3.866791
27,Africa Eastern and Southern,1991,"Africa Eastern and Southern, 1991",4.14083
28,Africa Eastern and Southern,1992,"Africa Eastern and Southern, 1992",4.25065
29,Africa Eastern and Southern,1993,"Africa Eastern and Southern, 1993",3.201404
30,Africa Eastern and Southern,1994,"Africa Eastern and Southern, 1994",4.149434


In [38]:
#change weighted_avgerage column to new %_energy_nuclear column
nuc_weight_imputed_2 = nuc_weight_imputed.rename(columns = {'Weighted_Avg' : '%_energy_nuclear'})

In [39]:
nuc_weight_imputed_2.head()

Unnamed: 0,country,year,combined key,%_energy_nuclear
26,Africa Eastern and Southern,1990,"Africa Eastern and Southern, 1990",3.866791
27,Africa Eastern and Southern,1991,"Africa Eastern and Southern, 1991",4.14083
28,Africa Eastern and Southern,1992,"Africa Eastern and Southern, 1992",4.25065
29,Africa Eastern and Southern,1993,"Africa Eastern and Southern, 1993",3.201404
30,Africa Eastern and Southern,1994,"Africa Eastern and Southern, 1994",4.149434


In [40]:
#check for missing values
nuc_weight_imputed_2.isnull().sum()

country             0
year                0
combined key        0
%_energy_nuclear    0
dtype: int64

In [41]:
#concate list nuc_weight_imputed on nuc_zero_imputed
nuc_concated = pd.concat([nuc_weight_imputed_2,nuc_zero_imputed], ignore_index = True)

In [42]:
#check shape
nuc_concated.shape

(6916, 4)

In [43]:
#check for missing values
nuc_concated.isnull().sum()

country             0
year                0
combined key        0
%_energy_nuclear    0
dtype: int64

In [44]:
#check data
nuc_concated.head(150)

Unnamed: 0,country,year,combined key,%_energy_nuclear
0,Africa Eastern and Southern,1990,"Africa Eastern and Southern, 1990",3.866791
1,Africa Eastern and Southern,1991,"Africa Eastern and Southern, 1991",4.140830
2,Africa Eastern and Southern,1992,"Africa Eastern and Southern, 1992",4.250650
3,Africa Eastern and Southern,1993,"Africa Eastern and Southern, 1993",3.201404
4,Africa Eastern and Southern,1994,"Africa Eastern and Southern, 1994",4.149434
...,...,...,...,...
145,Bulgaria,2005,"Bulgaria, 2005",42.420176
146,Bulgaria,2006,"Bulgaria, 2006",42.839875
147,Bulgaria,2007,"Bulgaria, 2007",34.103454
148,Bulgaria,2008,"Bulgaria, 2008",35.360219


In [45]:
#sort data aphabetically and then by year
nuc_sorted = nuc_concated.sort_values(by = ['country', 'year']).reset_index(drop = True)

In [46]:
#check output
nuc_sorted

Unnamed: 0,country,year,combined key,%_energy_nuclear
0,Afghanistan,1990,"Afghanistan, 1990",0.0
1,Afghanistan,1991,"Afghanistan, 1991",0.0
2,Afghanistan,1992,"Afghanistan, 1992",0.0
3,Afghanistan,1993,"Afghanistan, 1993",0.0
4,Afghanistan,1994,"Afghanistan, 1994",0.0
...,...,...,...,...
6911,Zimbabwe,2011,"Zimbabwe, 2011",0.0
6912,Zimbabwe,2012,"Zimbabwe, 2012",0.0
6913,Zimbabwe,2013,"Zimbabwe, 2013",0.0
6914,Zimbabwe,2014,"Zimbabwe, 2014",0.0


In [47]:
nuc_sorted.isnull().sum()

country             0
year                0
combined key        0
%_energy_nuclear    0
dtype: int64

In [48]:
#check descriptive stats
nuc_sorted.describe()

Unnamed: 0,year,%_energy_nuclear
count,6916.0,6916.0
mean,2002.5,4.171182
std,7.500542,11.571857
min,1990.0,0.0
25%,1996.0,0.0
50%,2002.5,0.0
75%,2009.0,0.0
max,2015.0,87.986221


In [49]:
nuc_sorted.shape

(6916, 4)

In [51]:
#export wrangled energy nuclear data
nuc_sorted.to_csv(os.path.join(path, '02 data', 'prepared data', 'energy_from_nuclear_wrangled.csv'))