In [1]:
#import dependencies
import pandas as pd

In [3]:
#read file
file = "../Resources/master.csv"
data = pd.read_csv(file)
data.head(3)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X


In [4]:
#check column names to see what is needed
data.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides/100k pop', 'country-year', 'HDI for year',
       ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation'],
      dtype='object')

In [5]:
#check number of unique countries in dataset
countries = (data['country'].unique())
len(countries)

101

## Suicide rate table

In [6]:
#create df grouped by year and country
#show suicides per 100k pop to standardize across countries, as opposed to total suicides
by_yearcountry = pd.DataFrame(data.groupby(['year','country']).sum()['suicides/100k pop'])
by_yearcountry = by_yearcountry.unstack(1)
by_yearcountry.head(3)

Unnamed: 0_level_0,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop,suicides/100k pop
country,Albania,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,Austria,Azerbaijan,Bahamas,Bahrain,...,Thailand,Trinidad and Tobago,Turkey,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1985,,0.0,134.47,,,163.41,384.81,,4.76,20.12,...,82.54,30.17,,,,,116.45,184.72,178.61,
1986,,0.0,152.9,,,175.79,401.8,,,,...,71.91,123.13,,,,,112.01,191.65,172.47,
1987,31.85,0.0,144.98,,,188.82,405.88,,13.59,25.68,...,81.82,155.54,,173.61,294.04,,103.81,191.66,164.43,


In [7]:
#There are some null values across the countries; 16 out of the 101 countries have a reported rate
#in the most recent year reported, 2016. For the most recent rate, need to grab the last rate reported for each country.
#Doing this as opposed to averaging out all reported rates for each country, because varying presence of null values
#and varying number of reports may skew the averages for each country.

#create a dictionary to store {country: suicide rate}
country_rates = {}

#loop through each column
for column, index in by_yearcountry.columns:
    #set variable 'last_rate' to grab last rate reported for each country
    last_rate_year = pd.Series.last_valid_index(by_yearcountry[column, index])
    #create new key for country in 'country_rates' dict. Set the rate as the value
    country_rates[index] = (by_yearcountry[column, index][last_rate_year])
#print completed dict and length of dict to confirm all countries were captured
print(country_rates)
len(country_rates)

{'Albania': 41.66, 'Antigua and Barbuda': 15.62, 'Argentina': 112.13000000000001, 'Armenia': 40.56000000000001, 'Aruba': 93.42, 'Australia': 154.18000000000004, 'Austria': 183.23, 'Azerbaijan': 11.830000000000002, 'Bahamas': 13.91, 'Bahrain': 6.5200000000000005, 'Barbados': 0.0, 'Belarus': 252.90999999999997, 'Belgium': 196.66, 'Belize': 135.46, 'Bosnia and Herzegovina': 109.5, 'Brazil': 77.45, 'Bulgaria': 139.77, 'Cabo Verde': 133.84, 'Canada': 133.09, 'Chile': 147.68, 'Colombia': 70.39999999999999, 'Costa Rica': 70.85, 'Croatia': 213.55999999999997, 'Cuba': 194.31, 'Cyprus': 47.14999999999999, 'Czech Republic': 161.76, 'Denmark': 121.42, 'Dominica': 0.0, 'Ecuador': 97.39, 'El Salvador': 101.84, 'Estonia': 205.37999999999997, 'Fiji': 91.58, 'Finland': 161.18999999999997, 'France': 179.56, 'Georgia': 84.68, 'Germany': 150.34, 'Greece': 56.059999999999995, 'Grenada': 0.0, 'Guatemala': 44.85, 'Guyana': 356.38000000000005, 'Hungary': 241.34, 'Iceland': 138.07, 'Ireland': 119.6799999999999

101

In [8]:
#convert dict to df
country_rates_df = pd.DataFrame.from_dict(country_rates,orient='index',columns=['suicides/100k pop'])
country_rates_df.index.name = 'country'

In [9]:
country_rates_df

Unnamed: 0_level_0,suicides/100k pop
country,Unnamed: 1_level_1
Albania,41.66
Antigua and Barbuda,15.62
Argentina,112.13
Armenia,40.56
Aruba,93.42
...,...
United Arab Emirates,22.71
United Kingdom,86.74
United States,175.41
Uruguay,270.02


In [11]:
#export df to csv
country_rates_df.to_csv(r'rates_bycountry_KM.csv')

## GDP table

In [13]:
#narrow down df to make it easier to look at
new_data = data[['country','year','gdp_per_capita ($)']]
new_data.head()

Unnamed: 0,country,year,gdp_per_capita ($)
0,Albania,1987,796
1,Albania,1987,796
2,Albania,1987,796
3,Albania,1987,796
4,Albania,1987,796


In [14]:
#drop duplicate rows
byyear_gdp = new_data.drop_duplicates(keep='last')
#group by country and year and view gdp per capita values
byyear_gdp = pd.DataFrame(byyear_gdp.groupby(['country','year']).sum()['gdp_per_capita ($)'])
byyear_gdp = byyear_gdp.unstack(0)
byyear_gdp.head(3)

Unnamed: 0_level_0,gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($),gdp_per_capita ($)
country,Albania,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,Austria,Azerbaijan,Bahamas,Bahrain,...,Thailand,Trinidad and Tobago,Turkey,Turkmenistan,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1985,,3850.0,3264.0,,,12374.0,9759.0,,11393.0,9980.0,...,840.0,7317.0,,,,,9231.0,19693.0,1729.0,
1986,,4740.0,4072.0,,,12288.0,13911.0,,,,...,911.0,4516.0,,,,,11323.0,20588.0,2132.0,
1987,796.0,5595.0,4026.0,,,12564.0,17415.0,,12286.0,9321.0,...,1045.0,4449.0,,815.0,1353.0,,13996.0,21631.0,2651.0,


### gdp at the time of last recorded suicide rate

In [20]:
#create a dictionary to store {country: suicide rate}
country_gdp = {}

#loop through each column
for value, countryname in byyear_gdp.columns:
    #create new key for country in 'country_gdp' dict. Set the rate as the value
    country_gdp[countryname] = (byyear_gdp[value, countryname][last_rate_year])
#print completed dict and length of dict to confirm all countries were captured
print(country_gdp)
len(country_gdp)

{'Albania': nan, 'Antigua and Barbuda': 14093.0, 'Argentina': 13400.0, 'Armenia': 4142.0, 'Aruba': nan, 'Australia': 66809.0, 'Austria': 54278.0, 'Azerbaijan': nan, 'Bahamas': nan, 'Bahrain': 27503.0, 'Barbados': nan, 'Belarus': 8849.0, 'Belgium': 50173.0, 'Belize': 5448.0, 'Bosnia and Herzegovina': 5079.0, 'Brazil': 12975.0, 'Bulgaria': 8241.0, 'Cabo Verde': nan, 'Canada': nan, 'Chile': 15883.0, 'Colombia': 8591.0, 'Costa Rica': 11485.0, 'Croatia': 14299.0, 'Cuba': 7459.0, 'Cyprus': 29024.0, 'Czech Republic': 20859.0, 'Denmark': 66114.0, 'Dominica': nan, 'Ecuador': 7116.0, 'El Salvador': 3965.0, 'Estonia': 21117.0, 'Fiji': nan, 'Finland': 52832.0, 'France': 47318.0, 'Georgia': 4757.0, 'Germany': 50167.0, 'Greece': 22834.0, 'Grenada': 9456.0, 'Guatemala': 4210.0, 'Guyana': nan, 'Hungary': 14886.0, 'Iceland': 56833.0, 'Ireland': 59634.0, 'Israel': 41869.0, 'Italy': 37035.0, 'Jamaica': nan, 'Japan': 40328.0, 'Kazakhstan': 14351.0, 'Kiribati': nan, 'Kuwait': 43774.0, 'Kyrgyzstan': 1465.0,

101

In [23]:
#convert dict to df
country_gdp_df = pd.DataFrame.from_dict(country_gdp,orient='index',columns=['gdp_per_capita_($)'])
country_gdp_df.index.name = 'country'
country_gdp_df
#contains NaN so may need to use last recorded gdp per capita instead?

Unnamed: 0_level_0,gdp_per_capita_($)
country,Unnamed: 1_level_1
Albania,
Antigua and Barbuda,14093.0
Argentina,13400.0
Armenia,4142.0
Aruba,
...,...
United Arab Emirates,
United Kingdom,49906.0
United States,58531.0
Uruguay,18012.0


### last gdp recorded

In [25]:
#create a dictionary to store {country: suicide rate}
country_gdp_2 = {}

#loop through each column
for value2, countryname2 in byyear_gdp.columns:
    #set variable 'last_rate' to grab last rate reported for each country
    last_gdp_year = pd.Series.last_valid_index(byyear_gdp[value2, countryname2])
    #create new key for country in 'country_rates' dict. Set the rate as the value
    country_gdp_2[countryname2] = (byyear_gdp[value2, countryname2][last_gdp_year])
#print completed dict and length of dict to confirm all countries were captured
print(country_gdp_2)
len(country_gdp_2)

{'Albania': 4359.0, 'Antigua and Barbuda': 14853.0, 'Argentina': 14981.0, 'Armenia': 3788.0, 'Aruba': 27066.0, 'Australia': 60656.0, 'Austria': 46976.0, 'Azerbaijan': 4172.0, 'Bahamas': 30455.0, 'Bahrain': 27503.0, 'Barbados': 17395.0, 'Belarus': 8849.0, 'Belgium': 42830.0, 'Belize': 5561.0, 'Bosnia and Herzegovina': 5079.0, 'Brazil': 9431.0, 'Bulgaria': 8241.0, 'Cabo Verde': 4124.0, 'Canada': 55310.0, 'Chile': 14729.0, 'Colombia': 6552.0, 'Costa Rica': 11485.0, 'Croatia': 12905.0, 'Cuba': 8044.0, 'Cyprus': 25098.0, 'Czech Republic': 19505.0, 'Denmark': 55972.0, 'Dominica': 1485.0, 'Ecuador': 6832.0, 'El Salvador': 3965.0, 'Estonia': 18149.0, 'Fiji': 5073.0, 'Finland': 44862.0, 'France': 47318.0, 'Georgia': 4046.0, 'Germany': 43201.0, 'Greece': 18927.0, 'Grenada': 10838.0, 'Guatemala': 4472.0, 'Guyana': 4372.0, 'Hungary': 13448.0, 'Iceland': 64708.0, 'Ireland': 59634.0, 'Israel': 39793.0, 'Italy': 31537.0, 'Jamaica': 5552.0, 'Japan': 36510.0, 'Kazakhstan': 11784.0, 'Kiribati': 854.0, '

101

In [26]:
#convert dict to df
country_gdp_df2 = pd.DataFrame.from_dict(country_gdp_2,orient='index',columns=['gdp_per_capita_($)'])
country_gdp_df2.index.name = 'country'
country_gdp_df2

Unnamed: 0_level_0,gdp_per_capita_($)
country,Unnamed: 1_level_1
Albania,4359.0
Antigua and Barbuda,14853.0
Argentina,14981.0
Armenia,3788.0
Aruba,27066.0
...,...
United Arab Emirates,36964.0
United Kingdom,47240.0
United States,60387.0
Uruguay,16696.0
