In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Read in the CSV files needed

classification_df = pd.read_csv("data_sources/IMF_data.csv", encoding='utf-8')

dataset_df = pd.read_csv("output_data/data_06_to_2015.csv", encoding='utf-8')

In [3]:
# Check the dtypes of each dataframe before merge
classification_df.dtypes

country                   object
country classification    object
dtype: object

In [4]:
dataset_df.dtypes

country                                     object
year                                         int64
number of suicides (per 100,000 people)    float64
health expenditure (per capita)            float64
dtype: object

In [5]:
# Change the dtypes that we need
classification_df['country'].astype('string')


0      Australia
1        Austria
2        Belgium
3         Canada
4         Cyprus
         ...    
189    Venezuela
190      Vietnam
191        Yemen
192       Zambia
193     Zimbabwe
Name: country, Length: 194, dtype: string

In [6]:
classification_df['country classification'].astype('string')

0       Developed Countries
1       Developed Countries
2       Developed Countries
3       Developed Countries
4       Developed Countries
               ...         
189    Developing Countries
190    Developing Countries
191    Developing Countries
192    Developing Countries
193    Developing Countries
Name: country classification, Length: 194, dtype: string

In [7]:
dataset_df['country'].astype('string')

0           Argentina
1           Argentina
2           Argentina
3           Argentina
4           Argentina
            ...      
505    United Kingdom
506    United Kingdom
507    United Kingdom
508    United Kingdom
509    United Kingdom
Name: country, Length: 510, dtype: string

In [8]:
# Replace Russian Federation with Russia
dataset_df.replace(to_replace='Russian Federation', value='Russia', inplace=True)

In [9]:
# Confirm that Russia was replaced correctly
dataset_df['country'].unique()

array(['Argentina', 'Armenia', 'Australia', 'Austria', 'Belgium',
       'Belize', 'Brazil', 'Brunei Darussalam', 'Chile', 'Colombia',
       'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark',
       'Ecuador', 'Estonia', 'Finland', 'Germany', 'Greece', 'Guatemala',
       'Hungary', 'Iceland', 'Israel', 'Italy', 'Japan', 'Kazakhstan',
       'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Mauritius',
       'Mexico', 'Netherlands', 'Norway', 'Panama', 'Poland', 'Qatar',
       'Romania', 'Russia', 'Serbia', 'Seychelles', 'Singapore',
       'Slovenia', 'South Africa', 'Spain', 'Sweden', 'Switzerland',
       'Thailand', 'Turkmenistan', 'United Kingdom'], dtype=object)

In [10]:
# Merge the datasets to make a new dataframe with classifications added to the dataset_df
classified_data_df = pd.merge(dataset_df, classification_df, on='country', how='left')

In [11]:
classified_data_df

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita),country classification
0,Argentina,2006,8.652830,447.361176,Developing Countries
1,Argentina,2007,8.112240,551.750000,Developing Countries
2,Argentina,2008,8.327544,694.682434,Developing Countries
3,Argentina,2009,7.761451,742.843018,Developing Countries
4,Argentina,2010,7.831615,891.137756,Developing Countries
...,...,...,...,...,...
505,United Kingdom,2011,7.460703,3501.949707,Developed Countries
506,United Kingdom,2012,7.442810,3492.889648,Developed Countries
507,United Kingdom,2013,8.027719,4207.887695,Developed Countries
508,United Kingdom,2014,7.904841,4601.137207,Developed Countries


In [14]:
# Drop null values from the dataset
classified_data_df = classified_data_df.dropna(how='any')

In [16]:
# Note that Cuba is now dropped because it is not an IMF member
classified_data_df['country'].unique()

array(['Argentina', 'Armenia', 'Australia', 'Austria', 'Belgium',
       'Belize', 'Brazil', 'Brunei Darussalam', 'Chile', 'Colombia',
       'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Ecuador',
       'Estonia', 'Finland', 'Germany', 'Greece', 'Guatemala', 'Hungary',
       'Iceland', 'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Latvia',
       'Lithuania', 'Luxembourg', 'Malta', 'Mauritius', 'Mexico',
       'Netherlands', 'Norway', 'Panama', 'Poland', 'Qatar', 'Romania',
       'Russia', 'Serbia', 'Seychelles', 'Singapore', 'Slovenia',
       'South Africa', 'Spain', 'Sweden', 'Switzerland', 'Thailand',
       'Turkmenistan', 'United Kingdom'], dtype=object)

In [18]:
grouped_data = classified_data_df.groupby(classified_data_df['country classification'])


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000272955550B8>

In [19]:
# Create a dataframe for developed countries
developed_df = grouped_data.get_group("Developed Countries")
developed_df

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita),country classification
20,Australia,2006,11.078407,3177.460693,Developed Countries
21,Australia,2007,11.294980,3794.251709,Developed Countries
22,Australia,2008,11.635008,4088.778076,Developed Countries
23,Australia,2009,11.383720,3997.537109,Developed Countries
24,Australia,2010,11.608080,4952.777344,Developed Countries
...,...,...,...,...,...
505,United Kingdom,2011,7.460703,3501.949707,Developed Countries
506,United Kingdom,2012,7.442810,3492.889648,Developed Countries
507,United Kingdom,2013,8.027719,4207.887695,Developed Countries
508,United Kingdom,2014,7.904841,4601.137207,Developed Countries


In [20]:
# Create a dataframe for developing countries
developing_df = grouped_data.get_group("Developing Countries")
developing_df

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita),country classification
0,Argentina,2006,8.652830,447.361176,Developing Countries
1,Argentina,2007,8.112240,551.750000,Developing Countries
2,Argentina,2008,8.327544,694.682434,Developing Countries
3,Argentina,2009,7.761451,742.843018,Developing Countries
4,Argentina,2010,7.831615,891.137756,Developing Countries
...,...,...,...,...,...
495,Turkmenistan,2011,2.810145,276.651703,Developing Countries
496,Turkmenistan,2012,2.942394,318.807312,Developing Countries
497,Turkmenistan,2013,2.127187,376.223877,Developing Countries
498,Turkmenistan,2014,2.826722,437.752808,Developing Countries


In [21]:
# Export the DataFrame's into CSV's

# Export the data with classified countries into a CSV
classified_data_df.to_csv("output_data/complete_dataset_classified.csv", index=False)

# Export the developed countries into a CSV
developed_df.to_csv("output_data/developed_countries.csv", index=False)

# Export the developing countries into a CSV
developing_df.to_csv("output_data/developing_countries.csv", index=False)