In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests

%matplotlib inline

In [2]:
# creating folder for dataset on machine
folder = 'dataset'

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(folder, exist_ok=True)

In [3]:
# importing the dataset

url = 'https://raw.githubusercontent.com/idowujames/Suicide-Rates-Overview-1985-to-2016/main/master.csv'

# A timeout stops the cell from hanging forever on a stalled connection.
data = requests.get(url, timeout=30)
# Abort on 4xx/5xx so an HTTP error page is never saved as master.csv.
data.raise_for_status()

with open('dataset/master.csv', 'wb') as file:
  file.write(data.content)

## Loading the dataset and viewing it

In [4]:
df = pd.read_csv('dataset/master.csv')

# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line.
df.info()

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             27820 non-null  object 
 1   year                27820 non-null  int64  
 2   sex                 27820 non-null  object 
 3   age                 27820 non-null  object 
 4   suicides_no         27820 non-null  int64  
 5   population          27820 non-null  int64  
 6   suicides/100k pop   27820 non-null  float64
 7   country-year        27820 non-null  object 
 8   HDI for year        8364 non-null   float64
 9    gdp_for_year ($)   27820 non-null  object 
 10  gdp_per_capita ($)  27820 non-null  int64  
 11  generation          27820 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB
None


Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [5]:
# Fixed seed so the previewed rows are the same on every Restart & Run All.
df.sample(n=8, random_state=42)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
23322,South Africa,1998,male,5-14 years,3,5145480,0.06,South Africa1998,,137774361015,3512,Millenials
24711,Sweden,2015,female,25-34 years,48,609532,7.87,Sweden2015,,497918109302,54629,Millenials
24829,Switzerland,2003,female,35-54 years,116,1116103,10.39,Switzerland2003,,352914820747,50639,Boomers
9925,Germany,2007,female,5-14 years,9,3835107,0.23,Germany2007,,3439953462907,43667,Generation Z
1225,Armenia,2014,female,75+ years,6,98118,6.12,Armenia2014,0.733,11609512940,4142,Silent
647,Argentina,1989,female,5-14 years,11,3149000,0.35,Argentina1989,,76636898036,2670,Generation X
5684,Colombia,1989,male,75+ years,17,197086,8.63,Colombia1989,,39540080200,1350,G.I. Generation
339,Antigua and Barbuda,1991,female,5-14 years,0,7089,0.0,Antigua and Barbuda1991,,481706333,7976,Millenials


### Checking which countries are represented in the dataset

In [6]:
# Distinct values of the country column, in order of first appearance.
df['country'].unique()

array(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Barbados', 'Belarus', 'Belgium', 'Belize',
       'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Cabo Verde',
       'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Dominica', 'Ecuador',
       'El Salvador', 'Estonia', 'Fiji', 'Finland', 'France', 'Georgia',
       'Germany', 'Greece', 'Grenada', 'Guatemala', 'Guyana', 'Hungary',
       'Iceland', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan',
       'Kazakhstan', 'Kiribati', 'Kuwait', 'Kyrgyzstan', 'Latvia',
       'Lithuania', 'Luxembourg', 'Macau', 'Maldives', 'Malta',
       'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Netherlands',
       'New Zealand', 'Nicaragua', 'Norway', 'Oman', 'Panama', 'Paraguay',
       'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar',
       'Republic of Korea', 'Romania', '

## Creating a `wrangle` function for the dataset

In [7]:
def wrangle(df):
  """Return a cleaned copy of the raw suicide-rates dataframe.

  The caller's frame is not modified; every change is made on a new frame.

  Steps performed:
    * drop the ``country-year`` column
    * fill missing ``HDI for year`` values with the sentinel ``-999``
    * remove ``,`` separators from `` gdp_for_year ($) `` and convert it to
      a numeric dtype (values that cannot be parsed become NaN)
    * rename ``age``, `` gdp_for_year ($) ``, ``suicides/100k pop``,
      ``HDI for year`` and ``gdp_per_capita ($)`` to shorter names

  Parameters
  ----------
  df : pandas.DataFrame
      Raw frame containing at least the columns named above.

  Returns
  -------
  pandas.DataFrame
      New frame with the cleaned and renamed columns.
  """
  # The raw header carries leading/trailing spaces; keep the exact key.
  gdp_col = ' gdp_for_year ($) '

  # Dropping country-year column for tidiness purposes. drop() returns a new
  # frame, so doing it first keeps later assignments off the caller's df.
  cleaned = df.drop(columns=['country-year'])

  # Filling the missing values of hdi column with -999
  cleaned['HDI for year'] = cleaned['HDI for year'].fillna(-999)

  # Converting gdp for year feature to a numeric dtype; regex=False treats
  # the comma as a literal on every pandas version.
  cleaned[gdp_col] = pd.to_numeric(
      cleaned[gdp_col].str.replace(',', '', regex=False),
      errors='coerce',
  )

  # Renaming columns to shorter, consistent names
  return cleaned.rename(
      columns={
          'age': 'age-group',
          gdp_col: 'gdp_per_year($)',
          'suicides/100k pop': 'suicides/100k',
          'HDI for year': 'hdi_per_year',
          'gdp_per_capita ($)': 'gdp_per_capita($)',
      }
  )


### Making a clean wrangled copy of the dataframe

In [8]:
# Run the wrangle step on the loaded frame and keep the result under its own name.
df_clean = wrangle(df)

In [9]:
# info() prints the summary itself and returns None; calling it directly
# avoids the extra "None" line that print() would add.
df_clean.info()
# Fixed seed so the previewed rows are the same on every Restart & Run All.
df_clean.sample(n=6, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   country            27820 non-null  object 
 1   year               27820 non-null  int64  
 2   sex                27820 non-null  object 
 3   age-group          27820 non-null  object 
 4   suicides_no        27820 non-null  int64  
 5   population         27820 non-null  int64  
 6   suicides/100k      27820 non-null  float64
 7   hdi_per_year       27820 non-null  float64
 8   gdp_per_year($)    27820 non-null  int64  
 9   gdp_per_capita($)  27820 non-null  int64  
 10  generation         27820 non-null  object 
dtypes: float64(2), int64(5), object(4)
memory usage: 2.3+ MB
None


Unnamed: 0,country,year,sex,age-group,suicides_no,population,suicides/100k,hdi_per_year,gdp_per_year($),gdp_per_capita($),generation
22165,Serbia,2014,female,55-74 years,103,974430,10.57,0.771,44210806366,6500,Boomers
4308,Brazil,1996,male,15-24 years,1164,16172705,7.2,-999.0,850426432992,5771,Generation X
13688,Japan,2012,male,75+ years,2262,5783000,39.11,0.888,6203213121334,51379,Silent
1264,Aruba,1995,female,35-54 years,0,12391,0.0,-999.0,1320670391,17949,Boomers
634,Argentina,1988,male,5-14 years,20,3210000,0.62,-999.0,126206817196,4458,Generation X
91,Albania,1996,female,35-54 years,9,362000,2.49,-999.0,3314898292,1127,Boomers
