In [72]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf

import heapq

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.datasets.samples_generator import make_blobs
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

from scipy.spatial import distance
import warnings


In [73]:
df = pd.read_csv('./Notebooks/Datasets/master.csv',encoding='latin-1')
print(df.head())

   country  year     sex          age  suicides_no  population  \
0  Albania  1987    male  15-24 years           21      312900   
1  Albania  1987    male  35-54 years           16      308000   
2  Albania  1987  female  15-24 years           14      289700   
3  Albania  1987    male    75+ years            1       21800   
4  Albania  1987    male  25-34 years            9      274300   

   suicides/100k pop country-year  HDI for year  gdp_for_year ($)   \
0               6.71  Albania1987           NaN      2,156,624,900   
1               5.19  Albania1987           NaN      2,156,624,900   
2               4.83  Albania1987           NaN      2,156,624,900   
3               4.59  Albania1987           NaN      2,156,624,900   
4               3.28  Albania1987           NaN      2,156,624,900   

   gdp_per_capita ($)       generation  
0                 796     Generation X  
1                 796           Silent  
2                 796     Generation X  
3                 

In [74]:
df.describe()

Unnamed: 0,year,suicides_no,population,suicides/100k pop,HDI for year,gdp_per_capita ($)
count,27820.0,27820.0,27820.0,27820.0,8364.0,27820.0
mean,2001.258375,242.574407,1844794.0,12.816097,0.776601,16866.464414
std,8.469055,902.047917,3911779.0,18.961511,0.093367,18887.576472
min,1985.0,0.0,278.0,0.0,0.483,251.0
25%,1995.0,3.0,97498.5,0.92,0.713,3447.0
50%,2002.0,25.0,430150.0,5.99,0.779,9372.0
75%,2008.0,131.0,1486143.0,16.62,0.855,24874.0
max,2016.0,22338.0,43805210.0,224.97,0.944,126352.0


In [75]:
df.country.nunique()

101

In [76]:
df.country.unique()

array(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Barbados', 'Belarus', 'Belgium', 'Belize',
       'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Cabo Verde',
       'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Croatia', 'Cuba',
       'Cyprus', 'Czech Republic', 'Denmark', 'Dominica', 'Ecuador',
       'El Salvador', 'Estonia', 'Fiji', 'Finland', 'France', 'Georgia',
       'Germany', 'Greece', 'Grenada', 'Guatemala', 'Guyana', 'Hungary',
       'Iceland', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan',
       'Kazakhstan', 'Kiribati', 'Kuwait', 'Kyrgyzstan', 'Latvia',
       'Lithuania', 'Luxembourg', 'Macau', 'Maldives', 'Malta',
       'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Netherlands',
       'New Zealand', 'Nicaragua', 'Norway', 'Oman', 'Panama', 'Paraguay',
       'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar',
       'Republic of Korea', 'Romania', '

In [77]:
df['sex'].count()

27820

In [78]:
df[df['sex']=='female'].count()

country               13910
year                  13910
sex                   13910
age                   13910
suicides_no           13910
population            13910
suicides/100k pop     13910
country-year          13910
HDI for year           4182
 gdp_for_year ($)     13910
gdp_per_capita ($)    13910
generation            13910
dtype: int64

In [79]:
df[df['sex']=='male'].count()

country               13910
year                  13910
sex                   13910
age                   13910
suicides_no           13910
population            13910
suicides/100k pop     13910
country-year          13910
HDI for year           4182
 gdp_for_year ($)     13910
gdp_per_capita ($)    13910
generation            13910
dtype: int64

In [80]:
13910+13910

27820

### so there's an even split between men and women?? seems fishy...

In [81]:
df['year'].nunique()

32

In [82]:
df['year'].unique()

array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016])

In [83]:
df['age'].unique()

array(['15-24 years', '35-54 years', '75+ years', '25-34 years',
       '55-74 years', '5-14 years'], dtype=object)

In [84]:
df['generation'].unique()

array(['Generation X', 'Silent', 'G.I. Generation', 'Boomers',
       'Millenials', 'Generation Z'], dtype=object)

Greatest Generation aka G.I. Generation == born early 1900s to mid to late 1920s.

Silent Generation == born mid-to-late 1920s to early-to-mid 1940s

Baby boomers == born from 1946 to 1964

Generation X == born from early-to-mid 1960s to the early 1980s

Millennials == born from early 1980s to early 2000s

Generation Z == born from mid-1990s to mid-2000s

In [85]:
df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
5,Albania,1987,female,75+ years,1,35600,2.81,Albania1987,,2156624900,796,G.I. Generation
6,Albania,1987,female,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,Silent
7,Albania,1987,female,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,Boomers
8,Albania,1987,male,55-74 years,1,137500,0.73,Albania1987,,2156624900,796,G.I. Generation
9,Albania,1987,female,5-14 years,0,311000,0.00,Albania1987,,2156624900,796,Generation X


In [86]:
total_suicides = 0
for number in df['suicides_no']:
    total_suicides += number
print(total_suicides)    

6748420


### the total number of suicides for all countries from 1987 to 2016 is 6,748,420.

In [87]:
num_suicides_per_country = []

current_country = "Albania"
count = 0
for i, country in enumerate(df['country']):
    if current_country == country:
        count += df["suicides_no"][i]
    else:
        num_suicides_per_country.append((current_country,count))
        count = df["suicides_no"][i]
        current_country = country
        
print(num_suicides_per_country)

count = 0
for country, suicide_sum in num_suicides_per_country:
    count+= suicide_sum

print(count)
        
        

[('Albania', 1970), ('Antigua and Barbuda', 11), ('Argentina', 82219), ('Armenia', 1905), ('Aruba', 101), ('Australia', 70111), ('Austria', 50073), ('Azerbaijan', 1656), ('Bahamas', 93), ('Bahrain', 463), ('Barbados', 177), ('Belarus', 59892), ('Belgium', 62761), ('Belize', 348), ('Bosnia and Herzegovina', 318), ('Brazil', 226613), ('Bulgaria', 36388), ('Cabo Verde', 42), ('Canada', 107561), ('Chile', 40895), ('Colombia', 53080), ('Costa Rica', 6792), ('Croatia', 18429), ('Cuba', 41418), ('Cyprus', 412), ('Czech Republic', 43687), ('Denmark', 15297), ('Dominica', 0), ('Ecuador', 20660), ('El Salvador', 11683), ('Estonia', 7034), ('Fiji', 304), ('Finland', 33677), ('France', 329127), ('Georgia', 3224), ('Germany', 291262), ('Greece', 12368), ('Grenada', 38), ('Guatemala', 8149), ('Guyana', 3426), ('Hungary', 73891), ('Iceland', 1108), ('Ireland', 12574), ('Israel', 11294), ('Italy', 132060), ('Jamaica', 184), ('Japan', 806902), ('Kazakhstan', 101546), ('Kiribati', 53), ('Kuwait', 966), 

### total number of suicides found above (in cell 61) == 6,748,420

### total number of suicides found above (in cell 68) == 6,713,617

### similar numbers, but slightly different. where is the discrepency coming from?

### 6,748,420 != 6,713,617

In [88]:
max_suicides = [None, 0]
for country, suicide_sum in num_suicides_per_country:
    max_suicides[1] = max(suicide_sum, max_suicides[1])
    if max_suicides[1] == suicide_sum:
        max_suicides[0] = country

print(max_suicides)

['Russian Federation', 1209742]


### The "Russian Federation" has the highest rate of suicides from 1987 to 2016, with 1,209,742 reported suicides.

In [93]:
suicide_lookup_dictionary = {}
array_for_heap = []

for country, suicide_sum in num_suicides_per_country:
    suicide_lookup_dictionary[suicide_sum] = country
    array_for_heap.append(suicide_sum)
    
heapq.heapify(array_for_heap)
five_most_suicidal_countries = heapq.nlargest(5,array_for_heap)

for i, suicide_number in enumerate(five_most_suicidal_countries):
    print(str(suicide_lookup_dictionary[suicide_number]) + " had " + str(suicide_number))

Russian Federation had 1209742
United States had 1034013
Japan had 806902
France had 329127
Ukraine had 319950


## the five most suicidal countries

### Russian Federation had 1,209,742

### United States had 1,034,013

### Japan had 806,902

### France had 329,127

### Ukraine had 319,950

In [94]:
suicide_lookup_dictionary = {}
array_for_heap = []

for country, suicide_sum in num_suicides_per_country:
    suicide_sum = suicide_sum * -1
    suicide_lookup_dictionary[suicide_sum] = country
    array_for_heap.append(suicide_sum)
    
heapq.heapify(array_for_heap)
five_least_suicidal_countries = heapq.nlargest(5,array_for_heap)

for i, suicide_number in enumerate(five_least_suicidal_countries):
    print(str(suicide_lookup_dictionary[suicide_number]) + " had " + str(-1*suicide_number))

Saint Kitts and Nevis had 0
Saint Kitts and Nevis had 0
San Marino had 4
Antigua and Barbuda had 11
Maldives had 20


### Saint Kitts and Nevis is the least suicidal nation with 0 reported suicides.