In [36]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np


In [37]:
# Load the three prepared CSV extracts used by the analysis:
# the full 2006-2015 dataset plus the developed / developing subsets.
total_dataset_df, developed_df, developing_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "output_data/data_06_to_2015.csv",
        "output_data/developed_countries.csv",
        "output_data/developing_countries.csv",
    )
)

In [38]:
# Display the combined dataset read from output_data/data_06_to_2015.csv.
# A bare expression on the last line of a cell is rendered as the cell's output.
total_dataset_df

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita)
0,Argentina,2006,8.652830,447.361176
1,Argentina,2007,8.112240,551.750000
2,Argentina,2008,8.327544,694.682434
3,Argentina,2009,7.761451,742.843018
4,Argentina,2010,7.831615,891.137756
...,...,...,...,...
505,United Kingdom,2011,7.460703,3501.949707
506,United Kingdom,2012,7.442810,3492.889648
507,United Kingdom,2013,8.027719,4207.887695
508,United Kingdom,2014,7.904841,4601.137207


In [39]:
# Display the subset read from output_data/developed_countries.csv.
# A bare expression on the last line of a cell is rendered as the cell's output.
developed_df

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita),country classification
0,Australia,2006,11.078407,3177.460693,Developed Countries
1,Australia,2007,11.294980,3794.251709,Developed Countries
2,Australia,2008,11.635008,4088.778076,Developed Countries
3,Australia,2009,11.383720,3997.537109,Developed Countries
4,Australia,2010,11.608080,4952.777344,Developed Countries
...,...,...,...,...,...
255,United Kingdom,2011,7.460703,3501.949707,Developed Countries
256,United Kingdom,2012,7.442810,3492.889648,Developed Countries
257,United Kingdom,2013,8.027719,4207.887695,Developed Countries
258,United Kingdom,2014,7.904841,4601.137207,Developed Countries


In [40]:
# Display the subset read from output_data/developing_countries.csv.
# A bare expression on the last line of a cell is rendered as the cell's output.
developing_df

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita),country classification
0,Argentina,2006,8.652830,447.361176,Developing Countries
1,Argentina,2007,8.112240,551.750000,Developing Countries
2,Argentina,2008,8.327544,694.682434,Developing Countries
3,Argentina,2009,7.761451,742.843018,Developing Countries
4,Argentina,2010,7.831615,891.137756,Developing Countries
...,...,...,...,...,...
235,Turkmenistan,2011,2.810145,276.651703,Developing Countries
236,Turkmenistan,2012,2.942394,318.807312,Developing Countries
237,Turkmenistan,2013,2.127187,376.223877,Developing Countries
238,Turkmenistan,2014,2.826722,437.752808,Developing Countries


In [41]:
# Per-country averages over every year in the full dataset.
# Each result is a Series indexed by country name.
country_suicide_means, country_health_expenditure_means = (
    total_dataset_df.groupby("country")[column].mean()
    for column in (
        "number of suicides (per 100,000 people)",
        "health expenditure (per capita)",
    )
)

In [42]:
# Generate a scatter plot of average health expenditure vs suicides over 2006-2015
%matplotlib widget
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.title("Average Health Expenditure per Capita vs Suicide Rate for each country",fontsize =14)
plt.scatter(country_health_expenditure_means, country_suicide_means, s=50,color="blue")
plt.xlabel('Average Health Expenditure per Capita ($USD)',fontsize =12)
plt.ylabel('Number of Suicides (per 100,000 people)',fontsize =12)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [43]:
# Least-squares linear fit of average suicide rate (y) on average
# health expenditure per capita (x), one observation per country.
(
    ove_slope,
    ove_int,
    ove_r,
    ove_p,
    ove_std_err,
) = st.linregress(
    x=country_health_expenditure_means,
    y=country_suicide_means,
)

In [44]:
# Evaluate the fitted line (slope * x + intercept) at each country's average
# expenditure to obtain the predicted average suicide rates.
ove_fit = country_health_expenditure_means.mul(ove_slope).add(ove_int)

In [78]:
%matplotlib widget
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.title("Average Health Expenditure per Capita vs Suicide Rate for each country",fontsize =14)
line_eq = "y = " + str(round(ove_slope,2)) + "x + " + str(round(ove_int,2))
plt.scatter(country_health_expenditure_means, country_suicide_means, s=50,color="blue")
plt.plot(country_health_expenditure_means, ove_fit, color="red")
plt.annotate(line_eq,(5000,20),fontsize=15,color="red")
plt.xlabel('Average Health Expenditure per Capita ($USD)',fontsize =12)
plt.ylabel('Number of Suicides (per 100,000 people)',fontsize =12)
print(f"The r is: {ove_r}")
print(f"The r-squared is: {ove_r**2}")
plt.show()
plt.savefig("images/aveexpvsavesr_overall.png")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

The r is: 0.1315351863393518
The r-squared is: 0.017301505245328


The above scatterplot and regression analysis show that, based on the mean values of health expenditure per capita and number of suicides, there is almost no correlation. The r-squared value of ~0.0173 means that health expenditure explains only ~1.73% of the variation in number of suicides; the remaining ~98.27% is explained by variables other than health expenditure.

In [58]:
# Single-year regression, first year of the dataset:
# keep only the 2006 rows, then preview the first few of them.
data06_df = total_dataset_df.loc[total_dataset_df["year"].eq(2006)]
data06_df.head()

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita)
0,Argentina,2006,8.65283,447.361176
10,Armenia,2006,2.821593,126.244705
20,Australia,2006,11.078407,3177.460693
30,Austria,2006,16.481836,3879.533203
40,Belgium,2006,19.422442,3474.425781


In [59]:
# Per-country values for 2006, as Series indexed by country name
# (the mean is taken over the 2006 rows only).
country_suicide_means_06, country_health_expenditure_means_06 = (
    data06_df.groupby("country")[column].mean()
    for column in (
        "number of suicides (per 100,000 people)",
        "health expenditure (per capita)",
    )
)

In [60]:
# Least-squares linear fit for 2006: suicide rate (y) on health expenditure (x).
(
    slope_06,
    int_06,
    r_06,
    p_06,
    std_err_06,
) = st.linregress(
    x=country_health_expenditure_means_06,
    y=country_suicide_means_06,
)

In [61]:
# Evaluate the 2006 fitted line (slope * x + intercept) at each country's
# expenditure to obtain the predicted suicide rates.
fit_06 = country_health_expenditure_means_06.mul(slope_06).add(int_06)

In [77]:
# Scatterplot for 2006 and the line of best fit
%matplotlib widget
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.title("Average Health Expenditure per Capita vs Suicide Rate for each country (2006)",fontsize=14)
line_eq = "y = " + str(round(slope_06,2)) + "x + " + str(round(int_06,2))
plt.scatter(country_health_expenditure_means_06, country_suicide_means_06, s=50,color="blue")
plt.plot(country_health_expenditure_means_06, fit_06, color="red")
plt.annotate(line_eq,(1500, 15),fontsize=15, color="red")
plt.xlabel('Average Health Expenditure per Capita ($USD)',fontsize =12)
plt.ylabel('Number of Suicides (per 100,000 people)',fontsize =12)
print(f"The r is: {r_06}")
print(f"The r-squared is: {r_06**2}")
plt.show()
plt.savefig("images/aveexpvavesr_06.png")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

The r is: 0.09232090019637891
The r-squared is: 0.008523148613069756


The above scatterplot and regression analysis show that, based on each country's 2006 values of health expenditure per capita and number of suicides, there is almost no correlation. The r-squared value of ~0.009 means that health expenditure explains less than 1% of the variation in number of suicides; the remaining ~99.1% is explained by variables other than health expenditure.

In [63]:
# Single-year regression, last year of the dataset:
# keep only the 2015 rows, then preview the first few of them.
data15_df = total_dataset_df.loc[total_dataset_df["year"].eq(2015)]
data15_df.head()

Unnamed: 0,country,year,"number of suicides (per 100,000 people)",health expenditure (per capita)
9,Argentina,2015,7.740627,1305.39502
19,Armenia,2015,2.647268,366.049225
29,Australia,2015,13.610131,4887.800781
39,Austria,2015,15.220115,4610.966797
49,Belgium,2015,17.57283,4171.053223


In [64]:
# Per-country values for 2015, as Series indexed by country name
# (the mean is taken over the 2015 rows only).
country_suicide_means_15, country_health_expenditure_means_15 = (
    data15_df.groupby("country")[column].mean()
    for column in (
        "number of suicides (per 100,000 people)",
        "health expenditure (per capita)",
    )
)

In [65]:
# Least-squares linear fit for 2015: suicide rate (y) on health expenditure (x).
(
    slope_15,
    int_15,
    r_15,
    p_15,
    std_err_15,
) = st.linregress(
    x=country_health_expenditure_means_15,
    y=country_suicide_means_15,
)

In [66]:
# Evaluate the 2015 fitted line (slope * x + intercept) at each country's
# expenditure to obtain the predicted suicide rates.
fit_15 = country_health_expenditure_means_15.mul(slope_15).add(int_15)

In [72]:
# Scatterplot for 2015 and the line of best fit
%matplotlib widget
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.title("Average Health Expenditure per Capita vs Suicide Rate for each country (2015)",fontsize=14)
line_eq = "y = " + str(round(slope_15,2)) + "x + " + str(round(int_15,2))
plt.scatter(country_health_expenditure_means_15, country_suicide_means_15, s=50,color="blue")
plt.plot(country_health_expenditure_means_15, fit_15, color='red')
plt.annotate(line_eq,(1500, 15),fontsize=15,color="red")
plt.xlabel('Average Health Expenditure per Capita ($USD)',fontsize =12)
plt.ylabel('Number of Suicides (per 100,000 people)',fontsize =12)
print(f"The r is: {r_15}")
print(f"The r-squared is: {r_15**2}")
plt.show()
plt.savefig("images/aveexpvavesr_15.png")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

The r is: 0.1640556760558859
The r-squared is: 0.026914264846153772


The above scatterplot and regression analysis show that, based on each country's 2015 values of health expenditure per capita and number of suicides, there is almost no correlation. The r-squared value of ~0.027 means that health expenditure explains only ~2.7% of the variation in number of suicides; the remaining ~97.3% is explained by variables other than health expenditure.

In [79]:
# Developed-countries regression, step 1: per-country averages over all
# years in the developed subset, as Series indexed by country name.
country_suicide_means_rich, country_health_expenditure_means_rich = (
    developed_df.groupby("country")[column].mean()
    for column in (
        "number of suicides (per 100,000 people)",
        "health expenditure (per capita)",
    )
)

In [80]:
# Least-squares linear fit for developed countries:
# average suicide rate (y) on average health expenditure (x).
(
    rich_slope,
    rich_int,
    rich_r,
    rich_p,
    rich_std_err,
) = st.linregress(
    x=country_health_expenditure_means_rich,
    y=country_suicide_means_rich,
)

In [81]:
# Evaluate the developed-countries fitted line (slope * x + intercept) at each
# country's average expenditure to obtain the predicted average suicide rates.
rich_fit = country_health_expenditure_means_rich.mul(rich_slope).add(rich_int)

In [82]:
# Scatterplot with line of best fit for developed countries data
%matplotlib widget
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.title("Average Health Expenditure per Capita vs Suicide Rate for Developed Countries",fontsize =14)
line_eq = "y = " + str(round(rich_slope,2)) + "x + " + str(round(rich_int,2))
plt.scatter(country_health_expenditure_means_rich, country_suicide_means_rich, s=50,color="blue")
plt.plot(country_health_expenditure_means_rich, rich_fit, color="red")
plt.annotate(line_eq,(5000,20),fontsize=15,color="red")
plt.xlabel('Average Health Expenditure per Capita ($USD)',fontsize =12)
plt.ylabel('Number of Suicides (per 100,000 people)',fontsize =12)
print(f"The r is: {rich_r}")
print(f"The r-squared is: {rich_r**2}")
plt.show()
plt.savefig("images/aveexpvavesr_developed.png")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

The r is: -0.1455950738602167
The r-squared is: 0.021197925532361956


The above scatterplot and regression analysis show that, based on the mean values of health expenditure per capita and number of suicides, there is almost no correlation. The r-squared value of ~0.021 means that health expenditure explains only ~2.1% of the variation in number of suicides; the remaining ~97.9% is explained by variables other than health expenditure. It is interesting to note that the r-value is very slightly negative (~-0.146), but a value this close to zero would still be considered to indicate no correlation.

In [83]:
# Developing-countries regression, step 1: per-country averages over all
# years in the developing subset, as Series indexed by country name.
country_suicide_means_poor, country_health_expenditure_means_poor = (
    developing_df.groupby("country")[column].mean()
    for column in (
        "number of suicides (per 100,000 people)",
        "health expenditure (per capita)",
    )
)

In [84]:
# Least-squares linear fit for developing countries:
# average suicide rate (y) on average health expenditure (x).
(
    poor_slope,
    poor_int,
    poor_r,
    poor_p,
    poor_std_err,
) = st.linregress(
    x=country_health_expenditure_means_poor,
    y=country_suicide_means_poor,
)

In [85]:
# Evaluate the developing-countries fitted line (slope * x + intercept) at each
# country's average expenditure to obtain the predicted average suicide rates.
poor_fit = country_health_expenditure_means_poor.mul(poor_slope).add(poor_int)

In [87]:
# Scatterplot with line of best fit for developing countries data
%matplotlib widget
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.title("Average Health Expenditure per Capita vs Suicide Rate for Developing Countries",fontsize =14)
line_eq = "y = " + str(round(poor_slope,2)) + "x + " + str(round(poor_int,2))
plt.scatter(country_health_expenditure_means_poor, country_suicide_means_poor, s=50,color="blue")
plt.plot(country_health_expenditure_means_poor, poor_fit, color="red")
plt.annotate(line_eq,(1000,15),fontsize=15,color="red")
plt.xlabel('Average Health Expenditure per Capita ($USD)',fontsize =12)
plt.ylabel('Number of Suicides (per 100,000 people)',fontsize =12)
print(f"The r is: {poor_r}")
print(f"The r-squared is: {poor_r**2}")
plt.show()
plt.savefig("images/aveexpvavesr_developing.png")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

The r is: 0.13945183524516883
The r-squared is: 0.01944681435324571


The above scatterplot and regression analysis show that, based on the mean values of health expenditure per capita and number of suicides, there is almost no correlation. The r-squared value of ~0.019 means that health expenditure explains only ~1.9% of the variation in number of suicides; the remaining ~98.1% is explained by variables other than health expenditure.