In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
from scipy.stats import linregress
import os
import seaborn as sns
import plotly.express as px

#output_data_file = "output_data/.csv"

**1. Importing/Cleaning Data**

In [7]:
data_file = "Resources/life_expectancy_data_region.csv"
life_df = pd.read_csv(data_file)

#trim the dataframe to the columns that we need
life_df = life_df[["Country", "Year", "Region", "Status", "Life expectancy ", "Adult Mortality", "Alcohol", 
                   "percentage expenditure", "Hepatitis B", "Measles ", " BMI ", "Polio", "Total expenditure",
                  "Diphtheria ", " HIV/AIDS", "GDP", "Population", "Schooling"]]


life_df = life_df.rename(columns={'Life expectancy ': 'Life Expectancy', 
                        'percentage expenditure': 'Percentage Expenditure',
                        'Measles ': 'Measles',
                        ' BMI ': 'BMI',
                        "Total expenditure": "Total Expenditure",
                        "Diphtheria ": "Diphtheria",
                        " HIV/AIDS": "HIV/AIDS",
                        "under-five deaths": "Under Five Deaths"
                        })

disease_df = life_df[["Country", "Year", "Region", "Status", "Life Expectancy", "HIV/AIDS"]]# "Polio", "Diphtheria", "Measles", "Hepatitis B"]]
disease_df = disease_df.dropna(0)

#Create 2 datasframes (1 with NaN = mean, 1 with NaN = 0)
nan0_df = life_df.fillna(0)
nan0_df = nan0_df.sort_values("Year")
mean_df = life_df.fillna(life_df.mean())
mean_df = mean_df.sort_values("Year")
drop_df = life_df.dropna(0)
drop_df = drop_df.sort_values("Year")

#Create dataframes for each region for each broad dataframe
em_mean = mean_df[mean_df['Region']=='Eastern Mediterranean']
em_0 = nan0_df[nan0_df['Region']=='Eastern Mediterranean']

euro_mean = mean_df[mean_df['Region']=='Europe']
euro_0 = nan0_df[nan0_df['Region']=='Europe']

africa_mean = mean_df[mean_df['Region']=='Africa']
africa_0 = nan0_df[nan0_df['Region']=='Africa']
africa_drop = drop_df[drop_df['Region']=='Africa']

americas_mean = mean_df[mean_df['Region']=='Americas']
americas_0 = nan0_df[nan0_df['Region']=='Americas']

wp_mean = mean_df[mean_df['Region']=='Western Pacific']
wp_0 = nan0_df[nan0_df['Region']=='Western Pacific']

sea_mean = mean_df[mean_df['Region']=='South-East Asia']
sea_0 = nan0_df[nan0_df['Region']=='South-East Asia']

#Create Dataframes for Developed vs. Developing Countries
undev_mean= mean_df.loc[mean_df["Status"] == "Developing"]
undev_0 = nan0_df.loc[nan0_df["Status"] == "Developing"]
dev_mean = mean_df.loc[mean_df["Status"] == "Developed"]
dev_0 = nan0_df.loc[nan0_df["Status"] == "Developed"]

#Scratch Code to quickly find a value in a column
# max_value = nan0_df["Adult Mortality"].mean()
# max_value

disease_df.count()

Country            2928
Year               2928
Region             2928
Status             2928
Life Expectancy    2928
HIV/AIDS           2928
dtype: int64

In [10]:

HIV_mean = disease_df.groupby('Country', 'Year']).mean()

HIV_mean = HIV_mean.reset_index()

HIV_df = HIV_mean.merge(disease_df, how='right', on=['Country', 'Year'])
HIV_df.head()

# regions = disease_df['Region'].tolist()

# HIV_data = []

# for region in regions:
    
#     #Locate the rows that contain HIV data in each region
#     HIV_rate = disease_df.loc[disease_df["Region"] == region]
    
#     HIV_df = HIV_rate.loc[HIV_rate['HIV/AIDS'] == HIV_rate['HIV/AIDS']]
    
#     values = HIV_df['HIV/AIDS']
#     HIV_data.append(values)
    
#     quartiles = values.quantile([.25,.5,.75])
#     lowerq = quartiles[0.25]
#     upperq = quartiles[0.75]
#     iqr = upperq - lowerq
#     print(f'The IQR for {region} is: {iqr}')
    
#     #Calculate and print lower and upper bounds
#     lower_bound = lowerq - (1.5*iqr)
#     upper_bound = upperq + (1.5*iqr)
# #     print(f'The Lower Bound quartile for {region} is: {lower_bound}')
# #     print(f'The Upper Bound quartile for {region} is: {upper_bound}')

#     #Caluculate and print outliers by creating variable for outlier count
#     outliers_count = (values.loc[(HIV_df['HIV/AIDS'] >= upper_bound) |
#                                  (HIV_df['HIV/AIDS'] <= lower_bound)]).count()
                                 
#     print(f'Outliers in {region}: {outliers_count}')
#     print(f"Values above {upper_bound} could be outliers.")
#     print('-------------------------------------------------------------')
    

Unnamed: 0,Country,Year,Region,Status,Life Expectancy,HIV/AIDS
0,Afghanistan,2015,Eastern Mediterranean,Developing,65.0,0.1
1,Afghanistan,2014,Eastern Mediterranean,Developing,59.9,0.1
2,Afghanistan,2013,Eastern Mediterranean,Developing,59.9,0.1
3,Afghanistan,2012,Eastern Mediterranean,Developing,59.5,0.1
4,Afghanistan,2011,Eastern Mediterranean,Developing,59.2,0.1


In [6]:
#Finding Outliers for HIV/AIDS



HIV_data = undev_0["HIV/AIDS"]
quartiles = HIV_data.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of HIV/AIDS cases is: {lowerq}")
print(f"The upper quartile of HIV/AIDS cases is: {upperq}")
print(f"The interquartile range of HIV/AIDS cases is: {iqr}")
print(f"The the median of HIV/AIDS cases is: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print('---------------------------------------------------')
print(f"Values above {upper_bound} could be outliers.")


The lower quartile of HIV/AIDS cases is: 0.1
The upper quartile of HIV/AIDS cases is: 1.4
The interquartile range of HIV/AIDS cases is: 1.2999999999999998
The the median of HIV/AIDS cases is: 0.1 
---------------------------------------------------
Values above 3.3499999999999996 could be outliers.


**2. Graphing Code**

In [None]:
#Basic Line Graph: Life Expectancy vs. Time by Region (switch df for desired region)
le_time = africa_0.groupby(['Year']).mean()['Life Expectancy']
le_time.plot(kind='line')
plt.show()

In [None]:
#Linear Regression/Basic Scatter Plot Code (switch df and x for desired outcome)
#Input Variable vs Life Expectancy
(slope, intercept, rvalue, pvalue, stderr) = linregress(africa_0["HIV/AIDS"], africa_0['Life Expectancy'])
regress_values = africa_0["HIV/AIDS"] * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

#Print r-value >>>>>>>>>>>>>> figure out print formatting
print("The r-value is:")
print(rvalue)

#Annotate LR, create labels, then print scatter plot w/ linear regress equation
plt.scatter(africa_0["HIV/AIDS"], africa_0['Life Expectancy'])
plt.plot(africa_0["HIV/AIDS"], regress_values, "r-")
plt.annotate(line_eq,(15,70),fontsize=15,color="red")
plt.xlabel("HIV/AIDS")
plt.ylabel("Life Expectancy")
plt.title("Life Expectancy vs. Variable in Specified df")
#plt.savefig("output_data/north_lat_vs_maxtemp.png")
plt.show()

In [None]:
#Seaborn Scatter Plot for Input Variable(x) vs. Life Expectancy
(slope, intercept, rvalue, pvalue, stderr) = linregress(africa_0["HIV/AIDS"], africa_0['Life Expectancy'])
regress_values = africa_0["HIV/AIDS"] * slope + intercept
print("The r-value is:")
print(rvalue)

line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
sns.scatterplot(data=africa_0, x="HIV/AIDS", y="Life Expectancy", hue='Country', legend = False)
plt.plot(africa_0["HIV/AIDS"], regress_values, "r-")
plt.annotate(line_eq,(15,70),fontsize=15,color="red")


In [None]:
#Plotly Scatter Plot (switch out df and x_axis for desired outcome)

#Adult Mortality vs. Life Expectancy

fig = px.scatter(mean_df, x="Adult Mortality", y="Life Expectancy", animation_frame="Year", animation_group="Country",
           size="Population", color="Region", hover_name="Country",
           log_x=False, size_max=55, range_x=[0,700], range_y=[25,90])

fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()

In [None]:
#HIV/AIDS vs. Adult Mortality


fig = px.scatter(undev_0, x="HIV/AIDS", y="Adult Mortality", animation_frame="Year", animation_group="Country",
           size="Population", color="Region", hover_name="Country",
           log_x=True, size_max=55, range_x=[.05,50], range_y=[0,730])

fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()

In [None]:
#HIV/AIDS vs. Life Expectancy

fig = px.scatter(undev_0, x="HIV/AIDS", y="Life Expectancy", animation_frame="Year", animation_group="Country",
           size="Population", color="Region", hover_name="Country",
           log_x=True, size_max=55, range_x=[.08,50], range_y=[25,90])

fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()