In [1]:
import pandas as pd
import numpy as np
# Raw dataset location (data.world query endpoint); fetched over the network on every run.
DATA_URL = 'https://query.data.world/s/brsolzsycfrm34kywedlvsfzxf7u5p'

# The file is decoded as Latin-1 rather than pandas' default UTF-8.
df = pd.read_csv(DATA_URL, encoding='latin-1')

In [2]:
# Ordered (region label, member countries) pairs consulted by UN_region.
# Tuples are used so membership is tested by equality, exactly like a list.
# NOTE(review): the groupings look like UN African sub-regions, but e.g.
# 'Sao Tome and Principe' is not listed anywhere -- confirm against the
# country spellings actually present in the dataset.
_UN_REGION_MEMBERS = (
    ('East Africa', (
        'Burundi', 'Comoros', 'Djibouti', 'Eritrea', 'Ethiopia', 'Kenya',
        'Madagascar', 'Malawi', 'Mauritius', 'Mayotte', 'Mozambique',
        'Reunion', 'Rwanda', 'Seychelles', 'Somalia', 'Somaliland',
        'South Sudan', 'United Republic of Tanzania', 'Uganda', 'Zambia',
        'Zimbabwe',
    )),
    ('Central Africa', (
        'Angola', 'Cameroon', 'Central African Republic', 'Chad', 'Congo',
        'Democratic Republic of the Congo', 'Equatorial Guinea', 'Gabon',
    )),
    ('West Africa', (
        'Benin', 'Burkina Faso', 'Cape Verde', "Côte d'Ivoire", 'Gambia',
        'Ghana', 'Guinea', 'Guinea-Bissau', 'Liberia', 'Mali', 'Mauritania',
        'Niger', 'Nigeria', 'Senegal', 'Sierra Leone', 'Togo',
    )),
    ('Southern Africa', (
        'Botswana', 'Eswatini', 'Lesotho', 'Namibia', 'South Africa',
    )),
)


def UN_region(country):
    """Return the region label for ``country``.

    The lookup is an exact, case-sensitive match against the hard-coded
    member lists above. Any value that is not listed (including None or
    a differently spelled name) yields 'Other'.
    """
    for region_label, member_names in _UN_REGION_MEMBERS:
        if country in member_names:
            return region_label
    return 'Other'

In [3]:
# Replace the verbose source headers with short snake_case names.
# The keys must match the CSV headers character for character, including
# their trailing spaces and the double space in the deaths-rate header.
# Reassigning (instead of inplace=True) keeps the step explicit.
df = df.rename(columns={
    'Estimated incidence rate of new HIV infection per 1 000 uninfected population ': 'new_infections_per_1k',
    'Estimated number of annual AIDS related deaths': 'deaths',
    'Estimated number of annual new HIV infections': 'new_infections',
    'Estimated number of people living with HIV': 'total_infections',
    'Estimated rate of annual AIDS related deaths  per 100 000 population ': 'deaths_per_100k',
})

# Keep only the last space-separated token of Age (the rendered output shows
# values such as '10-19'; presumably the raw values carry a leading label --
# TODO confirm). The vectorised .str accessor leaves missing values as NaN
# instead of raising AttributeError the way a per-element x.split() would.
df['Age'] = df['Age'].str.split(' ').str[-1]

# Label every row with its region; countries not listed in UN_region get 'Other'.
df['UN Region'] = df['Country'].map(UN_region)

df.head()

Unnamed: 0,Country,UNICEF Region,Year,Sex,Age,new_infections_per_1k,deaths,new_infections,total_infections,deaths_per_100k,UN Region
0,Angola,Eastern and Southern Africa,1990,Female,10-19,0.64,100.0,500.0,860,0.36,Central Africa
1,Angola,Eastern and Southern Africa,1990,Male,10-19,0.15,100.0,100.0,200,0.07,Central Africa
2,Angola,Eastern and Southern Africa,1991,Female,10-19,0.75,100.0,500.0,1100,0.42,Central Africa
3,Angola,Eastern and Southern Africa,1991,Male,10-19,0.17,100.0,200.0,500,0.14,Central Africa
4,Angola,Eastern and Southern Africa,1992,Female,10-19,0.88,100.0,590.0,1300,0.54,Central Africa


In [4]:
# Write the renamed and region-labelled frame to disk, leaving out the row index.
df.to_csv(path_or_buf='W02.csv', index=False)

In [9]:
# Total new infections per (UN Region, Year).
df_grouped = df.groupby(['UN Region', 'Year'])['new_infections'].sum().reset_index()

# Pair each (region, year) total with the total of the FOLLOWING year:
# a right-hand row whose 'Last Year' equals the left-hand 'Year' belongs to Year + 1.
# After the merge, new_infections_x is the current year's total and
# new_infections_y is the next year's. The default inner join drops any year
# that has no successor for its region (e.g. the final year).
df_grouped['Last Year'] = df_grouped['Year'] - 1
df_merged = pd.merge(
    df_grouped[['UN Region', 'Year', 'new_infections']],
    df_grouped[['UN Region', 'Last Year', 'new_infections']],
    left_on=['UN Region', 'Year'],
    right_on=['UN Region', 'Last Year'],
)
df_merged['new_infections_delta'] = df_merged['new_infections_y'] - df_merged['new_infections_x']

# Sigmoid easing weights: a leading 0 (the year's own total) followed by the
# logistic function sampled at 25 evenly spaced points from -2*pi to 2*pi
# inclusive (spacing pi/6). np.linspace fixes the number of samples, whereas
# np.arange with a float step and a computed stop can gain or lose an element
# through rounding.
sigmoid_inputs = np.linspace(-2 * np.pi, 2 * np.pi, 25)
sigmoid_weights = [0] + [1 / (1 + np.exp(-x)) for x in sigmoid_inputs]

# Number of sub-steps each year is divided into. Derived from the weights
# (26 here) so the fractional year offset can never drift out of sync with them.
steps_per_year = len(sigmoid_weights)

# One output row per (region-year, weight): the value eases from this year's
# total towards next year's total along the sigmoid curve.
smoothed_rows = [
    {
        'UN Region': row['UN Region'],
        'year_smoothed': row['Year'] + (index / steps_per_year),
        'new_infections_smoothed': row['new_infections_x'] + (row['new_infections_delta'] * weight),
    }
    for row in df_merged.to_dict('records')
    for index, weight in enumerate(sigmoid_weights)
]

# Explicit columns keep the header stable even if the merge produced no rows.
df_smoothed = pd.DataFrame(
    smoothed_rows,
    columns=['UN Region', 'year_smoothed', 'new_infections_smoothed'],
)

In [10]:
# Write the sigmoid-interpolated series to disk, leaving out the row index.
df_smoothed.to_csv(path_or_buf='W02_smoothed.csv', index=False)