In [1]:
# importing common libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.model_selection import train_test_split


# warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset; the CSV is read from a path relative to the notebook.
dataset = pd.read_csv("MalariaAfricaDataset.csv")

# Work on an independent copy so the cleaning steps that edit `df` never touch
# `dataset`. Wrapping a DataFrame in pd.DataFrame(...) does not guarantee a copy.
df = dataset.copy()
df.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'MalariaAfricaDataset.csv'

In [None]:
# Dataset dimensions: DataFrame.shape is a (rows, columns) pair
number_of_rows, number_of_columns = df.shape
print(f'Number of columns: {number_of_columns}')
print(f'Number of rows: {number_of_rows}')

In [None]:
# Column names as loaded from the CSV (the renaming cell comes later)
df.columns

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Parse 'Year' as a four-digit year and keep only the year number.
# .dt.year extracts integers, so the column does not end up with a datetime dtype.
df['Year'] = pd.to_datetime(df.Year,format='%Y').dt.year
df.dtypes

In [None]:
# Renaming columns to short, analysis-friendly names.
# Columns whose name does not change (Year, longitude, latitude, geometry) are left out.
new_column_names = {
    'Country Name': 'Country',
    # Later cells (the regression merge and the choropleth) read 'Country Code',
    # so the lower-case spelling is normalised to that exact name.
    'Country code': 'Country Code',
    'Malaria cases reported': 'Reported_cases',
    'Incidence of malaria (per 1,000 population at risk)': 'Incidences_per_1000_population_at_risk',
    'Intermittent preventive treatment (IPT) of malaria in pregnancy (% of pregnant women)': '%_of_pregnant_women_using_IPT',
    'Use of insecticide-treated bed nets (% of under-5 population)': '%_using_IBNs',
    'Children with fever receiving antimalarial drugs (% of children under age 5 with fever)': '%_of_children_under_age_5_with_fever',
    'Rural population (% of total population)': '%_of_rural_population',
    'Urban population (% of total population)': '%_of_urban_population',
    'Rural population growth (annual %)': 'annual_%_growth_of_rural_population',
    'Urban population growth (annual %)': 'annual_%_growth_of_urban_population',
    'People using safely managed sanitation services (% of population)': '%_using_safe_sanity_services',
    'People using safely managed sanitation services, urban  (% of urban population)': '%_of_urban_using_safe_sanity_services',
    'People using safely managed sanitation services, rural (% of rural population)': '%_of_rural_using_safe_sanity_services',
    'People using at least basic sanitation services (% of population)': '%_using_atleast_basic_sanity_services',
    'People using at least basic sanitation services, urban  (% of urban population)': '%_of_urban_using_atleast_basic_sanity_services',
    'People using at least basic sanitation services, rural (% of rural population)': '%_of_rural_using_atleast_basic_sanity_services',
    'People using at least basic drinking water services (% of population)': '%_using_atleast_basic_drinking_water_services',
    'People using at least basic drinking water services, urban (% of urban population)': '%_of_urban_using_atleast_basic_drinking_water_services',
    'People using at least basic drinking water services, rural (% of rural population)': '%_of_rural_using_atleast_basic_drinking_water_services',
    'People using safely managed drinking water services (% of population)': '%_using_safe_drinking_water_services',
    'People using safely managed drinking water services, urban (% of urban population)': '%_of_urban_using_safe_drinking_water_services',
    'People using safely managed drinking water services, rural (% of rural population)': '%_of_rural_using_safe_drinking_water_services',
}

# rename() skips keys that are not present, so re-running this cell is harmless
df = df.rename(columns=new_column_names)



In [None]:
# Review the column names after renaming
df.columns

In [None]:
# Count the nulls in every column and list the largest counts first
number_of_missing_values = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
)
number_of_missing_values

In [None]:
# Share of nulls per column, largest first, rendered as text such as '12.50%'
percentage_of_missing_values = (
    df.isna()
    .sum()
    .div(df.shape[0])
    .mul(100)
    .sort_values(ascending=False)
    .map(lambda share: f'{share:.2f}%')
)
percentage_of_missing_values

Checking for rows where the sum of '%_of_rural_population' and '%_of_urban_population' is not equal to 100

In [None]:
rural_share = df['%_of_rural_population']
urban_share = df['%_of_urban_population']

# Helper column with rural + urban for every row (a missing share counts as 0 in this sum)
df['%_of_total_population'] = df[['%_of_rural_population', '%_of_urban_population']].sum(axis=1)

# Rows where both shares are present but do not add up to 100.
# A small tolerance is used because sums of decimal fractions are not always
# exactly 100 in binary floating point; an exact `!= 100` would flag those rows too.
sums_to_100 = np.isclose((rural_share + urban_share).to_numpy(dtype=float), 100, rtol=0, atol=1e-9)
invalid_total_population = df[rural_share.notnull() & urban_share.notnull() & ~sums_to_100].reset_index(drop=True)

print(f'Number of such rows: {invalid_total_population.shape[0]}')

invalid_total_population[['Country', 'Year', '%_of_rural_population', '%_of_urban_population', '%_of_total_population']]

Algorithm to normalize the '%_of_rural_population' and '%_of_urban_population' columns for rows where the sum is not 100.

In [None]:
# Algorithm to normalize %_of_rural_population and %_of_urban_population where the total population percentage is not 100
def normalize_percentages(df, tolerance=1e-9):
    """Rescale rural/urban population shares so that each pair sums to 100.

    Only rows where both '%_of_rural_population' and '%_of_urban_population'
    are present are considered. A row is rescaled when its sum differs from
    100 by more than ``tolerance``; each share is then divided by the row sum,
    multiplied by 100 and rounded to 2 decimal places.

    Rows whose sum is zero are left untouched, since dividing by the sum
    would not yield a meaningful share.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both population-share columns. It is modified in place.
    tolerance : float, optional
        Absolute tolerance for treating a sum as equal to 100. This avoids
        rescaling (and rounding) rows that are off only by floating-point noise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the in-place update.
    """
    rural_col = '%_of_rural_population'
    urban_col = '%_of_urban_population'

    rural = df[rural_col]
    urban = df[urban_col]
    total = rural + urban

    already_100 = np.isclose(total.to_numpy(dtype=float), 100, rtol=0, atol=tolerance)
    needs_rescaling = rural.notna() & urban.notna() & (total != 0) & ~already_100
    if not needs_rescaling.any():
        return df

    # Compute both replacements before writing either, so the second one
    # cannot be affected by the first assignment.
    scaled_rural = (rural[needs_rescaling] / total[needs_rescaling] * 100).round(2)
    scaled_urban = (urban[needs_rescaling] / total[needs_rescaling] * 100).round(2)

    # Assign positionally (to_numpy) so duplicate index labels cannot break alignment
    df.loc[needs_rescaling, rural_col] = scaled_rural.to_numpy()
    df.loc[needs_rescaling, urban_col] = scaled_urban.to_numpy()
    return df

# Apply the normalization to the working frame; the function edits df and returns that same frame
df = normalize_percentages(df)

Review normalization results

In [None]:
# Checking for rows where %_of_rural_population + %_of_urban_population is still not 100.
# The sum is recomputed here because the '%_of_total_population' column stored on df
# was calculated before normalization, and it is compared with a small tolerance
# because two rounded floats do not always add up to exactly 100.
current_total = df['%_of_rural_population'] + df['%_of_urban_population']
still_off = ~np.isclose(current_total.to_numpy(dtype=float), 100, rtol=0, atol=1e-9)

new_invalid_total_population = (
    df.assign(**{'%_of_total_population': current_total})
    [df['%_of_rural_population'].notnull() & df['%_of_urban_population'].notnull() & still_off]
    .reset_index(drop=True)
)

print(f'Number of such rows: {new_invalid_total_population.shape[0]}')

new_invalid_total_population[['Country', 'Year', '%_of_rural_population', '%_of_urban_population', '%_of_total_population']]

Fill cells where values are "Missing At Random", which means that their values can be determined using values from other columns.

The algorithm is based off of four equations which represent a mathematical relationship between some of the columns as shown below.
1. %_using_safe_sanity_services = ((%_of_rural_using_safe_sanity_services * %_of_rural_population) + (%_of_urban_using_safe_sanity_services * %_of_urban_population))/100

2. %_using_atleast_basic_drinking_water_services = ((%_of_rural_using_atleast_basic_drinking_water_services * %_of_rural_population) + (%_of_urban_using_atleast_basic_drinking_water_services * %_of_urban_population))/100

3. %_using_atleast_basic_sanity_services = ((%_of_rural_using_atleast_basic_sanity_services * %_of_rural_population) + (%_of_urban_using_atleast_basic_sanity_services * %_of_urban_population))/100

4. %_using_safe_drinking_water_services = ((%_of_rural_using_safe_drinking_water_services * %_of_rural_population) + (%_of_urban_using_safe_drinking_water_services * %_of_urban_population))/100

Eritrea is the only country with null values in the '%_of_rural_population' and '%_of_urban_population' columns, which can be calculated using values provided in the columns '%_using_atleast_basic_sanity_services', '%_of_rural_using_atleast_basic_sanity_services', '%_of_urban_using_atleast_basic_sanity_services', '%_using_atleast_basic_drinking_water_services', '%_of_rural_using_atleast_basic_drinking_water_services', and '%_of_urban_using_atleast_basic_drinking_water_services'

Eritrea is also the only country with null values in the `annual_%_growth_of_rural_population` and `annual_%_growth_of_urban_population` columns.
 

In [None]:
# Slice out Eritrea's rows, renumbered from 0
eritrea = df.loc[df['Country'].eq('Eritrea')].reset_index(drop=True)
# Show its population shares and growth rates by year
eritrea.loc[:, [
    'Country',
    'Year',
    '%_of_rural_population',
    '%_of_urban_population',
    'annual_%_growth_of_rural_population',
    'annual_%_growth_of_urban_population',
]]


Algorithm to fill the %_of_rural_population and %_of_urban_population columns.

In [None]:

def _solve_population_split(water_rates, sanitation_rates, water_total, sanitation_total):
    """Solve the 2x2 system for one row and return (rural %, urban %) rounded to 2 decimals.

    The system is
        water_rates[0]/100 * rural + water_rates[1]/100 * urban = water_total
        sanitation_rates[0]/100 * rural + sanitation_rates[1]/100 * urban = sanitation_total
    where index 0 is the rural rate and index 1 the urban rate.
    """
    coefficients = [
        [water_rates[0] / 100, water_rates[1] / 100],
        [sanitation_rates[0] / 100, sanitation_rates[1] / 100],
    ]
    constants = [water_total, sanitation_total]
    try:
        solution = np.linalg.solve(coefficients, constants)
    except np.linalg.LinAlgError as e:
        # Singular matrix: fall back to the pseudo-inverse
        print(f"Warning: {e}")
        solution = np.dot(np.linalg.pinv(coefficients), constants)
    return round(float(solution[0]), 2), round(float(solution[1]), 2)


def fill_missing_percentage_population(df):
    """Fill missing rural/urban population shares from the basic-service columns.

    For every row of ``df`` one of two things may happen:

    1. If the overall, rural and urban values of both "at least basic" service
       indicators (drinking water and sanitation) are present while both
       population shares are missing, the shares are obtained by solving the
       two service equations as a 2x2 linear system. The result is rescaled to
       sum to 100 when it does not already, and rounded to 2 decimal places.
       A solution that is not finite, has a negative share, or sums to zero or
       less is not a valid pair of percentages; such a row is left empty and
       a warning is printed.
    2. If both overall service values and both population shares are present
       while all four rural/urban service values are missing, those four
       service values are set to 0.
       NOTE(review): this writes 0 for values that are unknown rather than
       measured; downstream statistics treat them as real zeros - confirm
       this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the renamed population and service columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the in-place updates.
    """
    water_total = '%_using_atleast_basic_drinking_water_services'
    rural_water = '%_of_rural_using_atleast_basic_drinking_water_services'
    urban_water = '%_of_urban_using_atleast_basic_drinking_water_services'
    sanitation_total = '%_using_atleast_basic_sanity_services'
    rural_sanitation = '%_of_rural_using_atleast_basic_sanity_services'
    urban_sanitation = '%_of_urban_using_atleast_basic_sanity_services'
    rural_pop = '%_of_rural_population'
    urban_pop = '%_of_urban_population'
    service_breakdown = [rural_water, urban_water, rural_sanitation, urban_sanitation]

    for index, row in df.iterrows():
        totals_known = pd.notnull(row[water_total]) and pd.notnull(row[sanitation_total])
        if not totals_known:
            continue

        breakdown_known = all(pd.notnull(row[name]) for name in service_breakdown)
        breakdown_missing = all(pd.isnull(row[name]) for name in service_breakdown)
        population_known = pd.notnull(row[rural_pop]) and pd.notnull(row[urban_pop])
        population_missing = pd.isnull(row[rural_pop]) and pd.isnull(row[urban_pop])

        if breakdown_known and population_missing:
            rural_pct, urban_pct = _solve_population_split(
                (row[rural_water], row[urban_water]),
                (row[rural_sanitation], row[urban_sanitation]),
                row[water_total],
                row[sanitation_total],
            )
            total = rural_pct + urban_pct

            # Reject solutions that cannot be a pair of percentages
            plausible = np.isfinite(total) and total > 0 and rural_pct >= 0 and urban_pct >= 0
            if not plausible:
                print(f"Warning: no plausible rural/urban split for row {index}; leaving it empty")
                continue

            # Rescale only when the sum is off by more than floating-point noise
            if not np.isclose(total, 100, rtol=0, atol=1e-9):
                rural_pct, urban_pct = (
                    round(rural_pct / total * 100, 2),
                    round(urban_pct / total * 100, 2),
                )
            df.at[index, rural_pop] = rural_pct
            df.at[index, urban_pop] = urban_pct

        elif breakdown_missing and population_known:
            for name in service_breakdown:
                df.at[index, name] = 0.00

    return df

            
# Run the fill on the working frame; the function writes into df and returns that same frame
df = fill_missing_percentage_population(df)

We can now see that '%_of_rural_population' and '%_of_urban_population' have been filled.

The last row, however, still has null values because '%_using_atleast_basic_drinking_water_services', '%_of_rural_using_atleast_basic_drinking_water_services', '%_of_urban_using_atleast_basic_drinking_water_services', '%_using_atleast_basic_sanity_services', '%_of_rural_using_atleast_basic_sanity_services', '%_of_urban_using_atleast_basic_sanity_services' are null and yet these are the values required by the algorithm to calculate the '%_of_rural_population' and '%_of_urban_population'

In [None]:
# Re-slice Eritrea from the updated frame, renumbered from 0
eritrea = df.loc[df['Country'].eq('Eritrea')].reset_index(drop=True)
# Explain the remaining gap, then show the population columns
print("One row still has null values for '%_of_rural_population' and '%_of_urban_population' because the columns whose values are used to calculate them were null")
eritrea.loc[:, [
    'Country',
    'Year',
    '%_of_rural_population',
    '%_of_urban_population',
    'annual_%_growth_of_rural_population',
    'annual_%_growth_of_urban_population',
]]

We realised that we are working with time-series data, which means that the data contains trends or patterns which are clearly visible to the naked eye.

Mean and mode imputation were therefore not an option because they would introduce biases and affect existing trends or patterns.

We developed an algorithm that utilises regression, to predict and fill the missing values.

In [None]:
all_columns = df.columns
def fill_with_regression(original_dataframe, new_dataframe, target_column, feature_column):
    """Impute missing values of one column with a linear regression and merge them back.

    A ``LinearRegression`` of ``target_column`` on ``feature_column`` is fitted
    on the rows of ``new_dataframe`` where the target is present. The rows where
    it is missing receive the model's prediction, and the whole target column
    of ``new_dataframe`` is rounded to 2 decimal places. ``new_dataframe`` is
    modified in place; the calling cells display it afterwards.

    ``new_dataframe`` is then left-merged onto ``original_dataframe`` using the
    keys 'Country', 'Country Code' and 'Year'. For every other column the two
    frames share, a value from ``new_dataframe`` is used only where
    ``original_dataframe`` has a null.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Full dataset; it is not modified.
    new_dataframe : pandas.DataFrame
        Subset of rows (one country in this notebook) to impute; modified in place.
    target_column : str
        Column whose missing values are predicted.
    feature_column : str
        Single predictor column (the notebook passes 'Year').

    Returns
    -------
    tuple
        ``(merged_df, mse, r_squared, target_column)``. ``mse`` and
        ``r_squared`` are computed on the training rows (in-sample) and rounded
        to 2 decimal places.

    Raises
    ------
    ValueError
        If ``new_dataframe`` has no row with an observed ``target_column``.
    """
    key_columns = ['Country', 'Country Code', 'Year']

    # Rows with an observed target train the model
    has_target = new_dataframe[target_column].notnull()
    df_train = new_dataframe[has_target]
    if df_train.empty:
        raise ValueError(f"No observed '{target_column}' values to fit the regression on")

    X_train = df_train[feature_column].to_numpy().reshape(-1, 1)
    y_train = df_train[target_column].to_numpy()
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict only when something is missing: predict() rejects an empty array
    is_missing = ~has_target
    if is_missing.any():
        X_pred = new_dataframe.loc[is_missing, feature_column].to_numpy().reshape(-1, 1)
        new_dataframe.loc[is_missing, target_column] = model.predict(X_pred)

    # Round off to 2 decimal places (observed and predicted values alike)
    new_dataframe[target_column] = new_dataframe[target_column].round(2)

    # Merge the filled rows back; shared non-key columns arrive as <name>_x / <name>_y
    merged_df = pd.merge(original_dataframe, new_dataframe, on=key_columns, how='left',
                         suffixes=('_x', '_y'))

    # Walk the columns of the frame that was actually passed in, not a
    # module-level snapshot that can go stale.
    for column in original_dataframe.columns:
        left_name, right_name = column + '_x', column + '_y'
        if column in key_columns or left_name not in merged_df.columns or right_name not in merged_df.columns:
            continue
        left_values = merged_df[left_name]
        # Keep the original value when present, otherwise take the filled one
        merged_df[column] = left_values.where(left_values.notna(), merged_df[right_name])
        merged_df = merged_df.drop(columns=[left_name, right_name])

    # In-sample fit quality
    y_fit = model.predict(X_train)
    mse = round(mean_squared_error(y_train, y_fit), 2)
    r_squared = round(r2_score(y_train, y_fit), 2)

    return merged_df, mse, r_squared, target_column

eritrea_columns = [
    '%_of_rural_population',
    '%_of_urban_population',
    'annual_%_growth_of_rural_population',
    'annual_%_growth_of_urban_population',
]

# Impute each column from its trend over 'Year'; df is rebound to the merged frame on every pass
for column in eritrea_columns:
    df, mse, r_squared, target_column = fill_with_regression(
        original_dataframe=df,
        new_dataframe=eritrea,
        target_column=column,
        feature_column='Year',
    )
    print(f'The Mean Squared Error for the model used to predict {target_column} values is: {mse}')
    print(f'The r_squared for the model used to predict {target_column} values is: {r_squared}')
    print("")

print('All missing values have been filled')

# Same columns as above, preceded by the identifying columns
eritrea[['Country', 'Year'] + eritrea_columns]


Missing values for '%\_of_rural_population', '%\_of_urban_population', 'annual_%\_growth_of_rural_population' and 'annual\_%_growth_of_urban_population' have been filled.

We then went ahead to fill missing values for '%\_of_urban_using_atleast_basic_sanity_services', '%\_of_rural_using_atleast_basic_drinking_water_services', '%\_of_urban_using_atleast_basic_drinking_water_services', '%\_of_rural_using_atleast_basic_sanity_services', '%_using_atleast_basic_drinking_water_services', '%_using_atleast_basic_sanity_services'   

South Sudan, Eritrea and Central African Republic have rows with no data for these columns.

We again noticed trends in this data and therefore, regression imputation was used.

    

In [None]:
# Independent copy of South Sudan's rows: fill_with_regression assigns into this frame,
# and writing into a filtered slice is what triggers pandas' SettingWithCopyWarning.
South_Sudan = df[df['Country'] == "South Sudan"].copy()
South_Sudan[['Country', '%_using_atleast_basic_sanity_services', '%_using_atleast_basic_drinking_water_services', '%_of_rural_using_atleast_basic_drinking_water_services', '%_of_urban_using_atleast_basic_drinking_water_services', '%_of_rural_using_atleast_basic_sanity_services', '%_of_urban_using_atleast_basic_sanity_services']].reset_index(drop=True)

In [None]:

# Service columns to impute; later cells reuse this list for other countries
null_columns = [
    '%_using_atleast_basic_drinking_water_services',
    '%_using_atleast_basic_sanity_services',
    '%_of_rural_using_atleast_basic_drinking_water_services',
    '%_of_urban_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_sanity_services',
    '%_of_urban_using_atleast_basic_sanity_services',
]

# One regression on 'Year' per column; df is rebound to the merged frame on every pass
for column in null_columns:
    df, mse, r_squared, target_column = fill_with_regression(
        original_dataframe=df,
        new_dataframe=South_Sudan,
        target_column=column,
        feature_column='Year',
    )
    print(f'The Mean Squared Error for the model used to predict {target_column} values is: {mse}')
    print(f'The r_squared for the model used to predict {target_column} values is: {r_squared}')
    print('')

print("All specified columns with missing values have been filled ")

South_Sudan.loc[:, [
    'Country',
    '%_using_atleast_basic_sanity_services',
    '%_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_drinking_water_services',
    '%_of_urban_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_sanity_services',
    '%_of_urban_using_atleast_basic_sanity_services',
]].reset_index(drop=True)

In [None]:
# Re-slice Eritrea from the current frame, renumbered from 0
eritrea = df.loc[df['Country'].eq('Eritrea')].reset_index(drop=True)
# Show the basic-service columns by year
print("One row has null values for these columns")
eritrea.loc[:, [
    'Country',
    'Year',
    '%_using_atleast_basic_sanity_services',
    '%_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_drinking_water_services',
    '%_of_urban_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_sanity_services',
    '%_of_urban_using_atleast_basic_sanity_services',
]]

In [None]:
# One regression on 'Year' per service column; df is rebound to the merged frame on every pass
for column in null_columns:
    df, mse, r_squared, target_column = fill_with_regression(
        original_dataframe=df,
        new_dataframe=eritrea,
        target_column=column,
        feature_column='Year',
    )
    print(f'The Mean Squared Error for the model used to predict {target_column} values is: {mse}')
    print(f'The r_squared for the model used to predict {target_column} values is: {r_squared}')
    print('')

print("All specified columns with missing values have been filled using regression")

eritrea.loc[:, [
    'Country',
    '%_using_atleast_basic_sanity_services',
    '%_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_drinking_water_services',
    '%_of_urban_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_sanity_services',
    '%_of_urban_using_atleast_basic_sanity_services',
]].reset_index(drop=True)

In [None]:
# Slice out the Central African Republic's rows, renumbered from 0
central_african_republic = df.loc[df['Country'].eq('Central African Republic')].reset_index(drop=True)
# Show the basic-service columns by year
print("One row has null values for these columns")
central_african_republic.loc[:, [
    'Country',
    'Year',
    '%_using_atleast_basic_sanity_services',
    '%_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_drinking_water_services',
    '%_of_urban_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_sanity_services',
    '%_of_urban_using_atleast_basic_sanity_services',
]]

In [None]:
# One regression on 'Year' per service column; df is rebound to the merged frame on every pass
for column in null_columns:
    df, mse, r_squared, target_column = fill_with_regression(
        original_dataframe=df,
        new_dataframe=central_african_republic,
        target_column=column,
        feature_column='Year',
    )
    print(f'The Mean Squared Error for the model used to predict {target_column} values is: {mse}')
    print(f'The r_squared for the model used to predict {target_column} values is: {r_squared}')
    print('')

print("All specified columns with missing values have been filled using regression")

central_african_republic.loc[:, [
    'Country',
    '%_using_atleast_basic_sanity_services',
    '%_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_drinking_water_services',
    '%_of_urban_using_atleast_basic_drinking_water_services',
    '%_of_rural_using_atleast_basic_sanity_services',
    '%_of_urban_using_atleast_basic_sanity_services',
]].reset_index(drop=True)

We dropped the columns '%_of_rural_using_safe_drinking_water_services', '%_of_urban_using_safe_drinking_water_services', '%_using_safe_drinking_water_services', '%_using_safe_sanity_services', '%_of_rural_using_safe_sanity_services' and '%_of_urban_using_safe_sanity_services' for the reasons below.

1. They have a very high percentage of missing values. This being time-series data(data with patterns or trends), most if not all of the possible imputation methods were not applicable. Regression imputation was not possible because there was no tangible amount of data to train the models and mean or mode imputation would affect underlying trends and introduce bias.

2. According to "World Bank Open Data website", The percentage of people using at least basic services encompasses both people using basic services as well as those using safely managed services. We then based our analysis on columns with "People using atleast basic services" to avoid redundancy in our analysis.

In [None]:
# The six "safely managed" service columns removed from the analysis
columns_to_drop = [
    '%_of_rural_using_safe_drinking_water_services',
    '%_of_urban_using_safe_drinking_water_services',
    '%_using_safe_sanity_services',
    '%_of_rural_using_safe_sanity_services',
    '%_of_urban_using_safe_sanity_services',
    '%_using_safe_drinking_water_services',
]

# Removes them from df itself (no new frame is created)
df.drop(labels=columns_to_drop, axis='columns', inplace=True)

Fill missing values for 'Incidences_per_1000_population_at_risk', 'Reported_cases'.

Tunisia,  Mauritius, Lesotho and Libya have no data for these columns. 

We filled these missing values with 0's because mean, mode or regression imputation were not applicable. Using other countries data to fill missing values for "Incidences_per_1000_population_at_risk" and "Reported_cases" for another country was not logical.

In [None]:
# Missing incidence and case counts are recorded as 0
columns_to_fill = ['Incidences_per_1000_population_at_risk', 'Reported_cases']
for column_name in columns_to_fill:
    df[column_name] = df[column_name].fillna(0)


Filling missing values for '%_using_IBNs', '%_of_children_under_age_5_with_fever', '%_of_pregnant_women_using_IPT'

Filling one country's missing values using data from other countries was not logical. 

Therefore, we grouped each countries rows into a single dataframe, calculated the mean for each column and used it to fill the missing values.

For countries with no values for each of the three columns, the missing values were replaced with 0's.

In [None]:
columns_of_interest = ['%_using_IBNs', '%_of_children_under_age_5_with_fever', '%_of_pregnant_women_using_IPT']

# Per-country mean of each column, broadcast back onto every row of that country
country_means = df.groupby('Country')[columns_of_interest].transform('mean')
for col in columns_of_interest:
    # Fill the gaps with the country's own mean, then round the whole column to 2 decimals
    df[col] = df[col].fillna(country_means[col]).round(2)

# Whatever is still NaN anywhere in the frame becomes 0.
# NOTE(review): this fill covers every column, not only the three above - confirm that is intended.
df.fillna(value=0, inplace=True)

Confirming whether all columns have been filled.

In [None]:
# Share of nulls per column after cleaning, largest first, rendered as text such as '0.00%'
percentage_of_null_values = (
    df.isna()
    .sum()
    .div(df.shape[0])
    .mul(100)
    .sort_values(ascending=False)
    .map(lambda share: f'{share:.2f}%')
)
percentage_of_null_values

Pattern Discovery

In [None]:
import plotly.express as px
# Animated map of Africa: one frame per year, countries shaded by malaria incidence
choropleth_settings = dict(
    locations=df['Country Code'],
    color=df['Incidences_per_1000_population_at_risk'],
    animation_frame=df['Year'],
    locationmode='ISO-3',
    scope='africa',
    color_continuous_scale='Blues',
    labels={'color': 'Incidence of Malaria'},
    title="Incidence of Malaria at risk in Africa",
)
fig1 = px.choropleth(df, **choropleth_settings)

fig1.show()

In [None]:
# One line per country showing malaria incidence by year
trend_labels = {'Year': 'Year', 'Incidences_per_1000_population_at_risk': 'Malaria Incidences'}
fig2 = px.line(
    df,
    x='Year',
    y='Incidences_per_1000_population_at_risk',
    color='Country',
    labels=trend_labels,
    title='Trend of Malaria Incidences over the Years',
)

# Render the chart
fig2.show()

In [None]:
# List the columns currently in df
df.columns

In [None]:
import plotly.express as px

# Histograms (20 bins each) for three variables, given as column name -> chart title
histogram_titles = {
    'Incidences_per_1000_population_at_risk': 'Distribution of Malaria Incidences per 1000 Population at Risk',
    '%_using_IBNs': 'Distribution of Percentage of People Using Insecticide-Treated Bed Nets',
    '%_of_rural_population': 'Distribution of Percentage of Rural Population',
}

# Build the three figures in order: incidence, bed nets, rural population
fig1, fig2, fig3 = (
    px.histogram(df, x=column_name, nbins=20, title=chart_title)
    for column_name, chart_title in histogram_titles.items()
)

# Render them in that same order
for figure in (fig1, fig2, fig3):
    figure.show()

# Add more histograms for other variables as needed...



In [None]:
# Mean incidence across all rows of each year, as a two-column frame (Year, mean)
yearly_mean_incidence = df.groupby('Year')['Incidences_per_1000_population_at_risk'].mean()
average_incidences_by_year = yearly_mean_incidence.reset_index()

# Line chart of that yearly average
average_labels = {'Year': 'Year', 'Incidences_per_1000_population_at_risk': 'Average Malaria Incidences'}
fig3 = px.line(
    average_incidences_by_year,
    x='Year',
    y='Incidences_per_1000_population_at_risk',
    labels=average_labels,
    title='Trend of Average Malaria Incidences over the Years',
)

# Render the chart
fig3.show()

In [None]:


# Bed-net usage against malaria incidence
bednet_column = '%_using_IBNs'
incidence_column = 'Incidences_per_1000_population_at_risk'
correlation_df = df[[bednet_column, incidence_column]]

# Pearson coefficient: the off-diagonal entry of the 2x2 correlation matrix
correlation_coefficient = correlation_df.corr().loc[bednet_column, incidence_column]

print("Correlation Coefficient:", correlation_coefficient)

# Scatter plot of the two variables
fig, ax = plt.subplots()
ax.scatter(correlation_df[bednet_column], correlation_df[incidence_column])
ax.set_xlabel('% of People Using Insecticide-Treated Bed Nets')
ax.set_ylabel('Malaria Incidences per 1000 Population at Risk')
ax.set_title('Correlation between Use of Insecticide-Treated Bed Nets and Malaria Incidences')
plt.show()


In [None]:
# Select the rural "at least basic" drinking-water column together with malaria incidence
correlation_df = df[['%_of_rural_using_atleast_basic_drinking_water_services', 'Incidences_per_1000_population_at_risk']]

# Compute the correlation coefficient (Pearson correlation)
correlation_coefficient = correlation_df.corr().iloc[0, 1]

# Print the correlation coefficient
print("Correlation Coefficient:", correlation_coefficient)

# Visualize the relationship with a scatter plot.
# The labels name the "at least basic" indicator, which is the column plotted here;
# the "safely managed" columns are a different set of indicators.
plt.scatter(correlation_df['%_of_rural_using_atleast_basic_drinking_water_services'], correlation_df['Incidences_per_1000_population_at_risk'])
plt.xlabel('% of Rural Population Using at Least Basic Drinking Water Services')
plt.ylabel('Malaria Incidences per 1000 Population at Risk')
plt.title('Correlation between Access to Basic Drinking Water Services (Rural) and Malaria Incidences')
plt.show()

In [None]:
import plotly.express as px

# Spread of malaria incidence, first grouped by year ...
year_box_settings = dict(
    x='Year',
    y='Incidences_per_1000_population_at_risk',
    title='Boxplot of Malaria Incidences per 1000 Population at Risk Across Years',
)
fig_year = px.box(df, **year_box_settings)
fig_year.show()

# ... then grouped by country
country_box_settings = dict(
    x='Country',
    y='Incidences_per_1000_population_at_risk',
    title='Boxplot of Malaria Incidences per 1000 Population at Risk Across Countries',
)
fig_country = px.box(df, **country_box_settings)
fig_country.show()


In [None]:
import plotly.express as px

# Rural against urban annual population growth, one point per row of df
growth_axis_labels = {
    'annual_%_growth_of_rural_population': 'Annual Growth Rate of Rural Population',
    'annual_%_growth_of_urban_population': 'Annual Growth Rate of Urban Population',
}
fig = px.scatter(
    df,
    x='annual_%_growth_of_rural_population',
    y='annual_%_growth_of_urban_population',
    labels=growth_axis_labels,
    title='Relationship between Annual Growth Rates of Rural and Urban Populations',
)

# Render the chart
fig.show()


In [None]:
import statsmodels.api as sm

# Dependent variable: urban growth
y = df['annual_%_growth_of_urban_population']

# Independent variable: rural growth, with a constant column added for the intercept
X = sm.add_constant(df['annual_%_growth_of_rural_population'])

# Ordinary least squares fit of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Full regression report
print(model.summary())


In [None]:
print("Investigate whether access to basic drinking water services is associated with the incidence of malaria.")
water_column = '%_using_atleast_basic_drinking_water_services'
incidence_column = 'Incidences_per_1000_population_at_risk'
correlation_df = df[[water_column, incidence_column]]

# Pearson coefficient: the off-diagonal entry of the 2x2 correlation matrix
correlation_coefficient = correlation_df.corr().loc[water_column, incidence_column]

print("Correlation Coefficient:", correlation_coefficient)


In [None]:
print("")
# Rural basic drinking-water access against malaria incidence
rural_water_column = '%_of_rural_using_atleast_basic_drinking_water_services'
incidence_column = 'Incidences_per_1000_population_at_risk'
correlation_df_rural = df[[rural_water_column, incidence_column]]

# Pearson coefficient for rural areas: the off-diagonal entry of the 2x2 correlation matrix
correlation_coefficient_rural = correlation_df_rural.corr().loc[rural_water_column, incidence_column]

print("Correlation Coefficient (Rural):", correlation_coefficient_rural)


In [None]:
from scipy.stats import mannwhitneyu

# Split the rows on whether the rural basic drinking-water share is positive or zero.
# NOTE(review): earlier cleaning cells write 0 into this column for rows whose value was
# missing, so the "== 0" group may consist of imputed rows rather than rows with truly
# no access - confirm before interpreting the result.
rural_water_share = df['%_of_rural_using_atleast_basic_drinking_water_services']
rural_with_access = df[rural_water_share > 0]
rural_without_access = df[rural_water_share == 0]

# Extract the incidence data for the two groups
incidences_with_access = rural_with_access['Incidences_per_1000_population_at_risk']
incidences_without_access = rural_without_access['Incidences_per_1000_population_at_risk']

# The test needs observations in both groups; report that clearly instead of
# failing (or returning NaN) inside scipy when one group is empty.
if incidences_with_access.empty or incidences_without_access.empty:
    statistic, p_value = float('nan'), float('nan')
    print("Mann-Whitney U test skipped: one of the two groups has no rows")
else:
    # Perform the Mann-Whitney U test
    statistic, p_value = mannwhitneyu(incidences_with_access, incidences_without_access, alternative='two-sided')

print("Mann-Whitney U Statistic:", statistic)
print("P-value:", p_value)
