In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import linregress
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: No module named 'plotly'

# Importing Data

In [None]:
# Colab-only: open the browser upload dialog so a local file can be placed in
# the runtime's working directory (a later cell copies kaggle.json from there).
from google.colab import files
files.upload()

In [None]:
!pip install -q kaggle

In [None]:
!mkdir ~/.kaggle

In [None]:
# Copy kaggle.json from the working directory into ~/.kaggle
# (presumably where the kaggle CLI looks for credentials — TODO confirm).
!cp kaggle.json ~/.kaggle

In [None]:
# Restrict the credentials file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive iamsouravbanerjee/cause-of-deaths-around-the-world
# with the kaggle CLI.
!kaggle datasets download -d iamsouravbanerjee/cause-of-deaths-around-the-world

In [None]:
!unzip cause-of-deaths-around-the-world.zip -d datasets

In [None]:
# Load the extracted CSV. The path is relative, matching the `-d datasets`
# target of the unzip cell, so it works from whatever directory the archive
# was extracted in (on Colab that is /content, the original hard-coded prefix).
df = pd.read_csv("datasets/cause_of_deaths.csv")
df.head()

# Preprocessing

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of distinct years present in the data.
df['Year'].nunique()

In [None]:
# The distinct year values themselves.
df['Year'].unique()

In [None]:
# Correlate every int64 column with 'Year'; non-integer columns (e.g. the
# string identifier columns) are left out before calling corr().
int64_frame = df.select_dtypes(include=['int64'])
numeric_columns = int64_frame.columns
correlation_with_year = int64_frame.corr()['Year']
print(correlation_with_year)

In [None]:
# Number of distinct countries/territories.
df['Country/Territory'].nunique()


In [None]:
# Number of rows per country/territory.
df['Country/Territory'].value_counts()

In [None]:
# Names of the per-cause columns used as model targets and summed into the
# row total later in the notebook.
cause_of_deaths = [
    "Meningitis",
    "Alzheimer's Disease and Other Dementias",
    "Parkinson's Disease",
    "Nutritional Deficiencies",
    "Malaria",
    "Drowning",
    "Interpersonal Violence",
    "Maternal Disorders",
    "HIV/AIDS",
    "Drug Use Disorders",
    "Tuberculosis",
    "Cardiovascular Diseases",
    "Lower Respiratory Infections",
    "Neonatal Disorders",
    "Alcohol Use Disorders",
    "Self-harm",
    "Exposure to Forces of Nature",
    "Diarrheal Diseases",
    "Environmental Heat and Cold Exposure",
    "Neoplasms",
    "Conflict and Terrorism",
    "Diabetes Mellitus",
    "Chronic Kidney Disease",
    "Poisonings",
    "Protein-Energy Malnutrition",
    "Road Injuries",
    "Chronic Respiratory Diseases",
    "Cirrhosis and Other Chronic Liver Diseases",
    "Digestive Diseases",
    "Fire, Heat, and Hot Substances",
    "Acute Hepatitis",
]

In [None]:
# Add a per-row total: the sum over all columns listed in cause_of_deaths.
# Re-running is safe because the new column is not part of cause_of_deaths.
df['Total_no_of_Deaths'] = df[cause_of_deaths].sum(axis=1)

In [None]:
# The ten rows with the largest Total_no_of_Deaths, with their country label.
top10_Total_no_of_Deaths = (
    df.sort_values(by='Total_no_of_Deaths', ascending=False)
      .iloc[:10]
      .loc[:, ['Total_no_of_Deaths', 'Country/Territory']]
)
top10_Total_no_of_Deaths

In [None]:
# Column sums per cause over the whole frame, as a two-column table
# ("Disease", "Total cases").
disease_df = (
    df[cause_of_deaths]
    .sum()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "Disease", 0: "Total cases"})
)
disease_df

In [None]:
# Per-cause totals ordered from the largest to the smallest.
desending = disease_df.sort_values(by=['Total cases'], ascending=False)
desending

In [None]:
# Display the working frame, which by now carries the Total_no_of_Deaths column.
df

In [None]:
# Total_no_of_Deaths summed over all rows of each country, largest first.
deaths_per_country = df.groupby('Country/Territory')['Total_no_of_Deaths'].sum()
country_df = deaths_per_country.sort_values(ascending=False).reset_index()
country_df

In [None]:
# Rows for China only, ordered by Total_no_of_Deaths in descending order.
China_Total_no_of_Deaths_df = df[df['Country/Territory']=='China'].sort_values(by='Total_no_of_Deaths',ascending=False)

In [None]:
# The ten countries with the largest Total_no_of_Deaths summed over all rows.
Top10_deaths = (
    df.groupby('Country/Territory')['Total_no_of_Deaths']
      .sum()
      .sort_values(ascending=False)
      .head(10)
      .reset_index()
)
Top10_deaths

# Visualization

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# df also holds string columns (e.g. 'Country/Territory'); calling corr() on
# them raises in recent pandas versions, so restrict to numeric dtypes first.
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 20))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Bar chart of the ten countries with the highest summed death counts.
plt.figure(figsize=(16,9))

# The y-axis label promises millions, so rescale the raw counts to match it.
top10_in_millions = Top10_deaths.assign(
    Total_no_of_Deaths=Top10_deaths['Total_no_of_Deaths'] / 1e6
)

sns.barplot(data = top10_in_millions,
            x = 'Country/Territory',
            y = 'Total_no_of_Deaths',
            palette = 'pastel'
            )

plt.xticks(rotation = 90)
plt.xlabel('Country', size = 20)
plt.ylabel('Total Number of Deaths(in millions)', size = 20)
plt.title('Top 10 Countries with the Highest Number of Deaths', size =20)
plt.show()

# 2019 Predicted vs Actual

In [None]:
# 2019 back-test for KNN: fit one year-only model per cause on pre-2019 rows,
# score it on a 20% hold-out, and compare its 2019 prediction with observed data.
features = ['Year']

train_data = df[df['Year'] < 2019]

X_train, X_test, y_train, y_test = train_test_split(
    train_data[features], train_data[cause_of_deaths], test_size=0.2, random_state=42
)

# Loop-invariant inputs: the single-row 2019 query and the observed 2019 rows.
X_2019 = pd.DataFrame({'Year': [2019]})
observed_2019 = df[df['Year'] == 2019]

dfs_KNN = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = KNeighborsRegressor()
    model.fit(X_train, y_train[target])

    predicted_deaths_2019 = model.predict(X_2019)[0]

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)

    # The model is fitted on rows from every country, so its output is a
    # per-row level. Compare it with the mean over all 2019 rows; taking only
    # the first 2019 row would compare it against a single country.
    actual_deaths_2019 = observed_2019[target].mean()

    dfs_KNN.append({
        'Cause': target,
        'Predicted Deaths': predicted_deaths_2019,
        'Actual Deaths': actual_deaths_2019,
        'Mean Squared Error': mse,
    })

# Build the table once instead of concatenating one-row frames.
results_df_KNN = pd.DataFrame(dfs_KNN)

In [None]:
# Preview the first rows of the KNN 2019 comparison table.
results_df_KNN.head()

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Sum of the 'Actual Deaths' column over all causes.
sum_actual=results_df_KNN['Actual Deaths'].sum()

In [None]:
# Display the summed actual value.
sum_actual

In [None]:
# Sum of the KNN 'Predicted Deaths' column over all causes.
sum_knn=results_df_KNN['Predicted Deaths'].sum()

In [None]:
# Display the summed KNN prediction.
sum_knn

In [None]:
# Emit the KNN results table as LaTeX source.
print(results_df_KNN.to_latex())

In [None]:
# Overlay the KNN predictions (blue) and the actual values (red) per cause.
fig, ax = plt.subplots(figsize=(25, 7))
knn_causes = results_df_KNN['Cause']
for series_name, line_colour in (('Predicted Deaths', 'blue'), ('Actual Deaths', 'red')):
    ax.plot(knn_causes, results_df_KNN[series_name], color=line_colour)
ax.set_xlabel('Cause of Death')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths vs Actual Deaths using KNN', size='20')
ax.legend(['Predicted Deaths', 'Actual Deaths'], fontsize='25')
plt.setp(ax.get_xticklabels(), rotation=70)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 2019 back-test for RandomForest: one year-only model per cause.
# The split is created here (same parameters and seed as the other back-test
# cells) so this cell does not depend on variables left behind by another cell.
features = ['Year']

train_data = df[df['Year'] < 2019]

X_train, X_test, y_train, y_test = train_test_split(
    train_data[features], train_data[cause_of_deaths], test_size=0.2, random_state=42
)

# Loop-invariant inputs: the single-row 2019 query and the observed 2019 rows.
X_2019 = pd.DataFrame({'Year': [2019]})
observed_2019 = df[df['Year'] == 2019]

dfs_RandomForest = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train[target])

    predicted_deaths_2019 = model.predict(X_2019)[0]

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)

    # The model is fitted on rows from every country, so compare its output
    # with the mean over all 2019 rows rather than with the first 2019 row only.
    actual_deaths_2019 = observed_2019[target].mean()

    dfs_RandomForest.append({
        'Cause': target,
        'Predicted Deaths': predicted_deaths_2019,
        'Actual Deaths': actual_deaths_2019,
        'Mean Squared Error': mse,
    })

# Build the table once instead of concatenating one-row frames.
results_df_RandomForest = pd.DataFrame(dfs_RandomForest)


In [None]:
# Preview the first rows of the RandomForest 2019 comparison table.
results_df_RandomForest.head()

In [None]:
# Emit the RandomForest results table as LaTeX source.
print(results_df_RandomForest.to_latex())

In [None]:
# Overlay the RandomForest predictions (blue) and the actual values (red) per cause.
fig, ax = plt.subplots(figsize=(25, 7))
rf_causes = results_df_RandomForest['Cause']
for series_name, line_colour in (('Predicted Deaths', 'blue'), ('Actual Deaths', 'red')):
    ax.plot(rf_causes, results_df_RandomForest[series_name], color=line_colour)
ax.set_xlabel('Cause of Death ')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths vs Actual Deaths using RandomForest', size='20')
ax.legend(['Predicted Deaths', 'Actual Deaths'], fontsize='25')
plt.setp(ax.get_xticklabels(), rotation=70)
plt.show()

In [None]:
# 2019 back-test for linear-kernel SVR: one year-only model per cause, scored
# on a 20% hold-out of the pre-2019 rows and compared with observed 2019 data.
features = ['Year']

train_data = df[df['Year'] < 2019]

X_train, X_test, y_train, y_test = train_test_split(
    train_data[features], train_data[cause_of_deaths], test_size=0.2, random_state=42
)

# Loop-invariant inputs: the single-row 2019 query and the observed 2019 rows.
X_2019 = pd.DataFrame({'Year': [2019]})
observed_2019 = df[df['Year'] == 2019]

dfs_SVR = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = SVR(kernel='linear')
    model.fit(X_train, y_train[target])

    predicted_deaths_2019 = model.predict(X_2019)[0]

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)

    # The model is fitted on rows from every country, so compare its output
    # with the mean over all 2019 rows rather than with the first 2019 row only.
    actual_deaths_2019 = observed_2019[target].mean()

    dfs_SVR.append({
        'Cause': target,
        'Predicted Deaths': predicted_deaths_2019,
        'Actual Deaths': actual_deaths_2019,
        'Mean Squared Error': mse,
    })

# Build the table once instead of concatenating one-row frames.
results_df_SVR = pd.DataFrame(dfs_SVR)

In [None]:
# Preview the first rows of the SVR 2019 comparison table.
results_df_SVR.head()

In [None]:
# Emit the SVR results table as LaTeX source.
print(results_df_SVR.to_latex())

In [None]:
# Overlay the SVR predictions (blue) and the actual values (red) per cause.
fig, ax = plt.subplots(figsize=(25, 7))
svr_causes = results_df_SVR['Cause']
for series_name, line_colour in (('Predicted Deaths', 'blue'), ('Actual Deaths', 'red')):
    ax.plot(svr_causes, results_df_SVR[series_name], color=line_colour)
ax.set_xlabel('Cause of Death ')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths vs Actual Deaths using SVR', size='20')
ax.legend(['Predicted Deaths', 'Actual Deaths'], fontsize='25')
plt.setp(ax.get_xticklabels(), rotation=70)
plt.show()

In [None]:
# 2019 back-test for BaggingRegressor: one year-only model per cause, scored
# on a 20% hold-out of the pre-2019 rows and compared with observed 2019 data.
features = ['Year']

train_data = df[df['Year'] < 2019]

X_train, X_test, y_train, y_test = train_test_split(
    train_data[features], train_data[cause_of_deaths], test_size=0.2, random_state=42
)

# Loop-invariant inputs: the single-row 2019 query and the observed 2019 rows.
X_2019 = pd.DataFrame({'Year': [2019]})
observed_2019 = df[df['Year'] == 2019]

dfs_Bagging = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = BaggingRegressor(random_state=42)
    model.fit(X_train, y_train[target])

    predicted_deaths_2019 = model.predict(X_2019)[0]

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)

    # The model is fitted on rows from every country, so compare its output
    # with the mean over all 2019 rows rather than with the first 2019 row only.
    actual_deaths_2019 = observed_2019[target].mean()

    dfs_Bagging.append({
        'Cause': target,
        'Predicted Deaths': predicted_deaths_2019,
        'Actual Deaths': actual_deaths_2019,
        'Mean Squared Error': mse,
    })

# Build the table once instead of concatenating one-row frames.
results_df_Bagging = pd.DataFrame(dfs_Bagging)

In [None]:
# Preview the first rows of the Bagging 2019 comparison table.
results_df_Bagging.head()

In [None]:
# Emit the Bagging results table as LaTeX source.
print(results_df_Bagging.to_latex())

In [None]:
# Overlay the Bagging predictions (blue) and the actual values (red) per cause.
fig, ax = plt.subplots(figsize=(25, 7))
bagging_causes = results_df_Bagging['Cause']
for series_name, line_colour in (('Predicted Deaths', 'blue'), ('Actual Deaths', 'red')):
    ax.plot(bagging_causes, results_df_Bagging[series_name], color=line_colour)
ax.set_xlabel('Cause of Death ')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths vs Actual Deaths using Bagging', size='20')
ax.legend(['Predicted Deaths', 'Actual Deaths'], fontsize='25')
plt.setp(ax.get_xticklabels(), rotation=70)
plt.show()

In [None]:
# 2019 back-test for Ridge regression: one year-only model per cause, scored
# on a 20% hold-out of the pre-2019 rows and compared with observed 2019 data.
features = ['Year']

train_data = df[df['Year'] < 2019]

X_train, X_test, y_train, y_test = train_test_split(
    train_data[features], train_data[cause_of_deaths], test_size=0.2, random_state=42
)

# Loop-invariant inputs: the single-row 2019 query and the observed 2019 rows.
X_2019 = pd.DataFrame({'Year': [2019]})
observed_2019 = df[df['Year'] == 2019]

dfs_Ridge = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train[target])

    predicted_deaths_2019 = model.predict(X_2019)[0]

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)

    # The model is fitted on rows from every country, so compare its output
    # with the mean over all 2019 rows rather than with the first 2019 row only.
    actual_deaths_2019 = observed_2019[target].mean()

    dfs_Ridge.append({
        'Cause': target,
        'Predicted Deaths': predicted_deaths_2019,
        'Actual Deaths': actual_deaths_2019,
        'Mean Squared Error': mse,
    })

# Build the table once instead of concatenating one-row frames.
results_df_Ridge = pd.DataFrame(dfs_Ridge)

In [None]:
# Preview the first rows of the Ridge 2019 comparison table.
results_df_Ridge.head()

In [None]:
# Emit the Ridge results table as LaTeX source.
print(results_df_Ridge.to_latex())

In [None]:
# Overlay the Ridge predictions (blue) and the actual values (red) per cause.
fig, ax = plt.subplots(figsize=(25, 7))
ridge_causes = results_df_Ridge['Cause']
for series_name, line_colour in (('Predicted Deaths', 'blue'), ('Actual Deaths', 'red')):
    ax.plot(ridge_causes, results_df_Ridge[series_name], color=line_colour)
ax.set_xlabel('Cause of Death ')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths vs Actual Deaths using Ridge', size='20')
ax.legend(['Predicted Deaths', 'Actual Deaths'], fontsize='25')
plt.setp(ax.get_xticklabels(), rotation=70)
plt.show()

In [None]:
# 2019 back-test for Lasso regression: one year-only model per cause, scored
# on a 20% hold-out of the pre-2019 rows and compared with observed 2019 data.
features = ['Year']

train_data = df[df['Year'] < 2019]

X_train, X_test, y_train, y_test = train_test_split(
    train_data[features], train_data[cause_of_deaths], test_size=0.2, random_state=42
)

# Loop-invariant inputs: the single-row 2019 query and the observed 2019 rows.
X_2019 = pd.DataFrame({'Year': [2019]})
observed_2019 = df[df['Year'] == 2019]

dfs_Lasso = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = Lasso(alpha=1.0)
    model.fit(X_train, y_train[target])

    predicted_deaths_2019 = model.predict(X_2019)[0]

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)

    # The model is fitted on rows from every country, so compare its output
    # with the mean over all 2019 rows rather than with the first 2019 row only.
    actual_deaths_2019 = observed_2019[target].mean()

    dfs_Lasso.append({
        'Cause': target,
        'Predicted Deaths': predicted_deaths_2019,
        'Actual Deaths': actual_deaths_2019,
        'Mean Squared Error': mse,
    })

# Build the table once instead of concatenating one-row frames.
results_df_Lasso = pd.DataFrame(dfs_Lasso)

In [None]:
# Preview the first rows of the Lasso 2019 comparison table.
results_df_Lasso.head()

In [None]:
# Emit the Lasso results table as LaTeX source.
print(results_df_Lasso.to_latex())

In [None]:
# Overlay the Lasso predictions (blue) and the actual values (red) per cause.
fig, ax = plt.subplots(figsize=(25, 7))
lasso_causes = results_df_Lasso['Cause']
for series_name, line_colour in (('Predicted Deaths', 'blue'), ('Actual Deaths', 'red')):
    ax.plot(lasso_causes, results_df_Lasso[series_name], color=line_colour)
ax.set_xlabel('Cause of Death ')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths vs Actual Deaths using Lasso', size='20')
ax.legend(['Predicted Deaths', 'Actual Deaths'], fontsize='25')
plt.setp(ax.get_xticklabels(), rotation=70)
plt.show()

# 2020 prediction for each cause


In [None]:
# Fit one year-only KNN model per cause and predict the 2020 value.
features = ['Year']

# Train on every year up to and including 2019. The original `< 2019` filter
# dropped the most recent year, unlike the other 2020 prediction cells.
train_data = df[df['Year'] <= 2019]

# The query row is the same for every cause, so build it once.
X_2020 = pd.DataFrame({'Year': [2020]})

dfs1_KNN = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = KNeighborsRegressor()
    model.fit(train_data[features], train_data[target])

    predicted_deaths_2020 = model.predict(X_2020)[0]

    # Records are collected as dicts so the loop does not overwrite the
    # results_df_KNN table produced by the 2019 back-test.
    dfs1_KNN.append({
        'Cause': target,
        '2020 pridiction using KNN': predicted_deaths_2020,
    })

results_df1_KNN = pd.DataFrame(dfs1_KNN)

In [None]:
# Show the per-cause 2020 KNN predictions as plain text.
print(results_df1_KNN)

In [None]:
# Bar chart of the KNN 2020 prediction for each cause.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    results_df1_KNN['Cause'],
    results_df1_KNN['2020 pridiction using KNN'],
    label='Predicted Deaths 2020',
)
ax.set_xlabel('Cause of Death')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths for Each Cause in 2020 using KNN')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Fit one year-only RandomForest model per cause on all rows up to 2019 and
# predict the 2020 value.
features = ['Year']

train_data = df[df['Year'] <= 2019]

# The query row is the same for every cause, so build it once.
X_2020 = pd.DataFrame({'Year': [2020]})

dfs1_RandomForest = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = RandomForestRegressor(random_state=42)
    model.fit(train_data[features], train_data[target])

    predicted_deaths_2020 = model.predict(X_2020)[0]

    # Records are collected as dicts so the loop does not overwrite the
    # results_df_RandomForest table produced by the 2019 back-test.
    dfs1_RandomForest.append({
        'Cause': target,
        '2020 pridiction using RandomForest': predicted_deaths_2020,
    })

results_df1_RandomForest = pd.DataFrame(dfs1_RandomForest)

In [None]:
# Show the per-cause 2020 RandomForest predictions as plain text.
print(results_df1_RandomForest)

In [None]:
# Bar chart of the RandomForest 2020 prediction for each cause.
# The column key must match the one written by the prediction cell
# ('2020 pridiction using RandomForest'); the shorter key used before
# ('2020 using RandomForest') does not exist and raised a KeyError.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    results_df1_RandomForest['Cause'],
    results_df1_RandomForest['2020 pridiction using RandomForest'],
    label='Predicted Deaths 2020',
)
ax.set_xlabel('Cause of Death')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths for Each Cause in 2020 using Random Forest')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Fit one year-only linear-kernel SVR per cause on all rows up to 2019 and
# predict the 2020 value.
features = ['Year']

train_data = df[df['Year'] <= 2019]

# The query row is the same for every cause, so build it once.
X_2020 = pd.DataFrame({'Year': [2020]})

dfs1_SVR = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = SVR(kernel='linear')
    model.fit(train_data[features], train_data[target])

    predicted_deaths_2020 = model.predict(X_2020)[0]

    # Records are collected as dicts so the loop does not overwrite the
    # results_df_SVR table produced by the 2019 back-test.
    dfs1_SVR.append({
        'Cause': target,
        '2020 pridiction using SVR': predicted_deaths_2020,
    })

results_df1_SVR = pd.DataFrame(dfs1_SVR)

In [None]:
# Show the per-cause 2020 SVR predictions as plain text.
print(results_df1_SVR)

In [None]:
# Bar chart of the SVR 2020 prediction for each cause.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    results_df1_SVR['Cause'],
    results_df1_SVR['2020 pridiction using SVR'],
    label='Predicted Deaths 2020',
)
ax.set_xlabel('Cause of Death')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths for Each Cause in 2020 using SVR')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Fit one year-only BaggingRegressor per cause on all rows up to 2019 and
# predict the 2020 value.
features = ['Year']

train_data = df[df['Year'] <= 2019]

# The query row is the same for every cause, so build it once.
X_2020 = pd.DataFrame({'Year': [2020]})

dfs1_Bagging = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = BaggingRegressor(random_state=42)
    model.fit(train_data[features], train_data[target])

    predicted_deaths_2020 = model.predict(X_2020)[0]

    # Records are collected as dicts so the loop does not overwrite the
    # results_df_Bagging table produced by the 2019 back-test.
    dfs1_Bagging.append({
        'Cause': target,
        '2020 pridiction using Bagging': predicted_deaths_2020,
    })

results_df1_Bagging = pd.DataFrame(dfs1_Bagging)

In [None]:
# Show the per-cause 2020 Bagging predictions as plain text.
print(results_df1_Bagging)

In [None]:
# Bar chart of the Bagging 2020 prediction for each cause.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    results_df1_Bagging['Cause'],
    results_df1_Bagging['2020 pridiction using Bagging'],
    label='Predicted Deaths 2020',
)
ax.set_xlabel('Cause of Death')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths for Each Cause in 2020 using Bagging')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Fit one year-only Ridge model per cause on all rows up to 2019 and predict
# the 2020 value.
features = ['Year']

train_data = df[df['Year'] <= 2019]

# The query row is the same for every cause, so build it once.
X_2020 = pd.DataFrame({'Year': [2020]})

dfs1_Ridge = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = Ridge(alpha=1.0)
    model.fit(train_data[features], train_data[target])

    predicted_deaths_2020 = model.predict(X_2020)[0]

    # Records are collected as dicts so the loop does not overwrite the
    # results_df_Ridge table produced by the 2019 back-test.
    dfs1_Ridge.append({
        'Cause': target,
        '2020 pridiction using Ridge': predicted_deaths_2020,
    })

results_df1_Ridge = pd.DataFrame(dfs1_Ridge)

In [None]:
# Show the per-cause 2020 Ridge predictions as plain text.
print(results_df1_Ridge)

In [None]:
# Bar chart of the Ridge 2020 prediction for each cause.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    results_df1_Ridge['Cause'],
    results_df1_Ridge['2020 pridiction using Ridge'],
    label='Predicted Deaths 2020',
)
ax.set_xlabel('Cause of Death')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths for Each Cause in 2020 using Ridge')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Fit one year-only Lasso model per cause on all rows up to 2019 and predict
# the 2020 value.
features = ['Year']

train_data = df[df['Year'] <= 2019]

# The query row is the same for every cause, so build it once.
X_2020 = pd.DataFrame({'Year': [2020]})

dfs1_Lasso = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = Lasso(alpha=1.0)
    model.fit(train_data[features], train_data[target])

    predicted_deaths_2020 = model.predict(X_2020)[0]

    # Records are collected as dicts so the loop does not overwrite the
    # results_df_Lasso table produced by the 2019 back-test.
    dfs1_Lasso.append({
        'Cause': target,
        '2020 pridiction using Lasso': predicted_deaths_2020,
    })

results_df1_Lasso = pd.DataFrame(dfs1_Lasso)

In [None]:
# Show the per-cause 2020 Lasso predictions as plain text.
print(results_df1_Lasso)

In [None]:
# Bar chart of the Lasso 2020 prediction for each cause.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    results_df1_Lasso['Cause'],
    results_df1_Lasso['2020 pridiction using Lasso'],
    label='Predicted Deaths 2020',
)
ax.set_xlabel('Cause of Death')
ax.set_ylabel('Number of Deaths')
ax.set_title('Predicted Deaths for Each Cause in 2020 using Lasso')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Place the six per-model 2020 tables side by side.
# NOTE(review): every input table has its own 'Cause' column, so this concat
# repeats 'Cause' once per model; the following cell reassigns
# combined_results_df from scratch, so this result is not kept.
all_results_dfs = [results_df1_KNN, results_df1_RandomForest, results_df1_SVR, results_df1_Bagging, results_df1_Ridge, results_df1_Lasso]

combined_results_df = pd.concat(all_results_dfs, axis=1)

In [None]:
# One table with a single 'Cause' column followed by one 2020 prediction
# column per model, in the order KNN, RandomForest, SVR, Bagging, Ridge, Lasso.
combined_results_df = pd.DataFrame({'Cause': results_df1_KNN['Cause']})

per_model_tables = (
    ('KNN', results_df1_KNN),
    ('RandomForest', results_df1_RandomForest),
    ('SVR', results_df1_SVR),
    ('Bagging', results_df1_Bagging),
    ('Ridge', results_df1_Ridge),
    ('Lasso', results_df1_Lasso),
)
for model_label, model_table in per_model_tables:
    prediction_column = f'2020 pridiction using {model_label}'
    combined_results_df[prediction_column] = model_table[prediction_column]

print(combined_results_df)

In [None]:
# Write the combined table to combined_results.csv without the index column.
combined_results_df.to_csv('combined_results.csv', index=False)

In [None]:
# Read the combined table back from the CSV written in the previous cell.
df2=pd.read_csv('combined_results.csv',sep=",")

In [None]:
# Display the reloaded combined predictions table.
df2

In [None]:
# Grouped bar chart: for every cause, one bar per model with its 2020 prediction.
plt.figure(figsize=(10,7))
models = ['KNN', 'RandomForest', 'SVR', 'Bagging', 'Ridge', 'Lasso']

bar_width = 0.15
bar_positions = np.arange(len(df2['Cause']))

# The loop variable is named model_name so it does not overwrite the `model`
# estimator variable used by the fitting cells.
for i, model_name in enumerate(models):
    column_name = f'2020 pridiction using {model_name}'
    plt.bar(bar_positions + i * bar_width, df2[column_name], label=model_name, width=bar_width)

plt.xlabel('Cause of Death')
plt.ylabel('Number of Deaths')
plt.title('Predicted Deaths for Each Cause in 2020 from Different Models')
# Bars in a group are centred at offsets 0 .. (len(models) - 1) * bar_width, so
# the middle of the group is (len(models) - 1) / 2 bar widths from its first bar.
plt.xticks(bar_positions + ((len(models) - 1) / 2) * bar_width, df2['Cause'], rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Emit the combined predictions table as LaTeX source, without the index column.
latex_table = df2.to_latex(index=False)
print(latex_table)


# Total deaths in 2020 prediction



In [None]:
# 2019 back-test for Ridge regression (this cell repeats the Ridge back-test
# from the "2019 Predicted vs Actual" section): one year-only model per cause,
# scored on a 20% hold-out of the pre-2019 rows.
features = ['Year']

train_data = df[df['Year'] < 2019]

X_train, X_test, y_train, y_test = train_test_split(
    train_data[features], train_data[cause_of_deaths], test_size=0.2, random_state=42
)

# Loop-invariant inputs: the single-row 2019 query and the observed 2019 rows.
X_2019 = pd.DataFrame({'Year': [2019]})
observed_2019 = df[df['Year'] == 2019]

dfs_Ridge = []  # one result record (dict) per cause

for target in cause_of_deaths:
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train[target])

    predicted_deaths_2019 = model.predict(X_2019)[0]

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test[target], y_pred)

    # The model is fitted on rows from every country, so compare its output
    # with the mean over all 2019 rows rather than with the first 2019 row only.
    actual_deaths_2019 = observed_2019[target].mean()

    dfs_Ridge.append({
        'Cause': target,
        'Predicted Deaths': predicted_deaths_2019,
        'Actual Deaths': actual_deaths_2019,
        'Mean Squared Error': mse,
    })

# Build the table once instead of concatenating one-row frames.
results_df_Ridge = pd.DataFrame(dfs_Ridge)

In [None]:
# Preview the first rows of the Ridge 2019 comparison table.
results_df_Ridge.head()

In [None]:
# Line chart of Total_no_of_Deaths summed over all rows of each year.
total_deaths_by_year = df.groupby('Year')['Total_no_of_Deaths'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
total_deaths_by_year.plot(ax=ax, marker='o')
ax.set_title('Total Number of Deaths Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('Total Number of Deaths')
plt.show()

In [None]:
# Least-squares trend line through the yearly totals, plotted over the data.
# slope and intercept are reused by the 2020 trend extrapolation cell.
time_series_data = (
    df[['Year', 'Total_no_of_Deaths']]
    .groupby('Year')['Total_no_of_Deaths']
    .sum()
    .reset_index()
)
trend_years = time_series_data['Year']
trend_totals = time_series_data['Total_no_of_Deaths']

slope, intercept, r_value, p_value, std_err = linregress(trend_years, trend_totals)

trend_line = intercept + slope * trend_years

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(trend_years, trend_totals, label='Actual Data')
ax.plot(trend_years, trend_line, color='red', label='Trend Line')
ax.set_title('Time Series Trend Analysis')
ax.set_xlabel('Year')
ax.set_ylabel('Total Number of Deaths')
ax.legend()
plt.show()

print(f'Trend Slope: {slope}')
print(f'R-squared: {r_value**2}')

In [None]:
# Extrapolate the fitted trend line (intercept and slope from the linregress
# cell) to the year 2020.
year_2020_prediction_trend_analysis = intercept + slope * 2020
print(f'Predicted number of deaths for 2020 using trend analysis: {year_2020_prediction_trend_analysis}')

In [None]:
# Yearly totals as a Series indexed by Year (no reset_index here, so the
# index holds the year values used below to label the forecast).
time_series_data = df[['Year', 'Total_no_of_Deaths']]
time_series_data = time_series_data.groupby('Year')['Total_no_of_Deaths'].sum()

# Augmented Dickey-Fuller test on the yearly totals; the first two elements of
# the result are printed as the test statistic and the p-value.
result_adf = adfuller(time_series_data)
print(f'ADF Statistic: {result_adf[0]}')
print(f'p-value: {result_adf[1]}')

# SARIMAX with order (1, 1, 1) and no seasonal order passed in.
order = (1, 1, 1)
model = SARIMAX(time_series_data, order=order, enforce_stationarity=False, enforce_invertibility=False)
results = model.fit(disp=False)

print(results.summary())

forecast_steps = 1  # Set this to 1 to predict 2020
forecast = results.get_forecast(steps=forecast_steps)
# Label the forecast with the years following the last observed year.
forecast_index = pd.RangeIndex(start=time_series_data.index[-1] + 1, stop=time_series_data.index[-1] + forecast_steps + 1)
forecast_series = pd.Series(forecast.predicted_mean.values, index=forecast_index)

# Observed series, in-sample fitted values and the forecast on one chart.
plt.figure(figsize=(12, 6))
plt.plot(time_series_data, label='Original Time Series')
plt.plot(results.fittedvalues, color='red', label='Fitted Values')
plt.plot(forecast_series, color='green', linestyle='dashed', label='Forecast')
plt.title('ARIMA Model Forecast')
plt.xlabel('Year')
plt.ylabel('Total Number of Deaths')
plt.legend()
plt.show()

In [None]:
# The first (and, with forecast_steps = 1, only) forecast value.
print(f'Predicted number of deaths for 2020 using ARIMA: {forecast_series.iloc[0]}')

In [None]:
# Random forest on the yearly totals with Year as the only feature: fit on a
# random 80% of the years, then score and plot the remaining 20%.
time_series_data = (
    df[['Year', 'Total_no_of_Deaths']]
    .groupby('Year')['Total_no_of_Deaths']
    .sum()
    .reset_index()
)

X, y = time_series_data[['Year']], time_series_data['Total_no_of_Deaths']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

predictions = rf_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.scatter(X_test, predictions, color='red', label='Predicted')
ax.set_title('Random Forest Regression: Actual vs Predicted')
ax.set_xlabel('Year')
ax.set_ylabel('Total Number of Deaths')
ax.legend()
plt.show()

In [None]:
# Predict the 2020 total with the random forest fitted on the yearly totals.
# rf_model was fitted on a DataFrame with a 'Year' column, so the query is
# passed as a DataFrame with the same column name instead of a bare nested
# list (which scikit-learn flags as lacking valid feature names).
X_2020 = pd.DataFrame({'Year': [2020]})
year_2020_prediction_random_forest = rf_model.predict(X_2020)
print(f'Predicted number of deaths for 2020 using random forest: {year_2020_prediction_random_forest[0]}')

In [None]:
# Linear-kernel SVR on the yearly totals with Year as the only feature: fit on
# a random 80% of the years, then score and plot the remaining 20%.
time_series_data = (
    df[['Year', 'Total_no_of_Deaths']]
    .groupby('Year')['Total_no_of_Deaths']
    .sum()
    .reset_index()
)

X, y = time_series_data[['Year']], time_series_data['Total_no_of_Deaths']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svr_model = SVR(kernel='linear').fit(X_train, y_train)
predictions = svr_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.scatter(X_test, predictions, color='red', label='Predicted')
ax.set_title('SVR Regression: Actual vs Predicted')
ax.set_xlabel('Year')
ax.set_ylabel('Total Number of Deaths')
ax.legend()
plt.show()

In [None]:
# Side-by-side summary of the 2020 total-death predictions computed above.
print(f'Predicted number of deaths for 2020 using trend analysis: {year_2020_prediction_trend_analysis}')
print(f'Predicted number of deaths for 2020 using ARIMA: {forecast_series.iloc[0]}')
print(f'Predicted number of deaths for 2020 using random forest: {year_2020_prediction_random_forest[0]}')
# NOTE(review): the SVR line stays disabled; no cell visible in this notebook
# assigns year_2020_prediction_using_SVR.
#print(f'Predicted number of deaths for 2020 using SVR: {year_2020_prediction_using_SVR[0]}')

# 2025

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Multi-output linear regression: predict every cause-of-death column.
#
# Features and targets must be different columns. Before, X and y were the
# same frame (df without 'Country/Territory', 'Code', 'Year'), so the model
# only had to copy its inputs and every metric was trivially perfect.
# 'Year' is used as the single feature, matching the other models in this
# notebook, and the targets are exactly the cause_of_deaths columns (the
# derived Total_no_of_Deaths column is not a target).
X = df[['Year']]
y = df[cause_of_deaths]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Linear Regression model
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# Make predictions
y_pred_lr = model_lr.predict(X_test)

# Evaluate the model (metrics are averaged over all target columns)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression:")
print(f"Mean Absolute Error: {mae_lr:.2f}")
print(f"Root Mean Squared Error: {rmse_lr:.2f}")
print(f"R-squared: {r2_lr:.2f}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the X_train / y_train split already in the kernel
# namespace; fit, predict on X_test and report MAE, RMSE and R-squared.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_rf = model_rf.predict(X_test)

# Error metrics on the held-out rows
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Regression:")
print(f"Mean Absolute Error: {mae_rf:.2f}")
print(f"Root Mean Squared Error: {rmse_rf:.2f}")
print(f"R-squared: {r2_rf:.2f}")

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Feed-forward network on the X_train / y_train split already in the kernel
# namespace.

# The output layer needs one unit per target column. A single unit against a
# multi-column y_train is broadcast by the loss instead of raising, so the
# network would silently be trained against the wrong objective.
n_outputs = 1 if y_train.ndim == 1 else y_train.shape[1]

# Define the model architecture
model_ann = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(n_outputs, activation='linear')  # Linear activation for regression
])

# Compile the model
model_ann.compile(optimizer='adam',
                  loss='mean_squared_error',
                  metrics=['mean_absolute_error', 'mean_squared_error'])

# Train the model
history = model_ann.fit(X_train, y_train, epochs=1000, batch_size=32, validation_split=0.2)

# Evaluate the model. evaluate() returns the loss first, followed by the
# metrics in the order given to compile(): [loss (MSE), MAE, MSE].
# The earlier unpacking reported the loss as MAE and the raw MSE as RMSE.
loss_ann, mae_ann, mse_ann = model_ann.evaluate(X_test, y_test)
rmse_ann = np.sqrt(mse_ann)

print("Artificial Neural Network (ANN):")
print(f"Mean Absolute Error: {mae_ann:.2f}")
print(f"Root Mean Squared Error: {rmse_ann:.2f}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor

# Compare three regressors that predict Total_no_of_Deaths from the remaining
# columns, then predict with the one that has the lowest test MSE.

# Splitting features and target
# NOTE(review): Total_no_of_Deaths was built earlier as the row-wise sum of the
# cause columns, and those columns are the features here, so the target is an
# exact function of the inputs. The scores below measure how well each model
# reproduces that sum, not forecasting skill.
X = df.drop(columns=['Country/Territory', 'Code', 'Year', 'Total_no_of_Deaths'])
y = df['Total_no_of_Deaths']

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: Standardize features
numeric_features = X.columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])


def _scaled_pipeline(regressor):
    """Return a pipeline that applies `preprocessor` and then fits `regressor`."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])


# Train Random Forest Regressor
rf_regressor = _scaled_pipeline(RandomForestRegressor(random_state=42))
rf_regressor.fit(X_train, y_train)
rf_predictions = rf_regressor.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)

# Train Linear Regression
linear_regressor = _scaled_pipeline(LinearRegression())
linear_regressor.fit(X_train, y_train)
linear_predictions = linear_regressor.predict(X_test)
linear_mse = mean_squared_error(y_test, linear_predictions)

# Train Neural Network
mlp_regressor = _scaled_pipeline(
    MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
)
mlp_regressor.fit(X_train, y_train)
mlp_predictions = mlp_regressor.predict(X_test)
mlp_mse = mean_squared_error(y_test, mlp_predictions)

print("Random Forest MSE:", rf_mse)
print("Linear Regression MSE:", linear_mse)
print("Neural Network MSE:", mlp_mse)

# Pick the model with the lowest test MSE instead of hard-coding the choice.
candidate_models = {
    'Random Forest': (rf_regressor, rf_mse),
    'Linear Regression': (linear_regressor, linear_mse),
    'Neural Network': (mlp_regressor, mlp_mse),
}
best_model_name = min(candidate_models, key=lambda name: candidate_models[name][1])
best_model = candidate_models[best_model_name][0]
print("Best model (lowest test MSE):", best_model_name)

# NOTE(review): X_2025 is the historical feature matrix, so the values printed
# below are predictions for the existing rows, not a forecast for 2025. A real
# 2025 forecast needs projected 2025 feature values as input.
X_2025 = X  # Assuming you want to predict for the entire dataset
predictions_2025 = best_model.predict(X_2025)

# Print predictions for 2025
print("Predictions for 2025:")
print(predictions_2025)