In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Loading the datasets
usa_county_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\usa_county_wise.csv")
worldometer_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\worldometer_data.csv")
country_wise_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\country_wise_latest.csv")
covid_clean_complete_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\covid_19_clean_complete.csv")
day_wise_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\day_wise.csv")
full_grouped_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\full_grouped.csv")

# Displaying the first few rows of each dataset
print("USA County Data:\n", usa_county_data.head())
print("Worldometer Data:\n", worldometer_data.head())
print("Country-Wise Data:\n", country_wise_data.head())
print("COVID Clean Complete Data:\n", covid_clean_complete_data.head())
print("Day-Wise Data:\n", day_wise_data.head())
print("Full Grouped Data:\n", full_grouped_data.head())

# Data Cleaning
# Checking for missing values
print("Missing values in datasets:")
print(covid_clean_complete_data.isnull().sum())

# Fill or drop missing values (example: dropping rows with missing values in 'Province/State')
covid_clean_complete_data = covid_clean_complete_data.dropna(subset=['Province/State'])

# Feature Engineering
# Creating a new column for Growth Rate
covid_clean_complete_data['Growth_Rate'] = (
    covid_clean_complete_data['Confirmed'] - covid_clean_complete_data['Deaths'] - covid_clean_complete_data['Recovered']
)

# Adding a mortality rate column
covid_clean_complete_data['Mortality Rate'] = (
    covid_clean_complete_data['Deaths'] / covid_clean_complete_data['Confirmed']
) * 100

# Adding a recovery rate column
covid_clean_complete_data['Recovery Rate'] = (
    covid_clean_complete_data['Recovered'] / covid_clean_complete_data['Confirmed']
) * 100

# EDA: Analyzing global trends
plt.figure(figsize=(10, 6))
sns.lineplot(data=day_wise_data, x='Date', y='Confirmed', label='Confirmed Cases')
sns.lineplot(data=day_wise_data, x='Date', y='Deaths', label='Deaths')
sns.lineplot(data=day_wise_data, x='Date', y='Recovered', label='Recovered')
plt.title("Global COVID-19 Trends Over Time")
plt.xlabel("Date")
plt.ylabel("Number of Cases")
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# EDA: Distribution of Mortality Rate
plt.figure(figsize=(8, 6))
sns.histplot(covid_clean_complete_data['Mortality Rate'], bins=30, kde=True, color='red')
plt.title("Distribution of Mortality Rate")
plt.xlabel("Mortality Rate (%)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Data Preparation for Modeling
# Selecting features and target variable
features = ['Confirmed', 'Deaths', 'Recovered', 'Active']
target = 'Mortality Rate'

In [None]:
# Splitting into training and test sets
X = covid_clean_complete_data[features].fillna(0)  # Fill missing values
y = covid_clean_complete_data[target].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model Development: Random Forest for Mortality Rate Prediction
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = model.predict(X_test)

In [None]:
# Model Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

In [None]:
# Visualization of Predictions vs Actual
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
plt.title("Actual vs Predicted Mortality Rate")
plt.xlabel("Actual Mortality Rate")
plt.ylabel("Predicted Mortality Rate")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the dataset
covid_clean_complete_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\covid_19_clean_complete.csv")

In [None]:
# Data Cleaning and Feature Engineering
covid_clean_complete_data['Confirmed'] = covid_clean_complete_data['Confirmed'].fillna(0)
covid_clean_complete_data['Deaths'] = covid_clean_complete_data['Deaths'].fillna(0)
covid_clean_complete_data['Recovered'] = covid_clean_complete_data['Recovered'].fillna(0)
covid_clean_complete_data['Active'] = (
    covid_clean_complete_data['Confirmed'] - covid_clean_complete_data['Deaths'] - covid_clean_complete_data['Recovered']
)
covid_clean_complete_data['Mortality Rate'] = (
    covid_clean_complete_data['Deaths'] / covid_clean_complete_data['Confirmed']
) * 100

In [None]:
# Define binary classification target (High Mortality)
threshold = 2  # Set threshold for high mortality rate
covid_clean_complete_data['High Mortality'] = (
    covid_clean_complete_data['Mortality Rate'] > threshold
).astype(int)

In [None]:
# Select features and target
features = ['Confirmed', 'Deaths', 'Recovered', 'Active']
X = covid_clean_complete_data[features].fillna(0)
y = covid_clean_complete_data['High Mortality']

In [None]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build and train the classification model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = clf.predict(X_test)

In [None]:
# Generate the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.2f}")

In [None]:
# Convert 'Date' column to datetime if not already
day_wise_data['Date'] = pd.to_datetime(day_wise_data['Date'])

In [None]:
# Filter data for confirmed cases over time
confirmed_data = day_wise_data[['Date', 'Confirmed']].set_index('Date')

In [None]:
# Handle missing data (interpolation method can be used here if necessary)
confirmed_data = confirmed_data.ffill()  # Use forward fill

In [None]:
# Set the frequency explicitly to daily
confirmed_data = confirmed_data.asfreq('D')

In [None]:
# Split into training and test data (e.g., using the last 30 days for testing)
train = confirmed_data[:-30]
test = confirmed_data[-30:]

In [None]:
# Fit ARIMA model
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import math
arima_model = ARIMA(train, order=(5, 1, 0))  # You can tune the order (p, d, q) based on ACF/PACF analysis
arima_model_fit = arima_model.fit()

In [None]:
# Predict future cases
forecast = arima_model_fit.forecast(steps=30)

In [None]:
# Visualize the forecast
plt.figure(figsize=(10, 6))
plt.plot(train, label="Training Data")
plt.plot(test, label="Test Data")
plt.plot(pd.date_range(start=test.index[0], periods=30, freq='D'), forecast, label="Forecast", color='red')
plt.title("ARIMA Forecast for COVID-19 Cases")
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
plt.legend()
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Plot ACF and PACF for determining p and q
plt.figure(figsize=(12, 6))
plt.subplot(121)
plot_acf(confirmed_data['Confirmed'], ax=plt.gca())
plt.subplot(122)
plot_pacf(confirmed_data['Confirmed'], ax=plt.gca())
plt.show()

In [None]:
# Analyze the impact of Active Cases on Mortality Rate
plt.figure(figsize=(10, 6))
sns.scatterplot(data=covid_clean_complete_data, x='Active', y='Mortality Rate', alpha=0.6)
plt.title("Impact of Active Cases on Mortality Rate")
plt.xlabel("Active Cases")
plt.ylabel("Mortality Rate (%)")
plt.tight_layout()
plt.show()

In [None]:
# Load datasets
worldometer_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\worldometer_data.csv")
country_wise_data = pd.read_csv(r"C:\Users\Welcome Sir\Downloads\archive\country_wise_latest.csv")

In [None]:
# Upward Trends Analysis
# Calculate growth rate: New Cases / Total Cases
country_wise_data['Growth_Rate'] = (
    country_wise_data['New cases'] / (country_wise_data['Confirmed'] + 1)
)

# Select top 10 countries with highest growth rates
upward_trends = country_wise_data[['Country/Region', 'Growth_Rate']].sort_values(
    by='Growth_Rate', ascending=False
).head(10)

print("Top 10 Countries with Upward Trends:\n", upward_trends)

In [None]:
# Plot upward trends
plt.figure(figsize=(10, 6))
sns.barplot(data=upward_trends, x='Growth_Rate', y='Country/Region', palette='viridis')
plt.title("Top 10 Countries with Upward Trends")
plt.xlabel("Growth Rate")
plt.ylabel("Country")
plt.tight_layout()
plt.show()

In [None]:
# Top 10 Countries by Mortality Rate
country_wise_data['Mortality_Rate'] = (
    country_wise_data['Deaths'] / (country_wise_data['Confirmed'] + 1) * 100
)

top_mortality = country_wise_data[['Country/Region', 'Mortality_Rate']].sort_values(
    by='Mortality_Rate', ascending=False
).head(10)

print("Top 10 Countries by Mortality Rate:\n", top_mortality)


In [None]:
# Plot mortality rates
plt.figure(figsize=(10, 6))
sns.barplot(data=top_mortality, x='Mortality_Rate', y='Country/Region', palette='Reds')
plt.title("Top 10 Countries by Mortality Rate")
plt.xlabel("Mortality Rate (%)")
plt.ylabel("Country")
plt.tight_layout()
plt.show()

In [None]:
# Dense Population Analysis
# Use the correct column name for population density
dense_population = worldometer_data[['Country/Region', 'Population', 'Tot Cases/1M pop', 'Deaths/1M pop']]

# Sort by population in descending order and select the top 10
dense_population = dense_population.sort_values(by='Population', ascending=False).head(10)

print("Top 10 Densely Populated Countries:\n", dense_population)

# Plot population vs total cases
plt.figure(figsize=(10, 6))
sns.scatterplot(data=dense_population, x='Population', y='Tot Cases/1M pop', size='Deaths/1M pop', hue='Country/Region', sizes=(50, 300))
plt.title("Densely Populated Countries and COVID-19 Cases")
plt.xlabel("Population")
plt.ylabel("Tot Cases/1M pop")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Save the model and outputs
import joblib
joblib.dump(model, 'covid_mortality_model.pkl')

In [None]:
# Final Report Outline
report = """
COVID-19 Predictive Modelling
=============================
- RMSE: {:.2f}
- R²: {:.2f}
- Key Insights: 
    1 ARIMA model predicts an upward trend in cases in specific countries.
    2 Top 10 countries with the highest mortality rates include...
    3 Mortality rates are significantly influenced by population density and healthcare availability.
Recommendations:
    - Focus on reducing active cases through early interventions.
    - Allocate healthcare resources to regions with high mortality rates.
    - Use predictive analytics to anticipate future outbreaks and allocate resources effectively.
""".format(rmse, r2)

with open("final_report.txt", "w") as f:
    f.write(report)

In [None]:
# Adding more insights to the report
additional_insights = """
Additional Insights:
- ARIMA model predicts an upward trend in cases in specific countries.
- Top 10 countries with the highest mortality rates include...
- Mortality rates are significantly influenced by population density and healthcare availability.

Geographic Findings:
- Regions with dense populations showed higher case numbers but mixed recovery rates.
- Developed countries tended to have lower mortality rates despite higher confirmed cases.

"""
report += additional_insights

with open(report_path, "w") as file:
    file.write(report)

In [None]:
# Save the report to a file
report_path = "final_report.txt"
with open(report_path, "w") as file:
    file.write(report)

print(f"Report saved successfully at: {report_path}")

In [None]:
print(worldometer_data.columns)

In [79]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Directory to save the images
output_dir = "visualizations"
os.makedirs(output_dir, exist_ok=True)

# Define a function to save all visuals
def save_all_visuals():
    # Plot 1: Global COVID-19 Trends Over Time
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=day_wise_data, x='Date', y='Confirmed', label='Confirmed Cases')
    sns.lineplot(data=day_wise_data, x='Date', y='Deaths', label='Deaths')
    sns.lineplot(data=day_wise_data, x='Date', y='Recovered', label='Recovered')
    plt.title("Global COVID-19 Trends Over Time")
    plt.xlabel("Date")
    plt.ylabel("Number of Cases")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/global_covid_trends.png", dpi=300)
    plt.close()  # Close the figure after saving

    # Plot 2: Distribution of Mortality Rate
    plt.figure(figsize=(8, 6))
    sns.histplot(covid_clean_complete_data['Mortality Rate'], bins=30, kde=True, color='red')
    plt.title("Distribution of Mortality Rate")
    plt.xlabel("Mortality Rate (%)")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/mortality_rate_distribution.png", dpi=300)
    plt.close()

    # Plot 3: Actual vs Predicted Mortality Rate
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, color='blue')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
    plt.title("Actual vs Predicted Mortality Rate")
    plt.xlabel("Actual Mortality Rate")
    plt.ylabel("Predicted Mortality Rate")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/actual_vs_predicted_mortality.png", dpi=300)
    plt.close()

    # Plot 4: ARIMA Forecast for COVID-19 Cases
    plt.figure(figsize=(10, 6))
    plt.plot(train, label="Training Data")
    plt.plot(test, label="Test Data")
    plt.plot(pd.date_range(start=test.index[0], periods=30, freq='D'), forecast, label="Forecast", color='red')
    plt.title("ARIMA Forecast for COVID-19 Cases")
    plt.xlabel("Date")
    plt.ylabel("Confirmed Cases")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{output_dir}/arima_forecast.png", dpi=300)
    plt.close()

    # Plot 5: ACF and PACF
    plt.figure(figsize=(12, 6))
    plt.subplot(121)
    plot_acf(confirmed_data['Confirmed'], ax=plt.gca())
    plt.subplot(122)
    plot_pacf(confirmed_data['Confirmed'], ax=plt.gca())
    plt.tight_layout()
    plt.savefig(f"{output_dir}/acf_pacf.png", dpi=300)
    plt.close()

    # Plot 6: Impact of Active Cases on Mortality Rate
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=covid_clean_complete_data, x='Active', y='Mortality Rate', alpha=0.6)
    plt.title("Impact of Active Cases on Mortality Rate")
    plt.xlabel("Active Cases")
    plt.ylabel("Mortality Rate (%)")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/active_cases_mortality.png", dpi=300)
    plt.close()

    # Plot 7: Upward Trends
    plt.figure(figsize=(10, 6))
    sns.barplot(data=upward_trends, x='Growth_Rate', y='Country/Region', palette='viridis')
    plt.title("Top 10 Countries with Upward Trends")
    plt.xlabel("Growth Rate")
    plt.ylabel("Country")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/upward_trends.png", dpi=300)
    plt.close()

    # Plot 8: Mortality Rates
    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_mortality, x='Mortality_Rate', y='Country/Region', palette='Reds')
    plt.title("Top 10 Countries by Mortality Rate")
    plt.xlabel("Mortality Rate (%)")
    plt.ylabel("Country")
    plt.tight_layout()
    plt.savefig(f"{output_dir}/top_mortality.png", dpi=300)
    plt.close()

    # Plot 9: Population vs Total Cases
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=dense_population, x='Population', y='Tot Cases/1M pop', size='Deaths/1M pop', hue='Country/Region', sizes=(50, 300))
    plt.title("Densely Populated Countries and COVID-19 Cases")
    plt.xlabel("Population")
    plt.ylabel("Tot Cases/1M pop")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/population_vs_cases.png", dpi=300)
    plt.close()

    print(f"All visuals saved as PNG files in the '{output_dir}' directory.")

# Call the function to save all plots
save_all_visuals()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=upward_trends, x='Growth_Rate', y='Country/Region', palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=top_mortality, x='Mortality_Rate', y='Country/Region', palette='Reds')


All visuals saved as PNG files in the 'visualizations' directory.
