# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Read the country-wise CSV and preview its first rows.
df = pd.read_csv("covid_19_country_wise_latest.csv")
print(df.head())

# 2-3. Promote the "Unnamed: 0" column to the row index, then label that
# index "index".
df = df.set_index("Unnamed: 0").rename_axis("index")

# 4. Function to replace whitespace with underscore in column names
def clean_column_names(df):
    """Replace spaces in the DataFrame's column labels with underscores.

    The labels are updated in place on ``df`` and the same object is
    returned, so the call can be used either for its side effect or as
    ``df = clean_column_names(df)``.

    Non-string labels (for example an integer column name) are left
    untouched. Routing mixed labels through the ``.str`` accessor would
    silently replace every non-string label with NaN.

    Args:
        df: DataFrame whose column labels should be normalised.

    Returns:
        The same DataFrame object, with its column labels updated.
    """
    df.columns = df.columns.map(
        lambda label: label.replace(" ", "_") if isinstance(label, str) else label
    )
    return df

# Normalise the column labels (spaces -> underscores) before any lookups below.
df = clean_column_names(df)

# 5. Check Basic Information
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
df.info()
print(df.isnull().sum())  # Missing values per column

# 6. Fill NaN values with the mean of the column
# Only numeric columns have a mean here, so only those columns are filled.
df.fillna(df.mean(numeric_only=True), inplace=True)

# 7. Count unique countries (counted before the duplicates are removed below)
unique_countries = df["Country/Region"].nunique()
print("Unique Countries:", unique_countries)

# 8. Remove duplicate country entries (if any), keeping the first occurrence
df.drop_duplicates(subset=["Country/Region"], keep="first", inplace=True)

# 9. Mean, median, and standard deviation of total cases
print("Mean Total Cases:", df["Total_Cases"].mean())
print("Median Total Cases:", df["Total_Cases"].median())
print("Standard Deviation Total Cases:", df["Total_Cases"].std())

# 10. Replace string values in Deaths column with mean of Deaths
# Entries that cannot be parsed as numbers are coerced to NaN first.
df["Deaths"] = pd.to_numeric(df["Deaths"], errors="coerce")
# Assign the filled Series back rather than calling fillna(inplace=True) on
# df["Deaths"]: that form acts on an intermediate object, which pandas warns
# about and which does not update df under Copy-on-Write.
df["Deaths"] = df["Deaths"].fillna(df["Deaths"].mean())

# 11. Convert Deaths column to integer
# The filled-in mean may be fractional; astype(int) drops the fractional part.
df["Deaths"] = df["Deaths"].astype(int)

# 12. Worldwide totals of deaths and recoveries
total_deaths, total_recovered = df["Deaths"].sum(), df["Recovered"].sum()
print("Total Deaths:", total_deaths)
print("Total Recovered:", total_recovered)

# 13. Number of countries whose Total_Cases exceeds one million
high_case_countries = len(df[df["Total_Cases"] > 1_000_000])
print("Countries with >1 million cases:", high_case_countries)

# 14. Recovery rate as a percentage of Total_Cases, then the countries
# whose rate is above 95
df["Recovery_Rate"] = df["Recovered"].div(df["Total_Cases"]).mul(100)
high_recovery_countries = df.loc[df["Recovery_Rate"] > 95, "Country/Region"]
print("Countries with recovery rate >95%:\n", high_recovery_countries)

# 15. Remove the WHO_Region and Confirmed columns from df in place
df.drop(["WHO_Region", "Confirmed"], axis="columns", inplace=True)

# 16. Country/Region of the row holding the largest Deaths value
deadliest_label = df["Deaths"].idxmax()
max_death_country = df.loc[deadliest_label, "Country/Region"]
print("Country with Maximum Deaths:", max_death_country)

# 17. Rank countries from most to fewest deaths and show the top rows
df_sorted = df.sort_values("Deaths", ascending=False)
print(df_sorted.loc[:, ["Country/Region", "Deaths"]].head())

# 18. New "Total_cases" column: Deaths + Recovered + Active.
# Note the lowercase "c": this is a separate column from "Total_Cases".
df["Total_cases"] = df["Deaths"].add(df["Recovered"]).add(df["Active"])

# 19. Death rate as a percentage of the summed "Total_cases"
df["Death_Rate"] = df["Deaths"].div(df["Total_cases"]).mul(100)

# 20. Countries where total cases increasing but death rates remain low.
# "Increasing" is approximated as Total_Cases above the median; "low" means a
# Death_Rate under 2.
above_median_cases = df["Total_Cases"] > df["Total_Cases"].median()
low_death_rate = df["Death_Rate"] < 2
low_death_countries = df[above_median_cases & low_death_rate]
print(
    "Countries with increasing cases but low death rates:\n",
    low_death_countries[["Country/Region", "Total_Cases", "Death_Rate"]],
)

# 21. Preview the country and death-rate columns
print(df.loc[:, ["Country/Region", "Death_Rate"]].head())

# 22. Scatter plot of Total_Cases (x) against Deaths (y), drawn on an
# explicit 10x6 figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x="Total_Cases", y="Deaths", alpha=0.6, ax=ax)
ax.set_xlabel("Total Cases")
ax.set_ylabel("Total Deaths")
ax.set_title("Total Cases vs Total Deaths")
plt.show()

# 23. Write the processed frame to CSV; the row index is not written out
output_path = "processed_covid_data.csv"
df.to_csv(output_path, index=False)
print(f"Dataset saved as '{output_path}'")
