<a href="https://colab.research.google.com/github/jekwa1/Project/blob/main/GroupProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install numerizer



In [15]:
#Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from numerizer import numerize
import scipy
import scipy.stats as stats
from sklearn.linear_model import LinearRegression

In [16]:

# GitHub repository URL and file names
base_url = 'https://github.com/jekwa1/covid19-public/raw/main/epidemic/'
file_names = ['cases_malaysia.csv', 'deaths_malaysia.csv', 'icu.csv']

# Create a folder and download files
os.makedirs('covid_data', exist_ok=True)
[open(f'covid_data/{file_name}', 'wb').write(requests.get(f'{base_url}{file_name}').content) for file_name in file_names]

print('Files downloaded successfully.')

# Load files into DataFrames and print
file_mapping = {'case_my': 'cases_malaysia.csv', 'death_my': 'deaths_malaysia.csv', 'icu_my': 'icu.csv'}
data_frames = {label: pd.read_csv(f'covid_data/{file_name}') for label, file_name in file_mapping.items()}

[print(f'{label} DataFrame:\n{df.head()}\n{"-"*50}\n') for label, df in data_frames.items()]

NameError: ignored

In [None]:
#mount to drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#load cases_malaysia csv file using Pandas
folder = '/content/drive/My Drive/HIAProject302/'
filename_1 = folder+'cases_malaysia.csv'
case_my= pd.read_csv(filename_1)

#load deaths_malaysia csv file using Pandas
folder = '/content/drive/My Drive/HIAProject302/'
filename_2 = folder+'deaths_malaysia.csv'
death_my= pd.read_csv(filename_2)

#load icu csv file using Pandas
folder = '/content/drive/My Drive/HIAProject302/'
filename_3 = folder+'icu.csv'
icu_my= pd.read_csv(filename_3)


Verifying Datasets (case_malaysia.csv), (death_malaysia.csv) and (icu.csv)

In [None]:
#Check null in case_my
case_my.isnull().sum()

#Check null in death_my
death_my.isnull().sum()

#Check null in icu_my
icu_my.isnull().sum()

Selecting required variables from each datasets

In [None]:
#Select interested column in case_my
case_my_new =case_my.loc[:,['date','cases_new','cases_0_4', 'cases_5_11', 'cases_12_17',
'cases_18_29', 'cases_30_39', 'cases_40_49', 'cases_50_59',
'cases_60_69', 'cases_70_79', 'cases_80']]
print(case_my_new)
print(case_my_new.columns)
#check null value
case_my_new.isnull().sum()
case_my_new.info()


In [None]:
#Select interested column in death_my
death_my_new = death_my.loc[:,['date','deaths_new']]
print(death_my_new)
print(death_my_new.columns)
#check null value
death_my_new.isnull().sum()
death_my_new.info()


In [None]:
#Select interested column in icu_my
icu_my_new = icu_my.loc[:,['date','icu_covid']]
icu_my_new_group = icu_my_new.groupby('date')['icu_covid'].sum().reset_index()
print(icu_my_new_group)
#check null value
icu_my_new_group.isnull().sum()
icu_my_new.info()

Creating new dataframe (my_covid) - Merging datasets (based on date)

In [None]:
# Ensuring formate of date - datetime type
case_my_new['date'] = pd.to_datetime(case_my_new['date'])
death_my_new['date'] = pd.to_datetime(death_my_new['date'])
icu_my_new_group['date'] = pd.to_datetime(icu_my_new_group['date'])

# Merge DataFrames
my_covid = pd.merge(case_my_new, death_my_new, on='date', how='outer')
my_covid = pd.merge(my_covid, icu_my_new_group, on='date', how='outer')

# Sort DataFrame by 'date'
my_covid = my_covid.sort_values(by='date')

# Fill NaN values with 0
my_covid = my_covid.fillna(0)



Validating new data frame (my_covid)

In [None]:
## Validation
print(my_covid)
print(my_covid.isnull().sum())

num_rows, num_columns = my_covid.shape
print(f"Number of Rows: {num_rows}")
print(f"Number of Columns: {num_columns}")

# Ensuring formate of date - datetime type
print(my_covid.dtypes)
# my_covid['date'] = pd.to_datetime(my_covid['date'])

my_covid.to_csv('my_covid.csv', index=False)

Creating new dataframe based on 7 days aggregate (sum) and filtered for date (01/01/2020 - 30/06/2023) = my_covid_group

In [None]:
# Filtering based on date (01/01/2000 - 30/06/2023)
start_date = '2000-01-01'
end_date = '2023-06-30'

date_mask = (my_covid['date'] >= start_date) & (my_covid['date'] <= end_date)
filtered_my_covid = my_covid[date_mask].copy()  # Make a copy to avoid SettingWithCopyWarning

# Save the filtered dataframe with 'date' column
filtered_my_covid.to_csv('filtered_my_covid.csv', index=False)

# New dataframe based on 7 days aggregate (sum) = my_covid_group

filtered_my_covid['date'] = pd.to_datetime(filtered_my_covid['date'])
filtered_my_covid.set_index('date', inplace=True)

my_covid_group = filtered_my_covid.resample('7D').sum()
my_covid_group.reset_index(inplace=True)
print(my_covid_group)

my_covid_group.to_csv('my_covid_group.csv', index=False)

Create new dataframe based on 7-days average (mean)= seven_days_mean and validate

In [None]:
seven_days_mean = filtered_my_covid.resample('7D').mean()
seven_days_mean = seven_days_mean.round(2)
seven_days_mean.reset_index(inplace=True)

## Validation
print(seven_days_mean)
print(seven_days_mean.isnull().sum())

seven_days_mean.to_csv('seven_days_mean.csv', index=False)
print(seven_days_mean.columns)

Creating working data frame from seven_days_mean and factoring in predominant circulating COVID-19 variant

In [None]:
# New dataframe:seven_days_mean_variant (this with predominant circulating variant)

seven_days_mean_variant = seven_days_mean.copy()

# Create a new column 'prominent_variant'
seven_days_mean_variant['prominent_variant'] = pd.cut(
    seven_days_mean_variant['date'],
    bins=[pd.to_datetime('2000-01-01'), pd.to_datetime('2021-06-08'), pd.to_datetime('2021-07-05'), pd.to_datetime('2021-12-20'), pd.to_datetime('2100-01-01')],
    labels=['other', 'beta', 'delta', 'omicron'],
    right=False
)

## Validation
print(seven_days_mean_variant)
print(seven_days_mean_variant.isnull().sum())

print(seven_days_mean_variant.head())
seven_days_mean_variant.to_csv('seven_days_mean_variant.csv', index=False)

Statistical Analysis

In [None]:

from io import StringIO


# Push changes to a different GitHub repository
new_repo_url = 'https://github.com/jekwa1/Project.git'
local_repo_path = '/path/to/Project/'

# Copy CSV files to the new repository
for file_name in ['my_covid.csv', 'my_covid_group.csv', 'seven_days_mean.csv', 'seven_days_mean_variant.csv']:
    response = requests.get(f'{base_url}{file_name}')
    with open(os.path.join(local_repo_path, file_name), 'wb') as file:
        file.write(response.content)

# Change into the repository directory
os.chdir(local_repo_path)

# Add, commit, and push the changes
os.system('git add .')
os.system('git commit -m "Add transformed datasets"')
os.system('git push origin main')

print('Files pushed to the new repository successfully.')

In [None]:
#  #Boxplot
numerical_columns = seven_days_mean_variant.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(16, 8))
for i, column in enumerate(numerical_columns, start=1):
    plt.subplot(3, 5, i)
    sns.boxplot(x='prominent_variant', y=column, data=seven_days_mean_variant)
    plt.title(f'Box Plot for {column}')
    plt.xlabel('Prominent Variant')
    plt.ylabel(column)

plt.tight_layout()
plt.show()

In [None]:
# Line graph - New cases , ICU and Deaths with Variant period

plt.figure(figsize=(14, 8))

# Plotting trends
plt.plot(seven_days_mean_variant['date'], seven_days_mean_variant['cases_new'], label='Cases_New', color='blue', linewidth=3.0)
plt.plot(seven_days_mean_variant['date'], seven_days_mean_variant['deaths_new'], label='Deaths_New', color='darkgreen', linewidth=3.0)
plt.plot(seven_days_mean_variant['date'], seven_days_mean_variant['icu_covid'], label='ICU_COVID', color='purple', linewidth=3.0)

# Adding shaded regions for variant periods
variant_periods = seven_days_mean_variant[['date', 'prominent_variant']].drop_duplicates()
variant_colors = {'other': 'darkcyan', 'beta': 'lime', 'delta': 'gold', 'omicron': 'crimson'}

# Create legend handles for variant colors
variant_handles = [plt.Line2D([0], [0], marker='o', color=color, label=f'{variant}') for variant, color in variant_colors.items()]

# Adding shaded regions for variant periods
for i, row in variant_periods.iterrows():
    variant_start = row['date']
    if i + 1 < len(variant_periods):
        variant_end = variant_periods.iloc[i + 1]['date']
    else:
        variant_end = seven_days_mean_variant['date'].max()

    plt.axvspan(variant_start, variant_end, color=variant_colors[row['prominent_variant']], alpha=0.3)

# Combine legends for trends and variant colors
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), handles=variant_handles + plt.legend().legendHandles)

plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Trends of Cases_New, Deaths_New, and ICU_COVID over Time')

plt.tight_layout()
plt.show()

In [None]:
# Stacked bar - new cases, icu and death over prominent variance

plt.figure(figsize=(14, 8))
columns_to_plot = ['cases_new', 'icu_covid', 'deaths_new']
melted_data = pd.melt(seven_days_mean_variant, id_vars=['prominent_variant'], value_vars=columns_to_plot, var_name='attribute')

sns.barplot(x='prominent_variant', y='value', hue='attribute', data=melted_data, ci=None, palette='viridis', estimator=sum)
plt.title('Stacked Bar Chart of New Cases, ICU, and Deaths across Prominent Variants')
plt.xlabel('Prominent Variant Period')
plt.ylabel('Count')
plt.legend(title='Attribute')
plt.show()

In [None]:
# Stacked Bar chart - number of new cases per age category over variant predominance

plt.figure(figsize=(14, 8))
age_group_columns_new_cases = ['cases_0_4', 'cases_5_11', 'cases_12_17', 'cases_18_29', 'cases_30_39',
                                'cases_40_49', 'cases_50_59', 'cases_60_69', 'cases_70_79', 'cases_80']

melted_data_new_cases = pd.melt(seven_days_mean_variant, id_vars=['prominent_variant'], value_vars=age_group_columns_new_cases, var_name='age_group')

sns.barplot(x='prominent_variant', y='value', hue='age_group', data=melted_data_new_cases, ci=None, palette='viridis', estimator=sum)
plt.title('Stacked Bar Chart of New Cases across Age Groups and Prominent Variants')
plt.xlabel('Prominent Variant Period')
plt.ylabel('Number of New Cases')
plt.legend(title='Age Group')
plt.show()

In [None]:
#count total number of each age category using my_covid_variant dataframe
# Get column names
columns = ['cases_0_4', 'cases_5_11', 'cases_12_17', 'cases_18_29', 'cases_30_39',
                                'cases_40_49', 'cases_50_59', 'cases_60_69', 'cases_70_79', 'cases_80','icu_covid','deaths_new']

## count total number of new cases
sum_new_cases= my_covid_variant['cases_new'].sum()
print(f"Total new cases:{sum_new_cases:}")

# Calculate sum for each column
column_sums = []
for col in columns:
    col_sum = my_covid_variant[col].sum()
    percentage = (col_sum /sum_new_cases ) * 100
    column_sums.append([col, col_sum,percentage])
    print(f"Sum of values in column {col}: {col_sum}; {percentage:.2f}%")
    from tabulate import tabulate
print(tabulate(column_sums, headers=['Column', 'Total','Percentage'], tablefmt='simple_grid'))


In [None]:
##Group all the variants into resepective timeframe
# Count total number of other variant
##date = 2020-01-25 - 2021-06-07
other = my_covid_variant.loc[my_covid_variant['date'].between('2020-01-25','2021-06-07')]
sum_other= other['cases_new'].sum()
print(f"Total number of other variant:{sum_other:}")

# Count total number of beta variant
##date = 2021-06-08 - 2021-07-04
beta = my_covid_variant.loc[my_covid_variant['date'].between('2021-06-08','2021-07-04')]
sum_beta= beta['cases_new'].sum()
print(f"Total number of beta variant:{sum_beta:}")

# Count total number of delta variant
##date = 2021-07-05 - 2021-12-20
delta = my_covid_variant.loc[my_covid_variant['date'].between('2021-07-05','2021-12-20')]
sum_delta= delta['cases_new'].sum()
print(f"Total number of delta variant:{sum_delta:}")

# Count total number of omicron variant
##date = 2021-12-21 - 2023-06-30
micron = my_covid_variant.loc[my_covid_variant['date'].between('2021-12-21','2023-06-30')]
sum_omicron= micron['cases_new'].sum()
print(f"Total number of omicron variant{sum_omicron:}")

In [None]:
#Calculate the ratio of each variant in total number of cases
##divide other variant with total number of cases
other_2= sum_other/sum_new_cases
print(f"Other:{other_2:.4f}")

##divide beta variant with total number of cases
beta_2 =sum_beta/sum_new_cases
print(f"Beta:{beta_2:.4f}")

##divide delta variant with total number of cases
delta_2= sum_delta/sum_new_cases
print(f"Delta:{delta_2:.4f}")

##divide omicron variant with total number of cases
omicron_2= sum_omicron/sum_new_cases
print(f"Omicron:{omicron_2:.4f}")

In [None]:
# Pie chart of prominent variants
variant= [other_2, beta_2, delta_2, omicron_2]
variant_labels = ['Other','Beta', 'Delta', 'Omicron']

# Create a pie chart
plt.figure(figsize=(8, 6))
colors=['darkcyan','mediumturquoise','steelblue','aqua']
plt.pie(variant, labels=variant_labels, autopct='%1.1f%%', startangle=140, colors=colors)
plt.title('Distribution of COVID-19 Variants')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

# Show the pie chart
plt.show()

In [None]:
# description of the data in seven_days_mean_variant
seven_days_mean_variant.describe()

In [None]:
#Regression model of new death and icu covid [overall]
p=sns.regplot(seven_days_mean_variant, x='icu_covid', y='deaths_new', ci=None, line_kws = {"color": "blue"})
#calculate slope and intercept of regression equation
slope, intercept, r, p, sterr = scipy.stats.linregress(x=p.get_lines()[0].get_xdata(),
                                                       y=p.get_lines()[0].get_ydata())
#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(2, 150, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('New_deaths against ICU_Covid')
plt.show()

In [None]:
#R-squared of regression model (icu covid against new death)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = seven_days_mean_variant[['icu_covid']], seven_days_mean_variant.deaths_new

#fit regression model
model.fit(X, y)

#calculate R-squared of regression model (icu covid agaisnt new death)
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))
## R-squared= 0.712; moderately strong

In [None]:
#Regression model of new death and new cases[overall]
p=sns.regplot(seven_days_mean_variant, x='cases_new', y='deaths_new', ci=None, line_kws = {"color": "blue"})
#calculate slope and intercept of regression equation
slope, intercept, r, p, sterr = scipy.stats.linregress(x=p.get_lines()[0].get_xdata(),
                                                       y=p.get_lines()[0].get_ydata())
#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(2, 150, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('New_deaths against New_cases')
plt.show()


In [None]:
#R-squared of regression model (new death against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = seven_days_mean_variant[['cases_new']], seven_days_mean_variant.deaths_new

#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))

## R-squared= 0.5; fair

In [None]:
## Create a scatter plot between new cases and icu covid (other variant)
#time period = 2020-01-25 - 2021-06-05
other_icu= seven_days_mean_variant.loc[seven_days_mean_variant['date'].between('2020-01-25','2021-06-05')]
p=sns.regplot(other_icu, x='cases_new', y='icu_covid', ci=None, line_kws = {"color": "blue"})

# Calculate the slope and intercept of the regression equation along with confidence intervals
slope, intercept, r, p, sterr = scipy.stats.linregress(x=other_icu['cases_new'], y=other_icu['icu_covid'])

# Get the standard error of the slope
stderr_slope = sterr

#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

# Calculate the confidence interval for the slope (95% confidence interval)
#z-value at 95% confidence interval= 1.96
z_value=1.96
confidence_interval = z_value* stderr_slope


# Display the slope, intercept, and confidence interval of the regression equation
print(f"95% Confidence Interval for Slope: [{slope - confidence_interval:.3f}, {slope + confidence_interval:.3f}]")


#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(80, 600, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('ICU_covid against New_cases (Other Variant)')
plt.show()


In [None]:
#R-squared of regression model(other; icu covid against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = other_icu[['cases_new']], other_icu.icu_covid

#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))
## R-squared= 0.941; strong

In [None]:
## Create a scatter plot between new cases and new death (other variant)
#time period = 2020-01-25 - 2021-06-05
other_death= seven_days_mean_variant.loc[seven_days_mean_variant['date'].between('2020-01-25','2021-06-05')]
p=sns.regplot(other_death, x='cases_new', y='deaths_new', ci=None, line_kws = {"color": "blue"})

# Calculate the slope and intercept of the regression equation along with confidence intervals
slope, intercept, r, p, sterr = scipy.stats.linregress(x=other_death['cases_new'], y=other_death['deaths_new'])

# Get the standard error of the slope
stderr_slope = sterr

#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

# Calculate the confidence interval for the slope (95% confidence interval)
#z-value at 95% confidence interval= 1.96
z_value=1.96
confidence_interval = z_value* stderr_slope

# Display the slope, intercept, and confidence interval of the regression equation
print(f"95% Confidence Interval for Slope: [{slope - confidence_interval:.3f}, {slope + confidence_interval:.3f}]")

#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(80, 30, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('New_deaths against New_cases (Other Variant)')
plt.show()

In [None]:
#R-squared of regression model (other variant; new death against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = other_death[['cases_new']], other_death.deaths_new

#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))
## R-squared= 0.762; moderately strong

In [None]:
## Create a scatter plot between new cases and icu covid (beta variant)
#time period = 2021-06-12 - 2021-07-03
beta_icu= seven_days_mean_variant.loc[seven_days_mean_variant['date'].between('2021-06-12','2021-07-03')]
p=sns.regplot(beta_icu, x='cases_new', y='icu_covid', ci=None, line_kws = {"color": "blue"})

# Calculate the slope and intercept of the regression equation along with confidence intervals
slope, intercept, r, p, sterr = scipy.stats.linregress(x=beta_icu['cases_new'], y=beta_icu['icu_covid'])

# Get the standard error of the slope
stderr_slope = sterr

#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

# Calculate the confidence interval for the slope (95% confidence interval)
#z-value at 95% confidence interval= 1.96
z_value=1.96
confidence_interval = z_value* stderr_slope

# Display the slope, intercept, and confidence interval of the regression equation
print(f"95% Confidence Interval for Slope: [{slope - confidence_interval:.3f}, {slope + confidence_interval:.3f}]")

#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(5500, 1180, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('ICU_covid against New_cases (Beta Variant)')
plt.show()

In [None]:
#R-squared of regression model (beta; icu against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = beta_icu[['cases_new']], beta_icu.icu_covid
#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))

## R-squared= 0.983; strong


In [None]:
## Create a scatter plot between new cases and new death (beta variant)
#time period = 2021-06-12 - 2021-07-03
beta_death= seven_days_mean_variant.loc[seven_days_mean_variant['date'].between('2021-06-12','2021-07-03')]
p=sns.regplot(beta_death, x='cases_new', y='deaths_new', ci=None, line_kws = {"color": "blue"})

# Calculate the slope and intercept of the regression equation along with confidence intervals
slope, intercept, r, p, sterr = scipy.stats.linregress(x=beta_death['cases_new'], y=beta_death['deaths_new'])

# Get the standard error of the slope
stderr_slope = sterr

#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

# Calculate the confidence interval for the slope (95% confidence interval)
#z-value at 95% confidence interval= 1.96
z_value=1.96
confidence_interval = z_value* stderr_slope

# Display the slope, intercept, and confidence interval of the regression equation
print(f"95% Confidence Interval for Slope: [{slope - confidence_interval:.3f}, {slope + confidence_interval:.3f}]")


#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(5500, 80, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('New_deaths against New_cases (Beta Variant)')
plt.show()

In [None]:
#R-squared of regression model (beta; new death against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = beta_death[['cases_new']], beta_death.deaths_new

#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))

## R-squared= 0.854; strong

In [None]:
## Create a scatter plot between new cases and new death (delta variant)
#time period = 2021-07-10 - 2021-12-18
delta_death= seven_days_mean_variant.loc[seven_days_mean_variant['date'].between('2021-07-10','2021-12-18')]
p=sns.regplot(delta_death, x='cases_new', y='deaths_new', ci=None, line_kws = {"color": "blue"})

# Calculate the slope and intercept of the regression equation along with confidence intervals
slope, intercept, r, p, sterr = scipy.stats.linregress(x=delta_death['cases_new'], y=delta_death['deaths_new'])

# Get the standard error of the slope
stderr_slope = sterr

#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

# Calculate the confidence interval for the slope (95% confidence interval)
#z-value at 95% confidence interval= 1.96
z_value=1.96
confidence_interval = z_value* stderr_slope

# Display the slope, intercept, and confidence interval of the regression equation
print(f"95% Confidence Interval for Slope: [{slope - confidence_interval:.3f}, {slope + confidence_interval:.3f}]")

#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(3000, 150, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('New_deaths against New_cases (Delta Variant)')
plt.show()

In [None]:
#R-squared of regression model (delta; new death against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = delta_death[['cases_new']], delta_death.deaths_new

#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))

## R-squared= 0.791; strong

In [None]:
## Create a scatter plot between new cases and icu (omicron variant)
#time period = 2021-12-25 - 2023-06-24
omicron_icu= seven_days_mean_variant.loc[seven_days_mean_variant['date'].between('2021-12-25','2023-06-24')]
p=sns.regplot(omicron_icu, x='cases_new', y='icu_covid', ci=None, line_kws = {"color": "blue"})

# Calculate the slope and intercept of the regression equation along with confidence intervals
slope, intercept, r, p, sterr = scipy.stats.linregress(x=omicron_icu['cases_new'], y=omicron_icu['icu_covid'])

# Get the standard error of the slope
stderr_slope = sterr

#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

# Calculate the confidence interval for the slope (95% confidence interval)
#z-value at 95% confidence interval= 1.96
z_value=1.96
confidence_interval = z_value* stderr_slope

# Display the slope, intercept, and confidence interval of the regression equation
print(f"95% Confidence Interval for Slope: [{slope - confidence_interval:.3f}, {slope + confidence_interval:.3f}]")

#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(5000, 350, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('ICU_Covid against New_cases (Omicron Variant)')

In [None]:
#R-squared of regression model (omicron; icu against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = omicron_icu[['cases_new']], omicron_icu.icu_covid
#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))

## R-squared= 0.732; strong

In [None]:
## Create a scatter plot between new cases and new death (omicron variant)
#time period = 2021-12-25 - 2023-06-24
omicron_death= seven_days_mean_variant.loc[seven_days_mean_variant['date'].between('2021-12-25','2023-06-24')]
p=sns.regplot(omicron_death, x='cases_new', y='deaths_new', ci=None, line_kws = {"color": "blue"})

# Calculate the slope and intercept of the regression equation along with confidence intervals
slope, intercept, r, p, sterr = scipy.stats.linregress(x=omicron_death['cases_new'], y=omicron_death['deaths_new'])

# Get the standard error of the slope
stderr_slope = sterr

#display slope and intercept of regression equation
print(f"intercept:{intercept:.3f}, slope:{slope:.3f}")

# Calculate the confidence interval for the slope (95% confidence interval)
#z-value at 95% confidence interval= 1.96
z_value=1.96
confidence_interval = z_value* stderr_slope

# Display the slope, intercept, and confidence interval of the regression equation
print(f"95% Confidence Interval for Slope: [{slope - confidence_interval:.3f}, {slope + confidence_interval:.3f}]")
#add regression equation to plot (#2, #95 represents the coordinate to display the equation )
plt.text(60, 70, 'y = ' + str(round(intercept,3)) + ' + ' + str(round(slope,3)) + 'x')


# Add title
plt.title('New_deaths against New_cases (Omicron Variant)')

In [None]:
#R-squared of regression model (omicron; new death against new cases)
#initiate linear regression model
model = LinearRegression()

#define predictor and response variables
X, y = omicron_death[['cases_new']], omicron_death.deaths_new

#fit regression model
model.fit(X, y)

#calculate R-squared of regression model
r_squared = model.score(X, y)
print("R-squared= {:.3f}".format(r_squared))

## R-squared= 0.830; strong