## MODEL 5: LOGISTIC REGRESSION FOR COVID DATASET

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [4]:
url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"

In [11]:
df = pd.read_csv(url, usecols = ['location', 'date', 'new_cases', 'people_vaccinated', 'stringency_index', 'new_tests'])

In [13]:
df

Unnamed: 0,location,date,new_cases,new_tests,people_vaccinated,stringency_index
0,Afghanistan,2020-01-05,0.0,,,0.0
1,Afghanistan,2020-01-06,0.0,,,0.0
2,Afghanistan,2020-01-07,0.0,,,0.0
3,Afghanistan,2020-01-08,0.0,,,0.0
4,Afghanistan,2020-01-09,0.0,,,0.0
...,...,...,...,...,...,...
429430,Zimbabwe,2024-07-31,0.0,,,
429431,Zimbabwe,2024-08-01,0.0,,,
429432,Zimbabwe,2024-08-02,0.0,,,
429433,Zimbabwe,2024-08-03,0.0,,,


In [15]:
country = 'United States'

In [17]:
df_US = df[df['location'] == country]
df_US

Unnamed: 0,location,date,new_cases,new_tests,people_vaccinated,stringency_index
403451,United States,2020-01-05,0.0,,,0.0
403452,United States,2020-01-06,0.0,,,0.0
403453,United States,2020-01-07,0.0,,,0.0
403454,United States,2020-01-08,0.0,,,0.0
403455,United States,2020-01-09,0.0,,,0.0
...,...,...,...,...,...,...
405120,United States,2024-07-31,,,,
405121,United States,2024-08-01,,,,
405122,United States,2024-08-02,,,,
405123,United States,2024-08-03,,,,


In [21]:
df.isnull().sum()

location                  0
date                      0
new_cases             19276
new_tests            354032
people_vaccinated    348303
stringency_index     233245
dtype: int64

In [27]:
# Drop null values
df = df.dropna(subset = ['new_cases', 'people_vaccinated', 'stringency_index', 'new_tests'])

In [33]:
#Create a binary target if new_cases > median, else 0
df['high_cases'] = (df['new_cases'] > 1000).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['high_cases'] = (df['new_cases'] > 1000).astype(int)


In [35]:
X = df[['people_vaccinated', 'stringency_index', 'new_tests']]
y = df['high_cases']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
model = LogisticRegression(max_iter = 200)
model.fit(X_train, y_train)

In [39]:
y_pred = model.predict(X_test)

In [41]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8784359272164151

In [45]:
df['high_cases']

3719      1
3721      0
3722      0
3723      0
3724      0
         ..
428545    1
428553    0
428557    0
428558    0
428626    0
Name: high_cases, Length: 25826, dtype: int32

In [47]:
print("Classification Report:\n", classification_report(y_test, y_pred, target_names = ['Low', 'High']))

Classification Report:
               precision    recall  f1-score   support

         Low       0.88      1.00      0.94      4538
        High       0.00      0.00      0.00       628

    accuracy                           0.88      5166
   macro avg       0.44      0.50      0.47      5166
weighted avg       0.77      0.88      0.82      5166



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
