## Import necessary libraries

In [None]:
import pandas as pd

## Read in data
### Drop unnecessary columns

In [None]:
# Columns removed right after loading; none of them is used as a feature below.
columns_to_drop = ['USMER', 'MEDICAL_UNIT', 'PATIENT_TYPE', 'DATE_DIED', 'ICU', 'INTUBED']

# Read the raw CSV and drop the unused columns in one step.
# (A bare `df` in the middle of a cell is never rendered -- only the cell's
# last expression is -- so the frame is shown once, at the end.)
df = pd.read_csv('Covid Data.csv').drop(columns=columns_to_drop)
df

Unnamed: 0,SEX,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL
0,1,1,65,2,2,2,2,2,1,2,2,2,2,2,3
1,2,1,72,97,2,2,2,2,1,2,2,1,1,2,5
2,2,2,55,97,1,2,2,2,2,2,2,2,2,2,3
3,1,2,53,2,2,2,2,2,2,2,2,2,2,2,7
4,2,2,68,97,1,2,2,2,1,2,2,2,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,2,2,40,97,2,2,2,2,2,2,2,2,2,2,7
1048571,2,2,51,97,2,2,2,2,1,2,2,2,2,2,7
1048572,2,2,55,97,2,2,2,2,2,2,2,2,2,2,7
1048573,2,2,28,97,2,2,2,2,2,2,2,2,2,2,7


### Replace missing values of pregnancy for males

In [None]:
# Recode every 97 in PREGNANT as 2; all other values are left untouched.
# NOTE(review): the replacement is applied to all rows, not filtered on SEX --
# presumably 97 is only recorded for male patients; confirm against the data dictionary.
df['PREGNANT'] = df['PREGNANT'].replace({97: 2})
df.head()

Unnamed: 0,SEX,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL
0,1,1,65,2,2,2,2,2,1,2,2,2,2,2,3
1,2,1,72,2,2,2,2,2,1,2,2,1,1,2,5
2,2,2,55,2,1,2,2,2,2,2,2,2,2,2,3
3,1,2,53,2,2,2,2,2,2,2,2,2,2,2,7
4,2,2,68,2,1,2,2,2,1,2,2,2,2,2,3


### Treat the numeric codes 97, 98 and 99 as missing values and drop the affected rows

In [None]:
# Numeric codes that stand for "missing" in the columns listed below
values_to_replace = [97, 98, 99]

# Columns that are checked for those codes.
# NOTE(review): PNEUMONIA and PREGNANT are not in this list, so any 97/98/99
# codes still present in them are kept as ordinary numbers -- confirm intended.
columns_to_replace = ['DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
                      'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC',
                      'TOBACCO', 'CLASIFFICATION_FINAL']

# Flag rows that carry a missing-value code in any of the listed columns.
# A boolean mask is used instead of `Series.replace(values, None)` because the
# meaning of `value=None` depends on the pandas version: older releases treat it
# as "value not given" and forward-fill the matches (so nothing becomes NaN and
# nothing is dropped), newer ones insert None and turn the column into object dtype.
has_missing_code = df[columns_to_replace].isin(values_to_replace).any(axis=1)

# Remove the flagged rows; dropna() additionally removes rows that already
# contain real NaN values in any column. Integer dtypes are preserved.
df = df.loc[~has_missing_code].dropna()


## Split the data, then train and test the model

### Drop dependent variable column from independent variables

In [None]:
# Feature matrix: every column of the cleaned frame except the target column.
X = df.drop(columns=['CLASIFFICATION_FINAL'])
X

### Set classification as the dependent (target) variable

In [None]:
# Target vector: the classification column of the cleaned frame.
y = df.loc[:, 'CLASIFFICATION_FINAL']
y

0          3
1          5
2          3
3          7
4          3
          ..
1048570    7
1048571    7
1048572    7
1048573    7
1048574    7
Name: CLASIFFICATION_FINAL, Length: 1041013, dtype: int64

### Import libraries for splitting data and logistic regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

### Split data using 80% for training and 20% for testing

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Pipeline Setup for Logistic Regression with Data Scaling in Scikit-Learn

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Two-step pipeline: standardise the features, then fit a logistic-regression
# classifier (iteration cap raised to 1000) on the scaled values.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)

# fit() returns the fitted pipeline, so the prediction can be chained onto it.
# The scaler learned from the training rows is applied to x_test automatically.
y_pred = pipeline.fit(x_train, y_train).predict(x_test)

## Evaluating Logistic Regression Model Accuracy on Training and Test Data

In [None]:
# Mean accuracy of the fitted pipeline on the rows it was trained on and on
# the held-out rows; a large gap between the two would point to overfitting.
train_accuracy = pipeline.score(x_train, y_train)
test_accuracy = pipeline.score(x_test, y_test)

print("Logistic Regression Train Accuracy: ", train_accuracy, "\n")
print("Logistic Regression Test Accuracy: ", test_accuracy)



Logistic Regression Train Accuracy:  0.5267720128240535 

Logistic Regression Test Accuracy:  0.5281191913661186


## Save model

In [None]:
import joblib

# Persist the fitted pipeline (scaler + classifier) to disk.
# The file name uses the `.joblib` extension so that it is the same path the
# download step asks for ('covid-19-predict.joblib'); the previous name
# 'covid-19-predict_joblib' left that step pointing at a file never written here.
joblib.dump(pipeline, 'covid-19-predict.joblib')

['covid-19-predict_joblib']

In [None]:
from google.colab import files

# Hand the saved model file to the browser as a download (Colab only).
# NOTE(review): this must be exactly the path the model was saved under,
# otherwise there is no such file to download -- verify against the save step.
model_filename = 'covid-19-predict.joblib'
files.download(model_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>