1. Make sure all necessary libraries are loaded. Print your x (independent) and y (dependent) variables from the data frame. Show the results.

In [22]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

# Read the hospital readmissions CSV into a DataFrame
file_path = 'data/hospital_readmissions.csv'
data = pd.read_csv(file_path)

# Columns used as predictors, and the column being predicted
feature_columns = [
    'Age',
    'Time_in_Hospital',
    'Num_Lab_Procedures',
    'Num_Medications',
    'Num_Outpatient',
    'Num_Inpatient',
    'Num_Emergency',
]
target_column = 'Readmitted'

# Independent variables (X) and dependent variable (y)
X = data[feature_columns]
y = data[target_column]

# Preview the first rows of each (head() defaults to five rows)
print("Independent Variables (X):\n", X.head())
print("\nDependent Variable (Y):\n", y.head())

Independent Variables (X):
    Age  Time_in_Hospital  Num_Lab_Procedures  Num_Medications  Num_Outpatient  \
0   71                10                  25                1               9   
1   34                 7                  12                3               8   
2   80                 1                   5                2               3   
3   40                 7                  98                9               3   
4   43                 5                  48               19               6   

   Num_Inpatient  Num_Emergency  
0              4              0  
1              0              4  
2              0              4  
3              0              2  
4              1              1  

Dependent Variable (Y):
 0    1
1    0
2    0
3    1
4    0
Name: Readmitted, dtype: int64


2. Fit an ordinary least squares (OLS) model using the OLS method from the statsmodels library to figure out your most significant variables. Show the results and interpret the following:
- R-squared
- Adjusted R-squared
- Degrees of freedom
- Coefficient of determination and P-values

In [23]:
# sm.OLS does not add an intercept by itself, so add a 'const' column first.
# NOTE: this rebinds X, so every later cell sees X with the extra 'const' column.
X = sm.add_constant(X)

# Describe the regression of y on X, then estimate it by ordinary least squares
ols_spec = sm.OLS(endog=y, exog=X)
ols_model = ols_spec.fit()

# Print the regression summary table (R-squared, coefficients, p-values, ...)
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:             Readmitted   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.1969
Date:                Wed, 18 Sep 2024   Prob (F-statistic):              0.986
Time:                        14:54:13   Log-Likelihood:                -636.96
No. Observations:                1000   AIC:                             1290.
Df Residuals:                     992   BIC:                             1329.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  0.2900      0

3. Test, identify, and detect multicollinearity in your dataset. Explain how you resolved it and report the results.

In [24]:
# Variance inflation factor (VIF) for each column of X, used to detect
# multicollinearity among the predictors
design_matrix = X.values
vif_data = pd.DataFrame({
    "Feature": X.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

# Display one row per column with its VIF
print(vif_data)

              Feature        VIF
0               const  23.785824
1                 Age   1.005411
2    Time_in_Hospital   1.004908
3  Num_Lab_Procedures   1.011913
4     Num_Medications   1.003586
5      Num_Outpatient   1.010482
6       Num_Inpatient   1.005739
7       Num_Emergency   1.006920


4. Fit a linear regression model using the LinearRegression class from the scikit-learn library. Obtain the coefficients and interpret the results.

In [25]:
# Create the scikit-learn estimator and fit it on every row (no train/test split yet).
# X still carries the 'const' column added for statsmodels; LinearRegression fits
# its own intercept, which is why that column's coefficient prints as 0.
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Pull the fitted parameters out of the estimator
coefficients, intercept = lin_reg.coef_, lin_reg.intercept_

# Report the slope for each column of X and the fitted intercept
print("Coefficients:\n", coefficients)
print("\nIntercept:\n", intercept)

Coefficients:
 [ 0.          0.0001026  -0.00207281  0.00019772 -0.0014242   0.00125463
  0.00155331  0.00713153]

Intercept:
 0.2899908201817057


5. Split the data into 80% training and 20% testing sets using the train_test_split function.

In [26]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each feature matrix
for label, subset in (("Training set size:", X_train), ("Testing set size:", X_test)):
    print(label, subset.shape)

Training set size: (800, 8)
Testing set size: (200, 8)


6. Train the multiple regression model on the training set using the LinearRegression class.

In [27]:
# Refit the existing estimator on the training rows only
# (calling fit() again replaces the parameters learned from the full dataset)
lin_reg.fit(X_train, y_train)

# Fitted parameters after training on the 80% split
coefficients, intercept = lin_reg.coef_, lin_reg.intercept_

print("Coefficients after training:\n", coefficients)
print("\nIntercept after training:\n", intercept)

Coefficients after training:
 [ 0.         -0.00032389 -0.00617662  0.00039419 -0.00023816 -0.00213766
  0.00085366  0.00782346]

Intercept after training:
 0.34511893963324275


7. Make regression predictions and interpret your results in the context of the question(s) you're trying to answer.

In [28]:
# Predict on the held-out test rows. LinearRegression returns continuous
# values, not 0/1 class labels.
y_pred = lin_reg.predict(X_test)

# Mean squared error between the actual labels and the continuous predictions
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Show up to five predicted/actual pairs. Slicing (rather than indexing 0..4)
# avoids an IndexError when the test set has fewer than five rows.
for predicted, actual in zip(y_pred[:5], y_test.values[:5]):
    print(f"Predicted: {predicted}, Actual: {actual}")

Mean Squared Error: 0.20120730389167385
Predicted: 0.31469911680256557, Actual: 0
Predicted: 0.303164043744045, Actual: 0
Predicted: 0.33462718054674, Actual: 0
Predicted: 0.27108980242054725, Actual: 1
Predicted: 0.29991682278157195, Actual: 0


8. Validate your multiple regression model using a confusion matrix, accuracy score, and k-fold cross-validation.

In [29]:
# Linear regression outputs continuous scores, so turn them into class labels
# with a 0.5 cut-off. An explicit threshold always yields 0 or 1, whereas
# np.round can return other integers for scores outside [-0.5, 1.5].
y_pred_rounded = (y_pred >= 0.5).astype(int)

# Confusion matrix: rows are actual classes, columns are predicted classes.
# Passing labels keeps the matrix 2x2 even if a class never occurs
# (assumes Readmitted is coded 0/1, as in the data preview).
conf_matrix = confusion_matrix(y_test, y_pred_rounded, labels=[0, 1])
print("Confusion Matrix:\n", conf_matrix)

# Accuracy: share of test rows whose thresholded prediction matches the actual label
accuracy = accuracy_score(y_test, y_pred_rounded)
print("Accuracy Score:", accuracy)

# 5-fold cross-validation on the full dataset. The scores are R^2 values
# (the default metric for a regressor, made explicit here); a negative value
# means the model fits worse than always predicting the mean.
cross_val_scores = cross_val_score(lin_reg, X, y, cv=5, scoring="r2")
print("Cross-Validation Scores:\n", cross_val_scores)
print("\nMean Cross-Validation Score:", cross_val_scores.mean())

Confusion Matrix:
 [[147   0]
 [ 53   0]]
Cross-Validation Scores:
 [-0.00926801 -0.01979056 -0.02523639 -0.01285204 -0.05732822]

Mean Cross-Validation Score: -0.024895043387887172
