In [5]:
from myclasses.sql_executor import SQLExecutor
import pandas as pd

if __name__ == "__main__":
    # Create an instance of SQLExecutor
    sql_executor = SQLExecutor()

    # Example SQL query
    query = """
        SELECT 
            (net_income / num_beds) as net_income,
            year,
            tot_fund_balance,
            ownership,
            acct_payable,
            acct_rec,
            total_liabilities,
            current_ratio,
            quick_ratio,
            (tot_days / tot_bed_days_avail) as fill_rate,
            overhead_nonsalary_costs,
            tot_salaries,
            cash,
            chow_last_12mos,
            region,
            state,
            state_lean,
            county_ssa,
            zip,
            tot_discharge_tot,
            def_score,
            fine_tot,
            fine_cnt,
            resfamcouncil,
            sprinkler_status,
            overall_rating,
            quality_rating,
            staffing_rating,
            rn_staffing_rating,
            aidhrd,
            vochrd,
            rnhrd,
            totlichrd,
            tothrd,
            pthrd,
            weighted_all_cycles_score,
            certification,
            snf_avg_stay_len_title_tot,
            pop_over_70,
            over_70_pct,
            has_outpatient,
            bedcert,
            contract_labor
            
        FROM Master
        WHERE year in (2020, 2021, 2022)
        ORDER BY prov_id;
        """

    # Execute the query and get the result as a DataFrame
    df = sql_executor.execute_query(query)

# Assuming NA contract_labor means the Nursing home does have contract_labor
df[['contract_labor']] = df[['contract_labor']].fillna(0)

In [6]:
df.describe()

Unnamed: 0,net_income,year,tot_fund_balance,acct_payable,acct_rec,total_liabilities,current_ratio,quick_ratio,fill_rate,overhead_nonsalary_costs,...,rnhrd,totlichrd,tothrd,pthrd,weighted_all_cycles_score,snf_avg_stay_len_title_tot,pop_over_70,over_70_pct,bedcert,contract_labor
count,36669.0,36669.0,36669.0,36669.0,36669.0,36669.0,36669.0,36669.0,36669.0,36669.0,...,36669.0,36669.0,36669.0,36669.0,28048.0,36669.0,36669.0,36669.0,36669.0,36669.0
mean,1244.034,2020.814421,30991.35,8145.473,18171.11,127787.0,10.474222,5.640781,0.725666,56280.24,...,0.680087,1.581166,3.87504,0.073617,64.156051,2.201931,67758.346014,0.119606,0.986599,7237.352978
std,26558.35,0.758252,309194.6,45220.35,127070.8,651211.8,1093.324057,652.646874,2.580607,56190.01,...,0.363639,0.430981,0.785252,0.069285,67.555344,5.896135,150044.498309,0.031257,0.299421,8853.565609
min,-1700283.0,2020.0,-9221566.0,-1550743.0,-425989.1,-1621231.0,-816.141414,-4793.230126,0.00274,57.67761,...,0.0,0.0,1.50341,0.0,0.0,0.006042,84.0,0.030065,0.00084,0.187726
25%,-4442.894,2020.0,-3767.515,1744.122,6210.2,9168.044,0.667167,0.006828,0.606571,35318.94,...,0.4893,1.38881,3.46626,0.03889,22.0,0.727882,3303.0,0.101151,1.0,3314.414141
50%,1636.36,2021.0,8707.318,3748.097,9974.263,20668.52,1.277315,0.137777,0.724408,47455.7,...,0.642225,1.508395,3.81008,0.06763,44.0,1.243656,16434.0,0.118884,1.0,5510.663043
75%,8427.323,2021.0,31614.71,7994.56,15434.54,56774.32,2.492608,0.81293,0.821013,62113.82,...,0.75416,1.67715,4.06787,0.0809,82.0,2.229352,66948.0,0.135104,1.0,9087.793814
max,1401623.0,2022.0,18699570.0,2027600.0,17010480.0,28969130.0,203696.688525,119671.689243,321.645833,2146148.0,...,7.38405,7.92944,16.32692,1.45044,1118.0,175.703125,973386.0,0.453558,5.555556,825561.760417


In [7]:
def bin_income(data):

    if data >= 6000:
        return '5'
    elif data >= 3000:
        return '4'
    elif data >= 1000:
        return '3'
    elif data >= -1000:
        return '2'
    elif data >= -4000:
        return '1'
    else:
        return '0'
    
df['net_income'] = df['net_income'].apply(bin_income)

In [8]:
from sklearn.model_selection import train_test_split

y = df['net_income']
df = df.drop(columns='net_income')

# Select columns where data type is text (object)
text_columns = df.select_dtypes(include=['object']).columns

# Create dummy variables for text columns
df_encoded = pd.get_dummies(df, columns=text_columns)

# Convert all columns to numeric type
df_encoded = df_encoded.apply(pd.to_numeric, errors='coerce')

# Extract target variable (y) and independent variables (X)
X = df_encoded.iloc[:, 2:].fillna(0)

In [9]:
# import umap.umap_ as umap

# # Assuming X_train is your training data

# # Define UMAP reducer
# reducer = umap.UMAP(n_components=8)

# # Fit and transform the training data
# umap_embeddings = reducer.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [10]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

num_classes = 6

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode target values into integers
y_train_encoded = label_encoder.fit_transform(y_train)

# Convert integer target values to one-hot encoded format
num_classes = len(label_encoder.classes_)
y_train_encoded = to_categorical(y_train_encoded, num_classes=num_classes)

# Define the architecture of the neural network
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Input layer
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(2, activation='relu'),
    Dense(num_classes, activation='softmax')  # Change activation function for multi-classification
])

# Compile the model with categorical_crossentropy loss
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # Change loss and metrics

# Define early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train, y_train_encoded, epochs=30, batch_size=6, validation_split=0.2, callbacks=[early_stop])

2024-04-24 11:27:07.496685: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-24 11:27:07.497229: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-24 11:27:07.502557: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-24 11:27:07.561725: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-24 11:27:09.907109: E external/local_xla/xla/stream_executor/cuda/cuda_driver.

Epoch 1/30
[1m2934/2934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.3105 - loss: 1.9398 - val_accuracy: 0.3188 - val_loss: 1.6557
Epoch 2/30
[1m2934/2934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 6ms/step - accuracy: 0.3179 - loss: 1.6597 - val_accuracy: 0.3188 - val_loss: 1.6556
Epoch 3/30
[1m2934/2934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - accuracy: 0.3211 - loss: 1.6566 - val_accuracy: 0.3188 - val_loss: 1.6559
Epoch 4/30
[1m2934/2934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.3186 - loss: 1.6607 - val_accuracy: 0.3188 - val_loss: 1.6556
Epoch 5/30
[1m2934/2934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - accuracy: 0.3180 - loss: 1.6609 - val_accuracy: 0.3188 - val_loss: 1.6556
Epoch 6/30
[1m2934/2934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.3249 - loss: 1.6563 - val_accuracy: 0.3188 - val_loss: 1.6555
Epoch 7/30

In [None]:
import numpy as np
from sklearn.metrics import r2_score, median_absolute_error, max_error, mean_absolute_error, mean_squared_error, explained_variance_score

# Apply the model on test data
y_pred = model.predict(X_test)

# R² Score (Coefficient of Determination)
r2 = r2_score(y_test, y_pred)

# Median Absolute Error
mabe = median_absolute_error(y_test, y_pred)

# Max Error
max_error_value = max_error(y_test, y_pred)

# Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)

# Mean Squared Error
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error
rmse = np.sqrt(mse)

# Explained Variance Score
explained_variance = explained_variance_score(y_test, y_pred)


print("Root Mean Squared Error: ${:.2f}".format(rmse))
print("Median Absolute Error: ${:.2f}".format(mabe))
print("Mean Absolute Error: ${:.2f}".format(mae))
print("Max Error: ${:.2f}".format(max_error_value))
print("R² Score (Coefficient of Determination): {:.5f}".format(r2))

In [None]:
import matplotlib.pyplot as plt

# Convert values to thousands
y_test_thousands = y_test / 1000
y_pred_thousands = y_pred / 1000

plt.figure(figsize=(8, 6))
plt.scatter(y_test_thousands, y_pred_thousands, color='blue', alpha=0.5, s=1)  # Scatter plot of actual vs. predicted values with smaller point size
plt.plot([y_test_thousands.min(), y_test_thousands.max()], [y_test_thousands.min(), y_test_thousands.max()], 'k--', lw=1)  # Plot the diagonal line
plt.xlabel('Actual (thousands)')
plt.ylabel('Predicted (thousands)')
plt.title('Actual vs. Predicted Values')

# Set axis limits to show the actual values
plt.xlim(left=y_test_thousands.min(), right=y_test_thousands.max())
plt.ylim(bottom=y_test_thousands.min(), top=y_test_thousands.max())

# Add light gridlines
plt.grid(color='gray', linestyle='--', linewidth=0.5)

plt.show()