In [53]:
import pandas as pd

In [54]:
# Read each cleaned CSV into its own DataFrame
col_data = pd.read_csv('../data/cost_of_living_cleaned.csv')
min_wage_data = pd.read_csv('../data/minimum_wage_cleaned.csv')
bls_data = pd.read_csv('../data/bls_cleaned.csv')
provider_data = pd.read_csv('../data/provider_cleaned.csv')

# Sanity check: print a heading followed by the first rows of every table
previews = (
    ("Cost of Living Data:", col_data),
    ("\nMinimum Wage Data:", min_wage_data),
    ("\nBLS Data:", bls_data),
    ("\nProvider Data:", provider_data),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())

Cost of Living Data:
   rank state  cost_of_living_index  grocery  housing  utilities  \
0     1    WV                  84.8     98.4     61.3       96.4   
1     2    KS                  86.7     94.6     70.7      100.8   
2     3    MS                  87.5     96.8     72.2       89.4   
3     4    OK                  87.9     96.5     74.7       96.5   
4     5    AL                  88.8     97.8     71.4      100.5   

   transportation  health   misc.  
0            93.1     96.8   91.2  
1            92.2     92.6   90.3  
2            90.1     95.6   93.8  
3            90.6     99.3   90.5  
4            90.3     90.1   95.8  

Minimum Wage Data:
  state  minimum_wage  increases_planned
0    AL          7.25                  0
1    AK         11.91                  0
2    AZ         14.70                  0
3    AR         11.00                  0
4    CA         16.50                  0

BLS Data:
  area_title state occ_code                            occ_title  tot_emp  \


In [55]:
# Per-column null counts for each loaded table
null_reports = (
    ("Missing values in Cost of Living Data:", col_data),
    ("\nMissing values in Minimum Wage Data:", min_wage_data),
    ("\nMissing values in BLS Data:", bls_data),
    ("\nMissing values in Provider Data:", provider_data),
)
for heading, frame in null_reports:
    print(heading)
    print(frame.isna().sum())

# Number of provider rows that exactly repeat an earlier row
print("\nDuplicate rows in Provider Data:")
print(provider_data.duplicated().sum())

Missing values in Cost of Living Data:
rank                    0
state                   0
cost_of_living_index    0
grocery                 0
housing                 0
utilities               0
transportation          0
health                  0
misc.                   0
dtype: int64

Missing values in Minimum Wage Data:
state                0
minimum_wage         0
increases_planned    0
dtype: int64

Missing values in BLS Data:
area_title            0
state                 0
occ_code              0
occ_title             0
tot_emp               0
hourly_median_wage    0
dtype: int64

Missing values in Provider Data:
state                0
provider_name        0
overall_rating     143
staffing_rating    198
qm_rating          265
latitude             0
longitude            0
dtype: int64

Duplicate rows in Provider Data:
1


In [57]:
# keep=False flags every member of a duplicate group, not only the repeats,
# so the matching rows can be inspected side by side
is_duplicated = provider_data.duplicated(keep=False)
duplicate_rows = provider_data.loc[is_duplicated]
print(duplicate_rows)

     state provider_name  overall_rating  staffing_rating  qm_rating  \
4457    IA  PRAIRIE GATE             NaN              NaN        NaN   
4458    IA  PRAIRIE GATE             NaN              NaN        NaN   

      latitude  longitude  
4457   41.2538    -95.811  
4458   41.2538    -95.811  


In [58]:
# Remove fully identical provider rows, retaining the first occurrence of each
provider_data = provider_data.drop_duplicates(keep='first')

# Confirm that no exact duplicates remain
remaining_duplicates = provider_data.duplicated().sum()
print("Number of duplicates after cleaning:", remaining_duplicates)

Number of duplicates after cleaning: 0


In [59]:
# Non-null counts per column: minimum-wage table first, then cost-of-living
for frame in (min_wage_data, col_data):
    print(frame.count())

state                55
minimum_wage         55
increases_planned    55
dtype: int64
rank                    52
state                   52
cost_of_living_index    52
grocery                 52
housing                 52
utilities               52
transportation          52
health                  52
misc.                   52
dtype: int64


In [72]:
# Column dtypes followed by a preview of the first rows of the BLS table
print(bls_data.dtypes, bls_data.head(), sep="\n")

area_title             object
state                  object
occ_code               object
occ_title              object
tot_emp                 int64
hourly_median_wage    float64
weighted_wage         float64
dtype: object
  area_title state occ_code                            occ_title  tot_emp  \
0    Alabama    AL  31-1120  Home Health and Personal Care Aides    19910   
1    Alabama    AL  31-1131                   Nursing Assistants    22430   
2    Alabama    AL  31-1132                            Orderlies      500   
3     Alaska    AK  31-1120  Home Health and Personal Care Aides     5660   
4     Alaska    AK  31-1131                   Nursing Assistants     1710   

   hourly_median_wage  weighted_wage  
0               11.45       227969.5  
1               14.69       329496.7  
2               14.38         7190.0  
3               17.81       100804.6  
4               21.87        37397.7  


In [73]:
# Median hourly wage scaled by employment; summed per state in the aggregation
# step to build an employment-weighted average
bls_data['weighted_wage'] = bls_data['hourly_median_wage'].mul(bls_data['tot_emp'])

In [75]:
# Sum the employment-weighted wages and the employment counts within each state
state_totals = bls_data.groupby('state', as_index=False)[['weighted_wage', 'tot_emp']].sum()

# Employment-weighted average of the occupation-level median hourly wages
state_totals['hourly_median_wage'] = state_totals['weighted_wage'].div(state_totals['tot_emp'])

# Keep only the state key and the derived wage column
bls_agg = state_totals[['state', 'hourly_median_wage']]

In [76]:
# Heading, then up to the first 60 rows rendered as the cell's rich output
print("Weighted BLS Data:")
bls_agg.head(n=60)

Weighted BLS Data:


Unnamed: 0,state,hourly_median_wage
0,AK,18.752083
1,AL,13.180584
2,AR,14.121912
3,AZ,16.632967
4,CA,16.802503
5,CO,18.405378
6,CT,18.261715
7,DC,18.716331
8,DE,15.860027
9,FL,16.271242


In [77]:
# Heading and the first rows of the per-state table, one per line
print("Aggregated BLS Data (Weighted Averages):", bls_agg.head(), sep="\n")

Aggregated BLS Data (Weighted Averages):
  state  hourly_median_wage
0    AK           18.752083
1    AL           13.180584
2    AR           14.121912
3    AZ           16.632967
4    CA           16.802503


In [78]:
# Persist the per-state weighted wages without the positional index column
weighted_bls_path = '../data/bls_weighted.csv'
bls_agg.to_csv(weighted_bls_path, index=False)

In [80]:
# (rows, columns) of the per-state table built by the groupby on 'state'
bls_agg.shape

(54, 2)

In [81]:
# Merge Cost of Living and Minimum Wage Data.
# The inner join keeps only the states present in BOTH tables, so a state code
# that appears in just one of them is dropped from the result.
# validate='one_to_one' makes pandas raise a MergeError if either table repeats
# a state code, instead of silently multiplying rows in the merged frame.
state_level_data = pd.merge(
    col_data,
    min_wage_data,
    on='state',
    how='inner',
    validate='one_to_one',
)

# Verify the result
print("State-Level Data (Step 1):")
print(state_level_data.head())

State-Level Data (Step 1):
   rank state  cost_of_living_index  grocery  housing  utilities  \
0     1    WV                  84.8     98.4     61.3       96.4   
1     2    KS                  86.7     94.6     70.7      100.8   
2     3    MS                  87.5     96.8     72.2       89.4   
3     4    OK                  87.9     96.5     74.7       96.5   
4     5    AL                  88.8     97.8     71.4      100.5   

   transportation  health   misc.  minimum_wage  increases_planned  
0            93.1     96.8   91.2          8.75                  0  
1            92.2     92.6   90.3          7.25                  0  
2            90.1     95.6   93.8          7.25                  0  
3            90.6     99.3   90.5          7.25                  0  
4            90.3     90.1   95.8          7.25                  0  


In [82]:
# (rows, columns) of the merged cost-of-living / minimum-wage table
state_level_data.shape

(52, 11)

In [83]:
# Merge BLS Weighted Data.
# This cell rebinds state_level_data from itself, so running it a second time
# would merge another copy of the wage column and leave
# hourly_median_wage_x / hourly_median_wage_y behind. Dropping any existing copy
# first keeps the cell idempotent; on a first run the column is absent and
# errors='ignore' turns the drop into a no-op.
state_level_data = pd.merge(
    state_level_data.drop(columns=['hourly_median_wage'], errors='ignore'),
    bls_agg,
    on='state',
    how='inner',
)

# Verify the result
print("State-Level Data (Step 2):")
print(state_level_data.head())

State-Level Data (Step 2):
   rank state  cost_of_living_index  grocery  housing  utilities  \
0     1    WV                  84.8     98.4     61.3       96.4   
1     2    KS                  86.7     94.6     70.7      100.8   
2     3    MS                  87.5     96.8     72.2       89.4   
3     4    OK                  87.9     96.5     74.7       96.5   
4     5    AL                  88.8     97.8     71.4      100.5   

   transportation  health   misc.  minimum_wage  increases_planned  \
0            93.1     96.8   91.2          8.75                  0   
1            92.2     92.6   90.3          7.25                  0   
2            90.1     95.6   93.8          7.25                  0   
3            90.6     99.3   90.5          7.25                  0   
4            90.3     90.1   95.8          7.25                  0   

   hourly_median_wage  
0           13.802066  
1           15.113295  
2           12.238887  
3           14.094471  
4           13.180584  

In [84]:
# Merge provider data with state-level information.
# The left join keeps every provider row; providers whose state is missing from
# state_level_data get NaN in the state-level columns.
# validate='many_to_one' makes pandas raise a MergeError if state_level_data
# holds a state code more than once, which would otherwise duplicate provider
# rows in the enriched output.
provider_enriched = pd.merge(
    provider_data,
    state_level_data,
    on='state',
    how='left',
    validate='many_to_one',
)

# Verify the result
print("Provider-Enriched Data:")
print(provider_enriched.head())

# Save the dataset
provider_enriched.to_csv('../data/final_provider_data.csv', index=False)

Provider-Enriched Data:
  state                                provider_name  overall_rating  \
0    AL                     BURNS NURSING HOME, INC.             2.0   
1    AL               COOSA VALLEY HEALTHCARE CENTER             4.0   
2    AL                   HIGHLANDS HEALTH AND REHAB             4.0   
3    AL  EASTVIEW REHABILITATION & HEALTHCARE CENTER             2.0   
4    AL                PLANTATION MANOR NURSING HOME             2.0   

   staffing_rating  qm_rating  latitude  longitude  rank  \
0              4.0        4.0   34.5149    -87.736   5.0   
1              3.0        3.0   33.1637    -86.254   5.0   
2              3.0        2.0   34.6611    -86.047   5.0   
3              1.0        2.0   33.5595    -86.722   5.0   
4              4.0        2.0   33.3221    -87.034   5.0   

   cost_of_living_index  grocery  housing  utilities  transportation  health   \
0                  88.8     97.8     71.4      100.5            90.3     90.1   
1                  8

In [85]:
# Per-state means of the provider ratings and of the coordinates
# (the averaged latitude/longitude are optional, for visualization)
rating_and_location_cols = [
    'overall_rating',
    'staffing_rating',
    'qm_rating',
    'latitude',
    'longitude',
]
provider_agg = provider_data.groupby('state', as_index=False)[rating_and_location_cols].mean()

# Attach the provider averages to the state-level table; the inner join keeps
# only states present in both frames
state_level_aggregated = state_level_data.merge(provider_agg, on='state', how='inner')

# Preview the combined table
print("State-Level Aggregated Data:")
print(state_level_aggregated.head())

# Write the state-level dataset to disk without the index column
state_level_aggregated.to_csv('../data/final_state_level_data.csv', index=False)

State-Level Aggregated Data:
   rank state  cost_of_living_index  grocery  housing  utilities  \
0     1    WV                  84.8     98.4     61.3       96.4   
1     2    KS                  86.7     94.6     70.7      100.8   
2     3    MS                  87.5     96.8     72.2       89.4   
3     4    OK                  87.9     96.5     74.7       96.5   
4     5    AL                  88.8     97.8     71.4      100.5   

   transportation  health   misc.  minimum_wage  increases_planned  \
0            93.1     96.8   91.2          8.75                  0   
1            92.2     92.6   90.3          7.25                  0   
2            90.1     95.6   93.8          7.25                  0   
3            90.6     99.3   90.5          7.25                  0   
4            90.3     90.1   95.8          7.25                  0   

   hourly_median_wage  overall_rating  staffing_rating  qm_rating   latitude  \
0           13.802066        2.491667         2.525000   2.69