# In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras impor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Load and preprocess data
def load_and_preprocess_data(filepath):
    """Load the county-year CSV and derive the modelling frame.

    Steps, in order: drop rows without a ``percent_food_insecure`` value,
    integer-encode ``rural_urban`` (when the column exists), add one- and
    two-year lags of the target per county, drop rows lacking either lag, and
    keep only the identifier columns, the target and whichever candidate
    features are present in the file.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        DataFrame sorted by ``fips`` then ``year`` whose columns are ``year``,
        ``fips``, ``county.x``, ``state.x``, ``percent_food_insecure``
        followed by the available feature columns.

    Raises:
        KeyError: If an identifier column or the target column is missing.
    """
    df = pd.read_csv(filepath)

    # Rows without a target value can neither be trained on nor lagged from.
    df = df.dropna(subset=['percent_food_insecure'])

    # Every other candidate feature is optional (see the availability filter
    # below), so do not fail when this one is absent either.
    if 'rural_urban' in df.columns:
        df['rural_urban'] = pd.factorize(df['rural_urban'])[0]

    # Lag features. A plain positional shift would silently reach across
    # missing years (including the ones removed by the dropna above), so a lag
    # is kept only when the shifted row is exactly `lag` years earlier.
    # Assumes `year` is a numeric calendar year.
    df = df.sort_values(['fips', 'year'])
    for lag in (1, 2):
        by_county = df.groupby('fips')
        lagged_value = by_county['percent_food_insecure'].shift(lag)
        lagged_year = by_county['year'].shift(lag)
        df[f'food_insecure_lag{lag}'] = lagged_value.where(
            df['year'] - lagged_year == lag
        )

    # Rows at the start of a county's history (or next to a gap) have no lag.
    df = df.dropna(subset=['food_insecure_lag1', 'food_insecure_lag2'])

    # Candidate features - adjust as needed
    features = [
        'percent_household_income_required_for_child_care_expenses',
        'food_environment_index',
        'percent_fair_or_poor_health',
        'percent_unemployed',
        'percent_children_in_poverty',
        'percent_severe_housing_problems',
        'percent_completed_high_school',
        'percent_frequent_mental_distress',
        'percent_uninsured_children',
        'percent_disconnected_youth',
        'spending_per_pupil',
        'school_funding_adequacy',
        'high_school_graduation_rate',
        'median_household_income',
        'gender_pay_gap',
        'percent_enrolled_in_free_or_reduced_lunch',
        'percent_households_with_severe_cost_burden',
        'percent_rural',
        'percent_65_and_over',
        'percent_not_proficient_in_english',
        'segregation_index',
        'teen_birth_rate',
        'percent_children_in_single_parent_households',
        'percent_low_birthweight',
        'percent_black',
        'rural_urban',
        'food_insecure_lag1',
        'food_insecure_lag2',
    ]

    # Keep only the features that actually exist in this file.
    # NOTE(review): the retained feature columns may still contain NaN;
    # nothing in this function imputes or drops them.
    available_features = [f for f in features if f in df.columns]
    df = df[['year', 'fips', 'county.x', 'state.x', 'percent_food_insecure'] + available_features]

    return df

# Prepare data for LSTM
def prepare_lstm_data(df, n_steps=3, scaler=None):
    """Turn the county-year frame into supervised look-back windows.

    For every county, each run of ``n_steps`` consecutive rows (ordered by
    ``year``) becomes one input window and the ``percent_food_insecure`` value
    of the row that follows becomes its target.

    Args:
        df: Frame holding ``year``, ``fips``, ``county.x``, ``state.x``,
            ``percent_food_insecure`` plus the feature columns.
        n_steps: Number of rows in each look-back window.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            omitted, a ``MinMaxScaler`` is fitted on every feature row of
            ``df``.

    Returns:
        Tuple ``(X, y, scaler)``: ``X`` has shape
        ``(n_windows, n_steps, n_features)``, ``y`` has shape ``(n_windows,)``
        and ``scaler`` is the scaler that was applied. When no county has more
        than ``n_steps`` rows, ``X`` and ``y`` are empty arrays of those
        shapes. If ``df`` has no rows and no scaler was supplied, the returned
        scaler is unfitted.
    """
    id_columns = ['year', 'fips', 'county.x', 'state.x', 'percent_food_insecure']
    feature_frame = df.drop(columns=id_columns)
    n_features = feature_frame.shape[1]

    # One scaler shared by all counties: the caller receives a single scaler
    # back and reuses it on other counties' rows, so it has to describe the
    # same transformation that produced X.
    if scaler is None:
        scaler = MinMaxScaler()
        if len(feature_frame) > 0:
            scaler.fit(feature_frame)

    X, y = [], []

    # sort=False keeps counties in order of first appearance.
    for _, county_data in df.groupby('fips', sort=False):
        county_data = county_data.sort_values('year')

        # n_steps input rows plus one target row are needed for one window.
        if len(county_data) <= n_steps:
            continue

        scaled_features = scaler.transform(county_data.drop(columns=id_columns))
        target = county_data['percent_food_insecure'].values

        for i in range(n_steps, len(county_data)):
            X.append(scaled_features[i - n_steps:i])
            y.append(target[i])

    if not X:
        # Keep the 3-D layout so callers can still read X.shape[1:].
        return np.empty((0, n_steps, n_features)), np.empty((0,)), scaler

    return np.array(X), np.array(y), scaler

# Build LSTM model
def build_lstm_model(input_shape):
    """Assemble and compile the two-layer LSTM regressor.

    Args:
        input_shape: Shape of one sample, passed to the first LSTM layer as
            its ``input_shape``.

    Returns:
        A compiled ``Sequential`` model: LSTM(50) -> Dropout(0.2) ->
        LSTM(50) -> Dropout(0.2) -> Dense(1), using the Adam optimizer and
        mean-squared-error loss.
    """
    network = Sequential()

    # First recurrent layer returns the full sequence so the second LSTM
    # receives one vector per time step.
    network.add(LSTM(50, activation='relu', input_shape=input_shape, return_sequences=True))
    network.add(Dropout(0.2))

    # Second recurrent layer returns only its final state.
    network.add(LSTM(50, activation='relu'))
    network.add(Dropout(0.2))

    # Single linear unit for the regression output.
    network.add(Dense(1))

    network.compile(optimizer='adam', loss='mse')
    return network

# Main execution
# Identifier and target columns that are never fed to the network.
_NON_FEATURE_COLUMNS = ['year', 'fips', 'county.x', 'state.x', 'percent_food_insecure']

# Location used when main() is called without an explicit path.
_DEFAULT_DATA_PATH = 'C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 698\\Masters Project\\final_data.csv'


def _iter_tail_windows(frame, scaler, n_steps, end_year):
    """Yield ``(fips, rows, window)`` for counties whose history ends at *end_year*.

    ``rows`` is the county's slice of ``frame`` sorted by year and ``window``
    is the scaled feature matrix of its last ``n_steps`` rows. Counties with
    fewer than ``n_steps`` rows, or whose latest row is not ``end_year``, are
    skipped.
    """
    for fips, rows in frame.groupby('fips', sort=False):
        rows = rows.sort_values('year')
        if len(rows) < n_steps or rows['year'].iloc[-1] != end_year:
            continue
        recent_features = rows.tail(n_steps).drop(columns=_NON_FEATURE_COLUMNS)
        yield fips, rows, scaler.transform(recent_features)


def main(filepath=_DEFAULT_DATA_PATH):
    """Train the LSTM, report errors, and write the 2025 county forecasts.

    The model is trained on windows whose target year is before 2024, scored
    on the 2024 values, and then applied to each county's latest three years
    to predict 2025. Results are written to
    ``food_insecurity_predictions_lstm.csv`` and two plots are shown.

    Args:
        filepath: CSV to load; defaults to the original hard-coded location.

    Raises:
        ValueError: If no training window can be built from the data.
    """
    last_year = 2024  # most recent observed year; also the hold-out year
    n_steps = 3       # number of time steps to look back

    # Load and preprocess data
    df = load_and_preprocess_data(filepath)

    # Split into train and test
    train = df[df['year'] < last_year]
    test = df[df['year'] == last_year]

    # Prepare LSTM training data
    X_train, y_train, scaler = prepare_lstm_data(train, n_steps)
    if len(X_train) == 0:
        raise ValueError(
            f"No training windows: no county has more than {n_steps} rows before {last_year}"
        )

    # The hold-out frame holds a single year per county, so it cannot form a
    # look-back window on its own. Each hold-out sample is instead the
    # n_steps years preceding the hold-out year, scored against that year's
    # observed value.
    actual = test.drop_duplicates('fips').set_index('fips')['percent_food_insecure']
    holdout = [
        (window, actual[fips])
        for fips, _, window in _iter_tail_windows(train, scaler, n_steps, last_year - 1)
        if fips in actual.index
    ]

    # Build and train model
    model = build_lstm_model((X_train.shape[1], X_train.shape[2]))
    history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                        validation_split=0.2, verbose=1)

    # Evaluate model (predictions come back as (n, 1); flatten to match y)
    train_pred = model.predict(X_train).ravel()
    print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, train_pred))}")

    if holdout:
        X_test = np.array([window for window, _ in holdout])
        y_test = np.array([value for _, value in holdout])
        test_pred = model.predict(X_test).ravel()
        print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, test_pred))}")
    else:
        print("Test RMSE: skipped (no county has a full look-back window before the hold-out year)")

    # Plot training history
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.legend()
    plt.title('Model Training History')
    plt.show()

    # Prepare 2025 predictions: the latest n_steps years (ending in 2024) of
    # each county form the input window. Counties without a 2024 row are
    # skipped because they have no "current" value to compare against.
    counties_2025, county_names, current_values, windows = [], [], [], []
    for fips, rows, window in _iter_tail_windows(df, scaler, n_steps, last_year):
        counties_2025.append(fips)
        county_names.append(rows['county.x'].iloc[0])
        current_values.append(rows['percent_food_insecure'].iloc[-1])
        windows.append(window)

    if not windows:
        print("No county has enough recent data to forecast 2025; nothing written.")
        return

    # One batched call instead of one predict() per county.
    predictions_2025 = model.predict(np.array(windows)).ravel()

    # Create results dataframe
    results_2025 = pd.DataFrame({
        'fips': counties_2025,
        'county': county_names,
        'current_food_insecure': current_values,
        'pred_2025_food_insecure': predictions_2025
    })

    # Calculate predicted change and risk categories
    results_2025['predicted_change'] = results_2025['pred_2025_food_insecure'] - results_2025['current_food_insecure']
    # Bottom quartile = low, middle half = medium, top quartile = high.
    results_2025['risk_category'] = pd.qcut(results_2025['pred_2025_food_insecure'],
                                            q=[0, 0.25, 0.75, 1],
                                            labels=['Low Risk', 'Medium Risk', 'High Risk'])

    # Sort by highest predicted food insecurity
    results_2025 = results_2025.sort_values('pred_2025_food_insecure', ascending=False)

    # Save results
    results_2025.to_csv('food_insecurity_predictions_lstm.csv', index=False)

    # Visualize top 10 high-risk counties
    top_counties = results_2025.head(10)
    plt.figure(figsize=(12, 6))
    plt.bar(top_counties['county'],
            top_counties['pred_2025_food_insecure'],
            color='firebrick')
    plt.title('Top 10 High-Risk Counties for 2025 Food Insecurity')
    plt.xlabel('County')
    plt.ylabel('Predicted Food Insecurity Rate')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    main()

# Output: UnboundLocalError: cannot access local variable 'scaler' where it is not associated with a value