In [59]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer


# Optional: Configure matplotlib for inline display in Jupyter
%matplotlib inline

#Loading the Data
df = pd.read_csv('brfss_2022_final.csv')

# Display the first few rows of the DataFrame
print(df.head(10))

   DRNK3GE5  SLEPTIM1  EMTSUPRT  INCOME3  SEXVAR
0       NaN       8.0       1.0     99.0       2
1       NaN       6.0       1.0      5.0       2
2       NaN       5.0       2.0     10.0       2
3       NaN       7.0       1.0     77.0       2
4      88.0       9.0       1.0      5.0       2
5       NaN       7.0       2.0     99.0       1
6      88.0       7.0       1.0      8.0       2
7       NaN       8.0       1.0      7.0       2
8       NaN       6.0       1.0      7.0       2
9      88.0       7.0       1.0      7.0       2


In [45]:
# Create Dummy Features for Categorical Variables

df = pd.get_dummies(df, columns=['SEXVAR'], drop_first=True)

In [47]:
print(df.head())

   DRNK3GE5  SLEPTIM1  EMTSUPRT  INCOME3  SEXVAR_2
0       NaN       8.0       1.0     99.0      True
1       NaN       6.0       1.0      5.0      True
2       NaN       5.0       2.0     10.0      True
3       NaN       7.0       1.0     77.0      True
4      88.0       9.0       1.0      5.0      True


In [49]:
# Standardize the numeric columns so they are on the same scale using StandardScaler from sklearn

# Select numeric columns
numeric_columns = ['DRNK3GE5', 'SLEPTIM1', 'EMTSUPRT', 'INCOME3']

# Make a Scaler object
scaler = preprocessing.StandardScaler()

# Fit and transform the numeric data
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [51]:
# Split into Testing and Training Datasets

# Split data into training and test sets (80% training, 20% test)
X = df.drop('DRNK3GE5', axis=1)  # Features (replace TARGET_VARIABLE with your target)
y = df['DRNK3GE5']  # Target (replace TARGET_VARIABLE with your target)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
print(df.columns)

Index(['DRNK3GE5', 'SLEPTIM1', 'EMTSUPRT', 'INCOME3', 'SEXVAR_2'], dtype='object')


In [55]:
print(df['DRNK3GE5'].unique())

[        nan  0.61140611 -1.6533828  -1.73426812 -1.70730634 -1.57249748
 -1.68034457  0.31482661 -1.3568033  -1.22199443 -1.59945925 -0.95237671
 -1.62642102 -1.54553571 -1.49161216 -0.97933848 -1.43768862  0.90798561
 -1.08718557 -1.38376507 -1.19503266  0.28786484 -1.32984152 -1.41072684
 -1.30287975 -1.00630025 -1.0602238  -1.51857393 -1.11414734 -1.27591798
 -1.16807089 -1.46465039 -1.03326202 -0.79060607 -0.87149139 -0.92541493
 -0.54795012 -0.68275898 -1.14110912 -0.14352353  0.09913243 -1.24895621
 -0.41314125 -0.52098834 -0.30529416 -0.27833239  0.1260942   0.26090307]


In [63]:
# Using a Random Forest Model for training to predict the target var: DRNK3GE5

# Remove rows with missing values
df_cleaned = df.dropna()

# Prepare the data
X = df_cleaned.drop('DRNK3GE5', axis=1)  # Dropping target variable from features
y = df_cleaned['DRNK3GE5']  # The target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate the Mean Squared Error for evaluation
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} - MSE: {mse}")

Linear Regression - MSE: 1325.1794615018318
Random Forest Regressor - MSE: 1335.339509576945


- Lower MSE indicates a better model performance, so Linear Regression performed slightly better than Random Forest Regressor in this case.
- The MSE values are relatively high, which suggests that there might be room for improvement, possibly through additional data preprocessing, feature engineering, or model tuning.

## Feature Engineering
Looking to see if there are new features that can reveal hidden patterns in this dataset. 

In [67]:
# # Create a new feature that indicates whether 'DRNK3GE5' is missing
df['DRNK3GE5_missing'] = df['DRNK3GE5'].isna().astype(int)

# Impute missing values with the median or mean
df['DRNK3GE5'].fillna(df['DRNK3GE5'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DRNK3GE5'].fillna(df['DRNK3GE5'].median(), inplace=True)


In [69]:
# One-hot encode 'SEXVAR' two categories: 1 and 2 for male and female)
df = pd.get_dummies(df, columns=['SEXVAR'], drop_first=True)

In [71]:
# INCOME3 feature contains a wide range of values, which could make models less effective if it is highly skewed. 
# Trying log-transform it to make it more normally distributed.
df['INCOME3_log'] = np.log1p(df['INCOME3'])

In [73]:
# SLEPTIM1 feature represents sleep time and might benefit from binning. 
# Classify sleep hours into categories "short sleep", "normal sleep", and "long sleep"
df['sleep_category'] = pd.cut(df['SLEPTIM1'], bins=[0, 5, 7, 10], labels=['short', 'normal', 'long'])

In [75]:
# Hoping for an interaction between SLEPTIM1 and INCOME3
# Maybe people with better income sleep more
df['sleep_income_interaction'] = df['SLEPTIM1'] * df['INCOME3']

In [77]:
#EMTSUPRT variable might represent support could create a binary feature that indicates whether someone has adequate support or not.
df['has_support'] = df['EMTSUPRT'].apply(lambda x: 1 if x == 1 else 0)

In [81]:
# Check for missing data again on these new features
df = df.dropna(subset=['DRNK3GE5'])  # Drop rows with missing target values

In [93]:
# Update the feature set to include in splitting data into predictors and targets
X = df[['INCOME3_log', 'SLEPTIM1', 'sleep_category', 'sleep_income_interaction', 'has_support', 'SEXVAR_2']]
y = df['DRNK3GE5']

In [97]:
# Convert 'sleep_category' to a categorical type
df['sleep_category'] = pd.Categorical(df['sleep_category'])

# Now apply the .cat.codes to encode the categories numerically
df['sleep_category'] = df['sleep_category'].cat.codes

In [99]:
# Finalize the feature set for X and y
X = df.drop(columns=['DRNK3GE5'])  # Drop target variable from features
y = df['DRNK3GE5']  # Target variable

In [103]:
# scale one more time to prevent bias towards variables with larger values
scaler = StandardScaler()

# Fit and transform the training data, then transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [105]:
# train models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Evaluate performance using Mean Squared Error (MSE)
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} - MSE: {mse}")

Linear Regression - MSE: 1325.1794615018318
Random Forest Regressor - MSE: 1335.5475653299445


## Conclusion

First Set of Results:
Linear Regression - MSE: 1325.1794615018318
Random Forest Regressor - MSE: 1335.339509576945

New Set of Results:
Linear Regression - MSE: 1325.1794615018318 (Same)
Random Forest Regressor - MSE: 1335.5475653299445

Comparison:
Linear Regression: The MSE for Linear Regression remains exactly the same between the two sets of results.
Random Forest Regressor: The MSE for the Random Forest model has slightly increased (from 1335.34 to 1335.55).

Conclusion:
The results are very similar between the two sets, with only a small increase in MSE for the Random Forest Regressor in the second round. This suggests that the feature engineering and scaling that was done did NOT significantly improve the model performance in this case.

What happened:
Feature Engineering Impact: The added features (such as interaction terms and categorical encoding) didn't provide a strong enough signal to improve the performance.

Overfitting or Underfitting: The models might be underfitting or overfitting to the data.

This data is not providing any signficant findings in the relationship between drinking, emotional support, gender, income, and sleep. 