In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the data
file = 'FoodAccessDemographics.csv'
df = pd.read_csv(file)
# check data type
df.dtypes 

CensusTract           int64
Pop2010               int64
PctLA               float64
PctWhite            float64
PctBlack            float64
PctAsian            float64
PctHispanic         float64
PctOtherMinority    float64
LA1and10              int64
dtype: object

In [4]:
# Find null values
for column in df.columns :
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column CensusTract has 0 null values
Column Pop2010 has 0 null values
Column PctLA has 0 null values
Column PctWhite has 4 null values
Column PctBlack has 4 null values
Column PctAsian has 4 null values
Column PctHispanic has 4 null values
Column PctOtherMinority has 4 null values
Column LA1and10 has 0 null values


In [47]:
# Remove null rows
#--Median Family Income has 748 null values which is 1.04%. This figure is insignificant compared to the importance of the
#feature towards prediction. So we deciide to drop null values than remove the column.
df=df.dropna()

In [48]:
#Find duplicate entries
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 0


# Define Model

In [49]:
# Create features
X = df.drop(columns=['CensusTract',"LA1and10"]) 

# Create target
y = df["LA1and10"]

# Split model
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


### Logistic Regression

In [50]:
# Define model
model = LogisticRegression(solver='lbfgs', random_state=1)
# fit model

# Fit model
model.fit(X_train, y_train)
# make predictions on the entire training dataset
predictions = model.predict(X_test)

# Evaluate the model
y_pred = model.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.974


In [51]:
cm_logreg = confusion_matrix(y_test,y_pred)
cm_logreg

array([[11031,   134],
       [  336,  6631]], dtype=int64)

In [52]:
#Run Logistic Regression on scaled data
# Fit model
model.fit(X_train_scaled, y_train)
# make predictions on the entire training dataset
predictions_scaled = model.predict(X_test_scaled)

# Evaluate the model
y_pred_scaled = model.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred_scaled):.3f}")

 Logistic regression model accuracy: 0.981


In [53]:
confusion_matrix(y_test,y_pred_scaled)

array([[11021,   144],
       [  209,  6758]], dtype=int64)

In [54]:
pd=pd.DataFrame({"Prediction": predictions, "Scaled Prediction":predictions_scaled, "Actual": y_test})
pd.head()

Unnamed: 0,Prediction,Scaled Prediction,Actual
26094,0,0,0
59902,0,0,0
31752,1,1,1
21514,0,0,0
11441,0,1,1


### Undersampling

In [55]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [56]:
Counter(y)

Counter({1: 27548, 0: 44979})

In [57]:
rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 20581, 1: 20581})

In [58]:
# Retry Logistic Regression with undersampled data
model.fit(X_resampled, y_resampled)
y_pred_resamp = model.predict(X_test)
print(f" Resampled logistic regression model accuracy: {accuracy_score(y_test,y_pred_resamp):.3f}")
confusion_matrix(y_test, y_pred_resamp)

 Resampled logistic regression model accuracy: 0.982


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([[10997,   168],
       [  156,  6811]], dtype=int64)

### SVM

In [59]:
# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred_svm = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred_svm):.3f}")

confusion_matrix(y_test, y_pred_svm)

 SVM model accuracy: 0.982


array([[11002,   163],
       [  167,  6800]], dtype=int64)

In [60]:
# Create the SVM model
svm_rbf = SVC()

# Train the model
svm_rbf.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred_svmrbf = svm_rbf.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred_svmrbf):.3f}")

confusion_matrix(y_test, y_pred_svmrbf)

 SVM model accuracy: 0.995


array([[11145,    20],
       [   70,  6897]], dtype=int64)

### Random Forest

In [61]:
# Create Random Forest Classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [62]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)
# Making predictions using the testing data.
rf_y_pred = rf_model.predict(X_test_scaled)
# Evaluate the model
print(f"Random Forest model accuracy: {accuracy_score(y_test,rf_y_pred):.3f}")

Random Forest model accuracy: 0.998


In [63]:
confusion_matrix(y_test,rf_y_pred)

array([[11143,    22],
       [   22,  6945]], dtype=int64)

In [64]:
rf_model.feature_importances_


Pop2010               int64
PctLA               float64
PctWhite            float64
PctBlack            float64
PctAsian            float64
PctHispanic         float64
PctOtherMinority    float64




array([0.04668019, 0.90476039, 0.01152447, 0.00612747, 0.00951762,
       0.011283  , 0.01010687])

### Neural Network

In [65]:
# import tensorflow as tf

In [66]:
# # Define neural network model
# nn_model = tf.keras.models.Sequential()
# nn_model.add(tf.keras.layers.Dense(units=14, activation="relu", input_dim=7))
# #nn_model.add(tf.keras.layers.Dense(units=6, activation="relu"))
# nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# # Compile the Sequential model together and customize metrics
# nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# # Train the model
# fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# # Evaluate the model using the test data
# model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [67]:
print(f'Logistic Regression: {accuracy_score(y_test,y_pred):.3f}')
print(f'Scaled Logistic Regression: {accuracy_score(y_test,y_pred_scaled):.3f}')
print(f'Undersampled Logistic Regression: {accuracy_score(y_test,y_pred_resamp):.3f}')
print(f'SVM Accuracy: {accuracy_score(y_test,y_pred_svm):.3f}')

Logistic Regression: 0.974
Scaled Logistic Regression: 0.981
Undersampled Logistic Regression: 0.982
SVM Accuracy: 0.982
