In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


In [2]:
# Read the merged voter file and preview its first rows.
merged_csv_path = '../data/oh_merged_till_2020.csv'
merged = pd.read_csv(merged_csv_path)
merged.head()

  merged = pd.read_csv('../data/oh_merged_till_2020.csv')


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,LALVOTERID,Residence_Addresses_CensusBlock,Residence_Families_FamilyID,Residence_Addresses_Property_Land_Square_Footage,Residence_Addresses_Property_Type,Mailing_Addresses_State,Mailing_Addresses_Zip,Voters_Gender,...,General_2012_11_06,General_2016_11_08,General_2020_11_03,can_vote_2000-11-07,can_vote_2004-11-02,can_vote_2008-11-04,can_vote_2012-11-06,can_vote_2016-11-08,can_vote_2020-11-03,vote_life
0,0,0,LALOH483988945,2034.0,R021844817,13000.0,Apartment,MA,2119.0,M,...,,Y,Y,False,False,False,False,True,True,1724
1,1,1,LALOH539415345,3006.0,R015691344,14000.0,Apartment,MA,2135.0,M,...,,,Y,False,False,False,False,False,True,43
2,2,2,LALOH309002,1026.0,R000431800,56000.0,Residential,ME,3904.0,M,...,,,Y,False,True,True,True,True,True,6785
3,3,3,LALOH453346132,1000.0,R005285786,10000.0,Residential,MD,20603.0,F,...,,Y,,False,False,False,False,True,True,2247
4,4,4,LALOH484000085,3002.0,R020782033,7000.0,Residential,MD,21201.0,,...,,Y,Y,False,False,False,False,True,True,1486


In [3]:
# Select predictors and response
selected_features = [
    "Voters_Age",
    "Voters_Active",
    "Parties_Description",
    "ConsumerData_Inferred_HH_Rank",
    "ConsumerData_Presence_Of_Children_in_HH",
    "Residence_HHParties_Description",
    "Residence_Families_HHVotersCount",
    "ConsumerData_Assimilation_Status",
    "ConsumerData_MAID_Available",
    "Residence_Addresses_Density",
    "ConsumerData_Estimated_Income_Amount",
    # "ConsumerData_Household_Net_Worth",
    "Voters_Gender",
    "ConsumerData_Education_of_Person",
    "vote_life"
]

# Every column whose name starts with 'General' (the general-election flags).
elections = [col for col in merged.columns if col.startswith('General')]

# Subset first, then copy: only the modelling columns are duplicated rather
# than the entire merged frame, and `data` is an independent frame, so later
# column assignments never write back into `merged`.
data = merged[selected_features + elections].copy()


In [None]:
def _one_hot(frame, column, prefix, fill_value=None, drop_first=False):
    """Return ``frame`` with ``column`` replaced by one-hot indicator columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing ``column``; it is not modified.
    column : str
        Categorical column to encode. It is removed from the result.
    prefix : str
        Prefix handed to ``pd.get_dummies`` for the new column names.
    fill_value : optional
        When given, NaNs in ``column`` are replaced by this value before
        encoding. When omitted, NaNs are passed to ``pd.get_dummies`` as-is.
    drop_first : bool
        Forwarded to ``pd.get_dummies`` (drops the first category as the base).

    Returns
    -------
    pd.DataFrame
        The remaining original columns followed by the indicator columns.
    """
    values = frame[column] if fill_value is None else frame[column].fillna(fill_value)
    dummies = pd.get_dummies(values, prefix=prefix, drop_first=drop_first)
    return pd.concat([frame.drop(columns=[column]), dummies], axis=1)


# NOTE(review): the imputation means and the scaler below are computed on the
# full data set, i.e. before the train/test split, so test-set statistics leak
# into training. Consider fitting them on the training split only.

# 1. Voters_Age: Impute mean age for NaNs
data['Voters_Age'] = data['Voters_Age'].fillna(data['Voters_Age'].mean())

# 2. Voters_Active: Make 1 if 'A', 0 otherwise (vectorised comparison)
data['Voters_Active'] = data['Voters_Active'].eq('A').astype(int)

# 3. Parties_Description: One-hot encode and drop the base category
data = _one_hot(data, 'Parties_Description', prefix='Party', drop_first=True)

# 4. ConsumerData_Inferred_HH_Rank: Drop rows where it is NaN
data = data.dropna(subset=['ConsumerData_Inferred_HH_Rank'])

# 5. ConsumerData_Presence_Of_Children_in_HH: Fill NaN with 'Unknown', then one-hot encode
data = _one_hot(data, 'ConsumerData_Presence_Of_Children_in_HH', prefix='Children', fill_value='Unknown')

# 6. Residence_HHParties_Description: One-hot encode
data = _one_hot(data, 'Residence_HHParties_Description', prefix='HHParties')

# 7. Residence_Families_HHVotersCount: Leave as-is (already numeric)

# 8. ConsumerData_Assimilation_Status: Fill NaN with 'Native-English', then one-hot encode
data = _one_hot(data, 'ConsumerData_Assimilation_Status', prefix='Assimilation', fill_value='Native-English')

# 9. ConsumerData_MAID_Available: Make 1 if True, 0 otherwise (including missing)
data['ConsumerData_MAID_Available'] = data['ConsumerData_MAID_Available'].eq(True).astype(int)

# 10. Residence_Addresses_Density: Fill NaNs with mean
data['Residence_Addresses_Density'] = data['Residence_Addresses_Density'].fillna(
    data['Residence_Addresses_Density'].mean()
)

# 11. ConsumerData_Estimated_Income_Amount: Strip "$" and "," then convert to float.
# The pattern is a raw string so that "\$" is not parsed as an (invalid)
# string escape sequence.
income = data['ConsumerData_Estimated_Income_Amount'].replace(r'[\$,]', '', regex=True).astype(float)
# Impute mean for NaNs
data['ConsumerData_Estimated_Income_Amount'] = income.fillna(income.mean())

# 12. ConsumerData_Household_Net_Worth: currently excluded from the model.
# To re-enable it, add the column to selected_features and uncomment:
# data = _one_hot(data, 'ConsumerData_Household_Net_Worth', prefix='NetWorth', fill_value='Unknown')

# 13. Voters_Gender: Fill NaN with 'Unknown', then one-hot encode
data = _one_hot(data, 'Voters_Gender', prefix='Gender', fill_value='Unknown')

# 14. ConsumerData_Education_of_Person: Convert to ordinal by level of college
education_mapping = {
    "Completed College Likely": 2,
    "Completed High School Likely": 1,
    "Completed Graduate School Likely": 3,
    "Other": 0,
    "Unknown": -1,
    "Did Not Complete High School Likely": 0,
    "Attended Vocational/Technical School Likely": 1,
    "Attended But Did Not Complete College Likely": 1
}
# Missing values are labelled 'Unknown' first; any label that is not a key of
# education_mapping becomes NaN after .map().
data['ConsumerData_Education_of_Person'] = (
    data['ConsumerData_Education_of_Person'].fillna('Unknown').map(education_mapping)
)

# 15. General Election variables: Binarise to 1 if 'Y', 0 otherwise (including missing)
for election in elections:
    data[election] = data[election].eq('Y').astype(int)

# 16. Scale continuous variables
continuous = ['Voters_Age', 'Residence_Addresses_Density', 'ConsumerData_Estimated_Income_Amount',
              'vote_life'
              ]
scaler = StandardScaler()

# Fit the scaler to the data and transform
data[continuous] = scaler.fit_transform(data[continuous])


In [None]:
# Response is the 2020 general-election column; every other column is a predictor.
target_column = 'General_2020_11_03'
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=222,
)

## Naive Model

In [None]:
# Majority-class baseline: always predict the most frequent class.
# The class is taken from the training labels so that the baseline never
# looks at the test labels it is scored against.
majority_class = y_train.mode()[0]

# Predict that class for every row of the test split
naive_test_pred = [majority_class] * len(y_test)

naive_test_acc = accuracy_score(y_test, naive_test_pred)
print(f'Majority class: {majority_class}')
print(f'Test Accuracy: {naive_test_acc}')

Majority class: 1
Test Accuracy: 0.8045053484705652


## kNN Model

In [None]:
# accuracies = []

# for k in range(1, 30):
#     knn = KNeighborsClassifier(n_neighbors=k)

#     # Train the model
#     knn.fit(X_train, y_train)

#     # Make predictions on the test   
#     y_pred = knn.predict(X_test)

#     #Evaluate
#     acc = accuracy_score(y_test, y_pred)
#     accuracies.append(acc)


In [None]:
# # Plot test accuracy for each value of k
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 30), accuracies, label='Test Accuracy', marker='o')
# plt.xlabel('k')
# plt.ylabel('Accuracy')
# plt.title('Test Accuracy vs K Neighbors')
# plt.legend()

In [None]:
# # Evaluate the model for a chosen k (NOTE: after the loop above, `k` holds the last value tried, 29, not the best k)
# knn = KNeighborsClassifier(n_neighbors=k)

# # Train the model
# knn.fit(X_train, y_train)

# # Make predictions on the test   
# y_pred = knn.predict(X_test)

# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))

## Logistic Regression

In [None]:
# Class-weighted logistic regression, fitted on the training split
log_reg = LogisticRegression(class_weight='balanced', max_iter=10000)
log_reg.fit(X_train, y_train)

# Hard class predictions for the test split
y_pred = log_reg.predict(X_test)

# Report overall accuracy and the per-class precision / recall / F1
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)



Accuracy: 0.8023159987809176

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.84      0.72    459935
           1       0.92      0.79      0.85   1092052

    accuracy                           0.80   1551987
   macro avg       0.77      0.81      0.78   1551987
weighted avg       0.83      0.80      0.81   1551987



In [None]:
# Print one coefficient per predictor.
# LogisticRegression is not a transformer and has no get_feature_names_out(),
# so the names come from the training frame's columns. For a binary response
# coef_ has shape (1, n_features); flatten it so that each name is paired with
# a single scalar that the ":.4f" format can render.
feature_names = X_train.columns
coefficients = np.ravel(log_reg.coef_)
print("\nLogistic Regression Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.4f}")

In [None]:
# Class-membership probabilities for the test split (one column per class)
y_pred_proba = log_reg.predict_proba(X_test)

# Keep the column at index 1, i.e. the probabilities for the positive class
positive_column = 1
y_pred_proba_positive = y_pred_proba[:, positive_column]

y_pred_proba_positive

Predicted Probabilities: [0.28662101 0.82505579 0.08529271 ... 0.12737952 0.83349854 0.91548977]


## Logistic Regression with Lasso

In [None]:
# Candidate regularisation strengths; LogisticRegressionCV expects their inverses (Cs)
alphas = np.logspace(-4, 2, 50)
inverse_strengths = 1 / alphas

# L1-penalised logistic regression with 5-fold cross-validation over Cs
lasso_settings = dict(
    Cs=inverse_strengths,
    cv=5,
    penalty='l1',
    solver='saga',
    max_iter=10000,
    random_state=222,
)
model = LogisticRegressionCV(**lasso_settings)

# Train on the training split
model.fit(X_train, y_train)

# Hard class predictions for the test split
y_pred = model.predict(X_test)

# Report accuracy and the per-class metrics
lasso_accuracy = accuracy_score(y_test, y_pred)
lasso_report = classification_report(y_test, y_pred)
print("Accuracy:", lasso_accuracy)
print("\nClassification Report:\n", lasso_report)

# Selected regularisation parameter (C)
print("Best C (inverse of regularization strength):", model.C_)

## Decision Tree

In [7]:
# The response is a 0/1 label, so a classifier is required: a regressor has no
# predict_proba(), and its continuous predictions cannot be scored with
# accuracy_score / classification_report.
tree = DecisionTreeClassifier(random_state=222)

# Grid Search parameters
param_grid = {
    'max_depth': [3, 5, 10, 11, 12, 13, 14, 15],
}

# The Brier score is the mean squared error between the predicted class-1
# probability and the 0/1 label, so model selection still uses a
# squared-error criterion, now computed on probabilities.
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='neg_brier_score',
    cv=5,
)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_tree = grid_search.best_estimator_
print("Best parameters:", best_params)

# Predict probabilities of class 1 on the test set
y_pred_proba = best_tree.predict_proba(X_test)[:, 1]

# Print or use the predicted probabilities as needed
print("Predicted Probabilities of Class 1:", y_pred_proba)

# Predict class labels on the test set
y_pred = best_tree.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


KeyboardInterrupt: 