In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so that later cells can read the
# data files they reference under '/content/drive/My Drive/'.
drive.mount('/content/drive/')

Mounted at /content/drive/


Preprocess and merge the input Alabama data. A detailed explanation is in https://colab.research.google.com/drive/1uTY3JLdExXMMfYhBF7spYfAG-cRVARm8?authuser=1#scrollTo=XaIh3_LjMqP8, and https://colab.research.google.com/drive/1uTY3JLdExXMMfYhBF7spYfAG-cRVARm8?authuser=1#scrollTo=k60qrAiqM1VA

In [3]:
import pandas as pd
def merge_cr5(row):
    """Collapse the CR5A-CR5E columns (together with CR4) into one CR5 code.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'CR5A'..'CR5E'; 'CR4' is read whenever fewer than two
        of the CR5x columns are populated.

    Returns
    -------
    int or None
        7    - more than one CR5x column is populated (two or more races), or
               exactly one is populated and CR4 is one of 2, 3, 4, 5.
        1..5 - exactly one CR5x column is populated (A=1 ... E=5) and CR4 is
               not one of 2, 3, 4, 5.
        6    - no CR5x column is populated and CR4 is one of 2, 3, 4, 5.
        None - none of the above.
    """
    flag_by_column = {'CR5A': 1, 'CR5B': 2, 'CR5C': 3, 'CR5D': 4, 'CR5E': 5}
    # Flags of the CR5x columns that actually hold a value, in A..E order.
    populated = [flag for column, flag in flag_by_column.items()
                 if not pd.isna(row[column])]

    if len(populated) > 1:
        return 7

    cr4_selected = row['CR4'] in [2, 3, 4, 5]
    if populated:
        return 7 if cr4_selected else populated[0]
    return 6 if cr4_selected else None

# Columns kept from every survey wave: the model inputs (CR2, CR3, CR5)
# followed by the outcome columns that are predicted later on.
selected_columns = ['CR2', 'CR3', 'CR5', 'CR7', 'CR11', 'CR12', 'CR13', 'CR21', 'CR22', 'CR23', 'CR26', 'CR27', 'CR28', 'CR54', 'CR56']


def _prepare_wave(raw_wave):
    """Return one survey wave reduced to `selected_columns`.

    Works on a copy of `raw_wave`, adds the merged 'CR5' column computed
    row by row with `merge_cr5`, and keeps only `selected_columns`.
    """
    wave = raw_wave.copy()
    wave['CR5'] = wave.apply(merge_cr5, axis=1)
    return wave.loc[:, selected_columns]


# The 2012 and 2014 waves are read from Excel workbooks, the 2016 wave from a CSV file.
data_2012 = _prepare_wave(pd.read_excel('/content/drive/My Drive/Alabama12_ms.xlsx'))
data_2014 = _prepare_wave(pd.read_excel('/content/drive/My Drive/Alabama14_ms.xlsx'))
data_2016 = _prepare_wave(pd.read_csv('/content/drive/My Drive/alabama16_6rigions_ms.sas7bdat.csv'))

# The 2024 file supplies only the model inputs; keep exactly those columns
# so the frame lines up with the features used for training.
required_columns = ['CR2', 'CR3', 'CR5']
inputs_2024 = pd.read_csv('/content/drive/My Drive/2024_sample_data_not_survey.csv')
conditions_2024 = inputs_2024.loc[:, required_columns]
# Display the selected 2024 inputs.
conditions_2024

Unnamed: 0,CR2,CR3,CR5
0,2.0,3.0,1.0
1,2.0,3.0,3.0
2,1.0,2.0,3.0
3,1.0,1.0,3.0
4,1.0,1.0,
...,...,...,...
3032,1.0,1.0,5.0
3033,2.0,2.0,3.0
3034,1.0,3.0,2.0
3035,2.0,3.0,


In [4]:
# Replace missing values with 0 (in place) in every frame used for training
# and prediction, so that "missing" is treated as its own code.
for frame in (data_2012, data_2014, data_2016, conditions_2024):
    frame.fillna(0, inplace=True)

KNN model: a non-parametric, supervised learning classifier, which uses proximity to make classifications or predictions about the grouping of an individual data point.

Random Forest Classifier: a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

I trained these models on the 2012, 2014, and 2016 Alabama data to predict the 2024 Alabama data, but I did not include the year as a feature, because 2024 is far away from 2016. The further back in time the training data goes, the greater the distance between the year values becomes, which would largely affect accuracy. Therefore, the inputs are only CR2, CR3, and CR5, and the outputs are "CR7", "CR11", "CR12", "CR13", "CR21", "CR22", "CR23", "CR26", "CR27", "CR28", "CR54", and "CR56".

When I first attempted to train my model, I noticed that the predicted values for the 2024 dataset, such as the column CR26, were all '2'. This prompted me to review each output column in my training set, where I discovered that most values were dominated by a single number, indicating an uneven distribution. To address the underrepresented groups, I decided to use SMOTE. SMOTE (Synthetic Minority Over-sampling Technique) is an oversampling technique for binary or multiclass tasks, designed to handle class imbalance issues. It generates new synthetic samples in the feature space to increase the number of minority class samples.

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

train_data = pd.concat([data_2012, data_2014, data_2016], ignore_index=True)

# Shuffle the rows. The shuffle is seeded because train_test_split below picks
# rows by position: with an unseeded shuffle the seeded split would still
# contain different rows on every run.
train_data = train_data.sample(frac=1, random_state=42)

input_cols = ['CR2', 'CR3', 'CR5']
output_cols = ['CR7', 'CR11', 'CR12', 'CR13', 'CR21', 'CR22', 'CR23', 'CR26', 'CR27', 'CR28', 'CR54', 'CR56']

X = train_data[input_cols]
y = train_data[output_cols]
# Split into train / validation / test = 0.6 / 0.2 / 0.2:
# test_size=0.4 holds out 40% of the rows, which is then halved.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize each feature. The scaler is fitted on the training split only
# and then reused for the validation, test and 2024 inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
conditions_2024_scaled = scaler.transform(conditions_2024)

# Oversample with SMOTE, one output column at a time.
#
# NOTE(review): SMOTE is fitted independently for every output column, but only
# the features resampled for the FIRST column are kept as X_train_resampled.
# The label vectors resampled for the remaining columns are assigned by index,
# so they are truncated or NaN-padded to the first column's length, and labels
# on synthetic rows of those columns were generated for different synthetic
# feature rows. Presumably only the leading rows (the untouched training rows)
# are consistent across all outputs - confirm, and consider one resampled set
# and model per output column instead.
X_train_resampled = X_train_scaled
y_train_resampled = pd.DataFrame()

for position, col in enumerate(y_train.columns):
    min_samples = y_train[col].value_counts().min()
    # Use at most 5 neighbours, and fewer when the rarest class is too small.
    smote = SMOTE(random_state=42, k_neighbors=min(min_samples - 1, 5))
    X_res, y_res = smote.fit_resample(X_train_scaled, y_train[col])
    if position == 0:
        X_train_resampled = X_res
    y_train_resampled[col] = y_res

In [13]:
def multioutput_accuracy_score(y_true, y_pred):
    """Return the mean of the per-output accuracies of a multi-output prediction.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_outputs)
        True and predicted labels. Any 2-D array-like is accepted
        (DataFrame, ndarray, nested list); both are converted with
        ``np.asarray`` so the two arguments are handled the same way.

    Returns
    -------
    float
        ``accuracy_score`` computed column by column, then averaged.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean([accuracy_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])
# Wrap the multi-output accuracy so GridSearchCV can use it as its score.
multioutput_scorer = make_scorer(multioutput_accuracy_score, greater_is_better=True)

# KNN hyperparameter grid.
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
# p=2 it is the same distance as 'euclidean', so it only repeated one third of
# the fits without being able to change which setting wins.
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# The notebook used to define an identical grid under this name as well;
# keep the name as an alias instead of a second copy.
param_grid = knn_param_grid

def _tune_and_validate(estimator, grid, label):
    """Grid-search one estimator and score the winner on the validation split.

    Fits a 5-fold GridSearchCV (scored with `multioutput_scorer`) on the
    resampled training data, prints the best hyperparameters, predicts the
    validation split with the best estimator and prints its multi-output
    accuracy. `label` is the model name used in the printed lines.

    Returns
    -------
    tuple
        (fitted GridSearchCV, validation predictions, validation accuracy)
    """
    search = GridSearchCV(estimator, grid, cv=5, scoring=multioutput_scorer)
    search.fit(X_train_resampled, y_train_resampled)
    print(f"{label} Best Hyperparameters: {search.best_params_}")

    val_pred = search.best_estimator_.predict(X_val_scaled)
    val_accuracy = multioutput_accuracy_score(y_val, val_pred)
    print(f"{label} Validation Accuracy: {val_accuracy}")
    return search, val_pred, val_accuracy


# Tune KNN and evaluate its best configuration on the validation split.
knn = KNeighborsClassifier()
knn_grid_search, y_val_knn_pred, val_knn_accuracy = _tune_and_validate(knn, knn_param_grid, "KNN")
best_knn = knn_grid_search.best_estimator_

# Random forest hyperparameter grid.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Tune the random forest the same way.
rf = RandomForestClassifier(random_state=42)
rf_grid_search, y_val_rf_pred, val_rf_accuracy = _tune_and_validate(rf, rf_param_grid, "RandomForest")
best_rf = rf_grid_search.best_estimator_

# Keep whichever tuned model scored higher on the validation split;
# KNN is chosen only when it is strictly better, otherwise the random forest.
knn_is_better = val_knn_accuracy > val_rf_accuracy
if not knn_is_better:
    best_model = best_rf
    print("Best Model: RandomForest")
else:
    best_model = best_knn
    print("Best Model: KNN")

# Report the chosen model's accuracy on the held-out test split.
y_test_pred = best_model.predict(X_test_scaled)
test_accuracy = multioutput_accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")


KNN Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
KNN Validation Accuracy: 0.8186604886267904
RandomForest Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
RandomForest Validation Accuracy: 0.7349761303004775
Best Model: KNN
Test Accuracy: 0.8085509688289806


In [14]:
# Predict every output column for the 2024 inputs with the selected model.
predictions = best_model.predict(conditions_2024_scaled)
# Missing values were encoded as 0 before training, so turn predicted 0s back into nulls.
predicted_df = pd.DataFrame(predictions, columns=output_cols).replace(0, np.nan)
predicted_df

Unnamed: 0,CR7,CR11,CR12,CR13,CR21,CR22,CR23,CR26,CR27,CR28,CR54,CR56
0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
2,2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3,2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
4,,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3032,2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3033,,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3034,2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3035,2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0


In [16]:
# Put the original 2024 inputs and the predicted outputs side by side
# (inputs re-indexed from 0 so the rows line up) and save them as CSV.
inputs_aligned = inputs_2024.reset_index(drop=True)
result = pd.concat([inputs_aligned, predicted_df], axis=1)
result.to_csv('classification_2024_data.csv', index=False)