In [100]:
#Jack Corley, Savannah Balistreri, Lauren Vu

import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [101]:
# Model 1, including race and gender - used for other model as well, just the .csv changed

df = pd.read_csv('resume.csv')

if df.isna().sum().sum() > 0:
    for column in df.columns:
        # Check if the column is numerical
        if df[column].dtype == 'float64' or df[column].dtype == 'int64':
            df[column].fillna(df[column].mean(), inplace=True)
        else:
            df[column].fillna(df[column].mode()[0], inplace=True)

# one-hot encode the categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_columns)
encoded_columns = df_encoded.columns

X = df_encoded.drop('received_callback', axis=1)  # 'axis=1' = 'axis='columns''
y = df_encoded['received_callback']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # try random test state(?)

# param_grid = {
#     'C': [0.1, 1, 10],  # Range for C
#     'degree': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # Range for degree
#     'kernel': ['poly']  # Polynomial kernel
# }

# grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5, scoring='precision', verbose=2)
# grid_search.fit(X_train, y_train)

model_all = svm.SVC(kernel='poly', degree=10, C=15, gamma='scale', class_weight='balanced')

model_all.fit(X_train, y_train)

predictions_all = model_all.predict(X_test)

conf_matrix = confusion_matrix(y_test, predictions_all)


tp = conf_matrix[1, 1]
tn = conf_matrix[0, 0]
fp = conf_matrix[0, 1]
fn = conf_matrix[1, 0]

print("Confusion Matrix for New Data:")
print(conf_matrix)

print("\nTrue Positives (TP):", tp)
print("True Negatives (TN):", tn)
print("False Positives (FP):", fp)
print("False Negatives (FN):", fn)

print("Accuracy:", accuracy_score(y_test, predictions_all))
print("Classification Report:")
print(classification_report(y_test, predictions_all))

final_report = ["Final Report"]


Confusion Matrix for New Data:
[[788  90]
 [ 88   8]]

True Positives (TP): 8
True Negatives (TN): 788
False Positives (FP): 90
False Negatives (FN): 88
Accuracy: 0.8172484599589322
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       878
           1       0.08      0.08      0.08        96

    accuracy                           0.82       974
   macro avg       0.49      0.49      0.49       974
weighted avg       0.82      0.82      0.82       974



In [102]:
new_data = pd.read_csv('removed_resume.csv')

if 'received_callback' not in new_data.columns:
    raise ValueError("Column 'received_callback' missing in new data")

if new_data.isna().sum().sum() > 0:
    for column in new_data.columns:
        # Check if the column is numerical
        if new_data[column].dtype == 'float64' or new_data[column].dtype == 'int64':
            new_data[column].fillna(new_data[column].mean(), inplace=True)
        else:
            new_data[column].fillna(new_data[column].mode()[0], inplace=True)

new_data_encoded = pd.get_dummies(new_data, columns=categorical_columns)

new_data_aligned = new_data_encoded.reindex(columns=encoded_columns, fill_value=0)

if 'received_callback' not in new_data_aligned.columns:
    raise ValueError("Column 'received_callback' missing after aligning new data")

X_new = new_data_aligned.drop('received_callback', axis=1)
y_new = new_data['received_callback']

predictions_new = model_all.predict(X_new)

conf_matrix_new = confusion_matrix(y_new, predictions_new)

tp = conf_matrix_new[1, 1]
tn = conf_matrix_new[0, 0]
fp = conf_matrix_new[0, 1]
fn = conf_matrix_new[1, 0]

print("Confusion Matrix for New Data:")
print(conf_matrix_new)

print("\nTrue Positives (TP):", tp)
print("True Negatives (TN):", tn)
print("False Positives (FP):", fp)
print("False Negatives (FN):", fn)

# Additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_new, predictions_new))

first = "The precision for all white people is: "
to_append = first + str(tp/(tp+fp))
final_report.append(to_append)

Confusion Matrix for New Data:
[[4089  389]
 [ 306   86]]

True Positives (TP): 86
True Negatives (TN): 4089
False Positives (FP): 389
False Negatives (FN): 306

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      4478
           1       0.18      0.22      0.20       392

    accuracy                           0.86      4870
   macro avg       0.56      0.57      0.56      4870
weighted avg       0.87      0.86      0.86      4870



In [103]:
new_data = pd.read_csv('clones/black_women_no_rorg.csv')

if 'received_callback' not in new_data.columns:
    raise ValueError("Column 'received_callback' missing in new data")

if new_data.isna().sum().sum() > 0:
    for column in new_data.columns:
        # Check if the column is numerical
        if new_data[column].dtype == 'float64' or new_data[column].dtype == 'int64':
            new_data[column].fillna(new_data[column].mean(), inplace=True)
        else:
            new_data[column].fillna(new_data[column].mode()[0], inplace=True)

new_data_encoded = pd.get_dummies(new_data, columns=categorical_columns)

new_data_aligned = new_data_encoded.reindex(columns=encoded_columns, fill_value=0)

if 'received_callback' not in new_data_aligned.columns:
    raise ValueError("Column 'received_callback' missing after aligning new data")

X_new = new_data_aligned.drop('received_callback', axis=1)
y_new = new_data['received_callback']

predictions_new = model_all.predict(X_new)

conf_matrix_new = confusion_matrix(y_new, predictions_new)

tp = conf_matrix_new[1, 1]
tn = conf_matrix_new[0, 0]
fp = conf_matrix_new[0, 1]
fn = conf_matrix_new[1, 0]

print("Confusion Matrix for New Data:")
print(conf_matrix_new)

print("\nTrue Positives (TP):", tp)
print("True Negatives (TN):", tn)
print("False Positives (FP):", fp)
print("False Negatives (FN):", fn)

# Additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_new, predictions_new))

first = "The precision for all black women is: "
to_append = first + str(tp/(tp+fp))
final_report.append(to_append)

Confusion Matrix for New Data:
[[1645  116]
 [  94   31]]

True Positives (TP): 31
True Negatives (TN): 1645
False Positives (FP): 116
False Negatives (FN): 94

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1761
           1       0.21      0.25      0.23       125

    accuracy                           0.89      1886
   macro avg       0.58      0.59      0.58      1886
weighted avg       0.90      0.89      0.89      1886

