In [None]:
import pandas as pd
import numpy as np
import string
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, classification_report
)

In [None]:
# Load dataset
data = pd.read_csv('train.csv')

# Preprocess dataset
# Drop the first column only when it is the unnamed index column that pandas
# produces for a CSV saved with its index. Dropping it unconditionally would
# discard the text column ('0') of a CSV exported without an index, and the
# dropna below would then fail with a KeyError.
if str(data.columns[0]).startswith('Unnamed'):
    data = data.drop(columns=[data.columns[0]])

# Rename columns for clarity
data = data.rename(columns={'0': 'text', '1': 'label'})

# Remove rows with missing values
data = data.dropna(subset=['text', 'label'])

# Function to preprocess text
def preprocess_text(text):
    """
    Normalizes text: lowercases it, then strips every character listed in
    string.punctuation.
    """
    # Translation table that maps each punctuation character to "delete"
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

# Apply text preprocessing
# Each cell of the 'text' column is replaced by its lowercased, punctuation-free form
data['text'] = data['text'].apply(preprocess_text)

In [None]:
# Load GloVe embeddings
glove_path = 'glove.6B.100d.txt'
glove_dim = 100  # vector width of the file above; same value as the dim=100 defaults used downstream
embeddings = {}
skipped_lines = 0
print("Loading GloVe embeddings...")
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if len(values) <= glove_dim:
            # Blank line, header line or truncated vector (e.g. a partial download):
            # storing it would break the averaging later, so leave it out.
            skipped_lines += 1
            continue
        # Take the vector from the right so that a token which itself contains
        # whitespace does not leak into the numeric fields.
        word = ' '.join(values[:-glove_dim])
        try:
            vector = np.asarray(values[-glove_dim:], dtype='float32')
        except ValueError:
            # A field that is not a number: skip the row rather than abort the load
            skipped_lines += 1
            continue
        embeddings[word] = vector
if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in {glove_path}.")
print("GloVe embeddings loaded successfully.")

In [None]:
# Function to generate text embeddings
def get_glove_embedding(text, embeddings, dim=100):
    """
    Embeds a text as the mean of the vectors of its known words.

    Words are obtained by whitespace-splitting `text`; words that are not keys
    of `embeddings` are ignored. When no word is known, a zero vector of
    length `dim` is returned instead.
    """
    known_vectors = []
    for token in text.split():
        if token in embeddings:
            known_vectors.append(embeddings[token])

    # Nothing to average: fall back to the all-zeros vector
    if not known_vectors:
        return np.zeros(dim)
    return np.mean(known_vectors, axis=0)

In [None]:
# Create feature matrix
X = np.array([get_glove_embedding(text, embeddings) for text in data['text']])
# Cast labels to int: if the CSV had any missing label, pandas parsed the column
# as float64 and it stays float after dropna, which np.bincount (used for the
# class-distribution printout) rejects with a TypeError.
y = data['label'].astype(int).values

In [None]:
# Hold out 20% of the samples for testing; stratify on y so both splits keep
# the same class proportions
split_options = dict(test_size=0.2, random_state=42, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report how many samples each class has across the full dataset
class_counts = np.bincount(y)
for label in range(len(class_counts)):
    count = class_counts[label]
    print(f"Class {label}: {count} samples")

In [None]:
# Initialize logistic regression model (training happens in the next cell)
model = LogisticRegression(
    penalty='l2',
    C=0.5,
    max_iter=100000,
    solver='saga',
    # 'saga' shuffles the data with a random generator; pin the seed (same value
    # as the train/test split) so repeated runs produce the same model.
    random_state=42
)

In [None]:
# Fit the logistic regression on the training split
print("Training the model...")
model.fit(X_train, y_train)
print("Model training completed.")

In [None]:
# Evaluate model on training set
# Hard class predictions for the training split
y_pred_train = model.predict(X_train)
# Second column of predict_proba, kept as the score for the threshold and AUC evaluation below
y_pred_proba_train = model.predict_proba(X_train)[:, 1]

In [None]:
def evaluate_model(y_true, y_pred, y_proba):
    """
    Prints accuracy, precision, recall, F1 and AUC-ROC, followed by the
    classification report.

    y_pred holds the hard class predictions; y_proba holds the scores that
    are passed to roc_auc_score.
    """
    # Compute every metric before printing, so nothing is printed if one of them fails
    metrics = [
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Precision", precision_score(y_true, y_pred)),
        ("Recall", recall_score(y_true, y_pred)),
        ("F1-Score", f1_score(y_true, y_pred)),
        ("AUC-ROC", roc_auc_score(y_true, y_proba)),
    ]
    for metric_name, metric_value in metrics:
        print(f"{metric_name}: {metric_value:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

In [None]:
# Report metrics for the predictions made on the training split
print("\n=== Training Performance ===")
evaluate_model(y_train, y_pred_train, y_pred_proba_train)

In [None]:
# Evaluate model on test set
# Hard class predictions for the held-out split
y_pred = model.predict(X_test)
# Second column of predict_proba, kept as the score for the threshold and AUC evaluation
y_pred_proba = model.predict_proba(X_test)[:, 1]
print("\n=== Test Performance ===")
evaluate_model(y_test, y_pred, y_pred_proba)

In [None]:
# Evaluate performance at different thresholds
def evaluate_at_threshold(y_true, y_proba, threshold):
    """
    Scores the predictions obtained by cutting y_proba at `threshold`.

    Samples with y_proba >= threshold are predicted as 1, the rest as 0.
    Returns the tuple (accuracy, precision, recall, f1, auc_roc); the AUC-ROC
    is computed from y_proba itself rather than from the thresholded labels.
    """
    predicted = (y_proba >= threshold).astype(int)
    accuracy = accuracy_score(y_true, predicted)
    precision = precision_score(y_true, predicted)
    recall = recall_score(y_true, predicted)
    f1 = f1_score(y_true, predicted)
    auc_roc = roc_auc_score(y_true, y_proba)
    return accuracy, precision, recall, f1, auc_roc

thresholds = [0.3, 0.5, 0.7]
# Same report for both splits: (heading, true labels, predicted probabilities)
for heading, split_labels, split_proba in (
    ("\n=== Training Performance at Different Thresholds ===", y_train, y_pred_proba_train),
    ("\n=== Test Performance at Different Thresholds ===", y_test, y_pred_proba),
):
    print(heading)
    for thresh in thresholds:
        acc, prec, rec, f1, auc = evaluate_at_threshold(split_labels, split_proba, thresh)
        print(f"Threshold: {thresh} | Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}, AUC-ROC: {auc:.4f}")

In [None]:
# Function to make predictions on new text
def predict_gender_bias(text, model, embeddings, dim=100):
    """
    Classifies a raw text with the trained model.

    The text goes through preprocess_text and get_glove_embedding, and the
    resulting vector is fed to the model as a single-row batch.

    Returns (prediction, probability): the label from model.predict and the
    second column of model.predict_proba for that row.
    """
    cleaned = preprocess_text(text)
    vector = get_glove_embedding(cleaned, embeddings, dim)
    features = vector.reshape(1, -1)  # one sample, all features in a single row
    predicted_label = model.predict(features)[0]
    bias_probability = model.predict_proba(features)[0][1]
    return predicted_label, bias_probability

In [None]:
# Example inference
test_texts = ["Test Sentence"]
print("\n=== Inference Examples ===")
for text in test_texts:
    pred, prob = predict_gender_bias(text, model, embeddings)
    # The displayed label comes from a 0.3 probability cut-off, not from `pred`
    if prob >= 0.3:
        label = "Biased"
    else:
        label = "Unbiased"
    print(f"Text: '{text}' | Prediction: {label} (Probability of bias: {prob:.4f})")

# Save trained model
joblib.dump(model, "logreg_model.pkl")