In [7]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

def classify_issue(issue):
    stance_groups = {
        'brexit': ['pro_brexit', 'anti_brexit'],
        'climateAction': ['pro_climateAction', 'anti_climateAction'],
        'NHS': ['pro_NHS', 'anti_NHS'],
        'israel_palestine': ['pro_israel', 'pro_palestine'],
        'taxation': ['pro_company_taxation', 'pro_worker_taxation']
    }
    
    if issue not in stance_groups:
        raise ValueError(f"Unknown issue: {issue}")
    
    targets = stance_groups[issue] + ['neutral', 'irrelevant']

    file_path = '/Users/adamzulficar/Documents/year3/Bachelor Project/Thesis/Automated Annotation/Training Data/UK/{}_training.csv'.format(issue)
    df = pd.read_csv(file_path)

    ## SMOTE ##

    def str_to_array(s):
        return np.fromstring(s.strip("[]"), sep=' ')

    features = df['text_vector'].apply(str_to_array).tolist()
    context = df['context_vector'].apply(str_to_array).tolist()
    X = np.array([np.concatenate((f, c)) for f, c in zip(features, context)])
    y_combined = np.array(df[targets])

    smote = SMOTE()
    X_resampled, y_resampled_combined = smote.fit_resample(X, y_combined)

    text_vector_length = len(str_to_array(df['text_vector'].iloc[0]))
    context_vector_length = len(str_to_array(df['context_vector'].iloc[0]))

    text_vectors_resampled = X_resampled[:, :text_vector_length]
    context_vectors_resampled = X_resampled[:, text_vector_length:]

    def array_to_str(arr):
        return ' '.join(map(str, arr))

    resampled_data = pd.DataFrame()
    resampled_data['text_vector'] = list(map(array_to_str, text_vectors_resampled))
    resampled_data['context_vector'] = list(map(array_to_str, context_vectors_resampled))

    for i, target in enumerate(targets):
        resampled_data[target] = y_resampled_combined[:, i]

    data = resampled_data

    ## FEATURE VECTOR PROCESSING ##

    TEXT_VECTOR_SIZE = 100
    CONTEXT_VECTOR_SIZE = 100

    def extract_vectors(row):
        text_vector = np.array(row['text_vector'].split(), dtype=float)
        context_vector = np.array(row['context_vector'].split(), dtype=float)

        if len(text_vector) > TEXT_VECTOR_SIZE:
            text_vector = text_vector[:TEXT_VECTOR_SIZE]
        else:
            text_vector = np.pad(text_vector, (0, TEXT_VECTOR_SIZE - len(text_vector)), 'constant')

        if len(context_vector) > CONTEXT_VECTOR_SIZE:
            context_vector = context_vector[:CONTEXT_VECTOR_SIZE]
        else:
            context_vector = np.pad(context_vector, (0, CONTEXT_VECTOR_SIZE - len(context_vector)), 'constant')

        return np.concatenate([text_vector, context_vector])
    
    ## MODEL ##

    X = data.apply(extract_vectors, axis=1)
    X = np.stack(X.values)
    y = data[targets]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    def resolve_contradictions(probabilities, stances):
        resolved_stances = np.zeros_like(probabilities)
        stance_groups = {
            'brexit': ['pro_brexit', 'anti_brexit'],
            'climateAction': ['pro_climateAction', 'anti_climateAction'],
            'NHS': ['pro_NHS', 'anti_NHS'],
            'israel_palestine': ['pro_israel', 'pro_palestine'],
            'taxation': ['pro_company_taxation', 'pro_worker_taxation']
        }

        for i, prob in enumerate(probabilities):
            prob_dict = {stance: prob[j] for j, stance in enumerate(stances)}

            max_stance = max(prob_dict, key=prob_dict.get)
            if max_stance in ['irrelevant', 'neutral']:
                resolved_stances[i][stances.index(max_stance)] = 1
            else:
                any_above_threshold = any(p > 0.5 for p in prob_dict.values())
                if any_above_threshold:
                    for group in stance_groups[issue]:
                        relevant_group = [s for s in group if s in stances]
                        if relevant_group:
                            max_stance = max(relevant_group, key=lambda x: prob_dict[x])
                            if prob_dict[max_stance] > 0.5:  # Threshold can still be tuned
                                resolved_stances[i][stances.index(max_stance)] = 1
                else:
                    max_stance = max(prob_dict, key=prob_dict.get)
                    resolved_stances[i][stances.index(max_stance)] = 1

        return resolved_stances

    def evaluate_model(X, y, stances, n_splits=10):
        results = {stance: {'train_accuracy': [], 'val_accuracy': [], 'train_precision': [], 'val_precision': [], 'train_recall': [], 'val_recall': [], 'train_f1_score': [], 'val_f1_score': []} for stance in stances}

        skf = StratifiedKFold(n_splits=n_splits)

        for train_index, test_index in skf.split(X, y['neutral']):  # Using 'neutral' just for stratification
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            for stance in stances:
                param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2']}
                grid_search = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced'), param_grid, cv=5, scoring='f1')
                grid_search.fit(X_train, y_train[stance])

                best_clf = grid_search.best_estimator_

                y_train_pred = best_clf.predict(X_train)
                y_test_pred = best_clf.predict(X_test)

                results[stance]['train_accuracy'].append(accuracy_score(y_train[stance], y_train_pred))
                results[stance]['val_accuracy'].append(accuracy_score(y_test[stance], y_test_pred))
                results[stance]['train_precision'].append(precision_score(y_train[stance], y_train_pred, zero_division=0))
                results[stance]['val_precision'].append(precision_score(y_test[stance], y_test_pred, zero_division=0))
                results[stance]['train_recall'].append(recall_score(y_train[stance], y_train_pred, zero_division=0))
                results[stance]['val_recall'].append(recall_score(y_test[stance], y_test_pred, zero_division=0))
                results[stance]['train_f1_score'].append(f1_score(y_train[stance], y_train_pred, zero_division=0))
                results[stance]['val_f1_score'].append(f1_score(y_test[stance], y_test_pred, zero_division=0))

        for stance, metrics in results.items():
            train_accuracy = np.mean(metrics['train_accuracy'])
            val_accuracy = np.mean(metrics['val_accuracy'])
            train_precision = np.mean(metrics['train_precision'])
            val_precision = np.mean(metrics['val_precision'])
            train_recall = np.mean(metrics['train_recall'])
            val_recall = np.mean(metrics['val_recall'])
            train_f1 = np.mean(metrics['train_f1_score'])
            val_f1 = np.mean(metrics['val_f1_score'])

            print(f"Stance: {stance}")
            print(f"Train Accuracy: {train_accuracy} | Validation Accuracy: {val_accuracy}")
            print(f"Train Precision: {train_precision} | Validation Precision: {val_precision}")
            print(f"Train Recall: {train_recall} | Validation Recall: {val_recall}")
            print(f"Train F1 Score: {train_f1} | Validation F1 Score: {val_f1}")

            if val_f1 < train_f1 * 0.8:  # Example threshold for detecting overfitting
                print(f"Warning: Potential overfitting detected for stance {stance}.")

    def predict_and_resolve(X, stances):
        predictions = []

        for stance in stances:
            clf = LogisticRegression(max_iter=1000, class_weight='balanced', C=0.1, penalty='l2')
            clf.fit(X, y[stance])
            stance_predictions = clf.predict_proba(X)[:, 1]
            predictions.append(stance_predictions)

        predictions = np.array(predictions).transpose(1, 0)
        resolved_predictions = resolve_contradictions(predictions, stances)
        
        return resolved_predictions

    evaluate_model(X, y, targets)

classify_issue('brexit')


Stance: pro_brexit
Train Accuracy: 0.9993802675718972 | Validation Accuracy: 0.9744444444444443
Train Precision: 0.9977971835418142 | Validation Precision: 0.5691503267973855
Train Recall: 1.0 | Validation Recall: 0.8
Train F1 Score: 0.998890325146664 | Validation F1 Score: 0.6413636363636364
Stance: anti_brexit
Train Accuracy: 1.0 | Validation Accuracy: 0.93
Train Precision: 1.0 | Validation Precision: 0.5915427181514138
Train Recall: 1.0 | Validation Recall: 0.7279605263157894
Train F1 Score: 1.0 | Validation F1 Score: 0.6464213378975895
Stance: neutral
Train Accuracy: 1.0 | Validation Accuracy: 0.918888888888889
Train Precision: 1.0 | Validation Precision: 0.8127775588163958
Train Recall: 1.0 | Validation Recall: 0.9565217391304348
Train F1 Score: 1.0 | Validation F1 Score: 0.8705572259843184
Stance: irrelevant
Train Accuracy: 0.9972717013968962 | Validation Accuracy: 0.9066666666666666
Train Precision: 0.9892862531409895 | Validation Precision: 0.44691056910569105
Train Recall: 0.9