# Imports

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, matthews_corrcoef
from sklearn.model_selection import cross_val_score
import numpy as np

### Loading Word2Vec Feature Matrix

In [42]:
# Read the pre-computed Word2Vec feature matrix from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='Data/Word2Vec_feature_data.csv')

# Bare expression as the last line so the notebook renders a preview of the frame
df

Unnamed: 0,author_ID,average_word_length,avg_sentence_length,ttr,nr_unique_words,nr_chars,nr_contradictions,subjectivity,nr_period,nr_comma,...,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,Poles
0,t2_ffcfiueh,5.481442,66.164835,0.707317,667,6111,6,0.485495,0.091198,0.047720,...,0.030382,-0.038984,0.082427,0.027456,-0.110414,-0.035993,-0.043381,0.027923,-0.085713,Western
1,t2_lfs48,6.285714,85.014925,0.718078,568,5762,17,0.487030,0.113780,0.078382,...,-0.002733,-0.035620,0.086639,0.089150,-0.005477,-0.005393,-0.010546,0.005088,-0.034315,Western
2,t2_zcj4y,5.580460,34.134969,0.747126,650,5724,18,0.551005,0.174713,0.065517,...,0.041019,-0.025408,0.076059,0.078353,-0.077311,-0.021945,-0.051248,-0.023123,-0.007378,Western
3,t2_2xpu7n1c,5.481297,75.441176,0.640898,514,5197,0,0.514366,0.073566,0.112219,...,0.059938,-0.004517,0.064531,0.129023,-0.062811,0.001734,-0.053279,-0.044899,0.018924,Western
4,t2_3edl7,5.916865,60.968085,0.719715,606,5823,11,0.401854,0.122328,0.085511,...,-0.016096,-0.002657,0.096512,0.061793,-0.128142,0.001301,-0.059142,0.026707,-0.030807,Western
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30934,t2_8hacr7if,5.697468,49.875000,0.683544,540,5290,13,0.437244,0.111392,0.037975,...,-0.054445,-0.005667,0.073418,0.111456,-0.142758,-0.026188,-0.088024,0.020324,-0.023312,Eastern
30935,t2_8hacr7if,5.790725,50.450450,0.744352,626,5710,9,0.436489,0.109394,0.048751,...,-0.053965,0.010229,0.090820,0.098168,-0.123997,0.001711,-0.086069,0.063151,-0.029690,Eastern
30936,t2_8hacr7if,5.671835,47.716981,0.700258,542,5163,9,0.466545,0.131783,0.047804,...,0.012857,-0.056423,0.078301,0.113703,-0.119587,-0.012560,-0.153321,0.051385,-0.040174,Eastern
30937,t2_8hacr7if,5.748459,47.866071,0.704069,571,5472,7,0.428413,0.114673,0.053021,...,0.011493,-0.041948,0.091651,0.121716,-0.118058,-0.020970,-0.146114,0.043594,-0.022459,Eastern


## Logistic Regression + Evaluation metrics function

In [28]:
def Log_Reg(df, embeddings, X):
    """
    Train a logistic regression classifier and print its evaluation metrics.

    The data is split 80/20 into train and test parts (fixed random_state=42),
    an L2-regularised logistic regression is fitted on the training part, and
    the following are printed for the test part: accuracy, weighted precision,
    weighted recall, weighted F1-score, Matthews correlation coefficient and a
    per-class classification report. Finally, 5-fold cross-validation is run on
    the full data and the per-fold weighted F1 scores and their mean are printed.

    Input parameter descriptions:
    df : the input dataframe; only its 'Poles' column is read, as the outcome variable (type DataFrame)
    embeddings : boolean value, only selects which heading is printed (model WITH embeddings vs. baseline WITHOUT embeddings) (type Boolean)
    X : the feature dataframe, either with or without document embeddings; it is split together with df['Poles'], so it must have the same number of rows (type DataFrame)

    Returns:
    The LogisticRegression instance fitted on the training split.
    """
    # y data
    y = df['Poles']

    # Train-test split
    # NOTE(review): the split is row-wise and random. The data preview shows the
    # same author_ID on several rows, so texts of one author may end up in both
    # train and test - consider a group-aware split if that is not intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the logistic regression model
    # NOTE(review): features are used unscaled, so the size of each fitted
    # coefficient depends on the scale of its feature.
    clf = LogisticRegression(penalty='l2', C=1.0, max_iter=10000)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = clf.predict(X_test)

    # Evaluate the model (precision/recall/F1 are support-weighted averages over classes)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')
    mcc = matthews_corrcoef(y_test, predictions)
    if embeddings:
        print("Evaluation metrics of Logistic Regression WITH embeddings are as follows: ")
    else:
        print("Evaluation metrics of Baseline Logistic Regression (WITHOUT embeddings) are as follows: ")
    print()
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-score: {f1}')
    print(f'Matthews Correlation Coefficient: {mcc}')
    print()

    # Generate a classification report
    class_report = classification_report(y_test, predictions)
    print('Classification Report:\n', class_report)
    print()

    # Perform 5-fold cross-validation on the full data, scored with weighted F1
    cv_scores = cross_val_score(clf, X, y, cv=5, scoring='f1_weighted')
    # Print cross-validation scores; the label names the metric actually
    # computed (weighted F1, not accuracy)
    print('Cross-Validation Scores:', cv_scores)
    print('Mean CV F1-score (weighted):', np.mean(cv_scores))

    # returning the model fitted on the training split
    return clf

## Model Selection (User input)

Our baseline model is a simple logistic regression model that uses no document embeddings, only linguistic and grammatical features such as average sentence length, subjectivity, etc. We also test the performance of the same model with the document embeddings added as additional features.

When prompted for input, enter "yes" to obtain the results of the logistic regression with the document embeddings, or enter "no" to obtain the results of the baseline model.


In [37]:
# Ask the user which model variant to evaluate and build the matching feature
# matrix X. The loop repeats until a valid answer is given.
# NOTE: the rest of the notebook depends on this interactive answer, so a
# "Run All" pauses here until input is provided.
while True:
    user_input = input(" Do you want to test the logistic regression with document embeddings (yes/no)?")
    # Normalise once: ignore surrounding whitespace and letter case so that
    # answers such as "Yes" or "no " are accepted
    answer = user_input.strip().lower()

    if answer == "yes":
        embeddings = True
        # Creating X dataframe with document embeddings:
        # every column except the author identifier and the outcome variable
        X = df.drop(['author_ID', 'Poles'], axis=1)
        break
    elif answer == "no":
        embeddings = False
        # Creating X dataframe by extracting all features except document embeddings
        X = df.loc[:, ['average_word_length','avg_sentence_length', 'ttr', 'nr_unique_words', 'nr_chars', 'nr_contradictions', 'subjectivity','nr_period', 'nr_comma', 'nr_question', 'nr_exclamation']]
        break
    else:
        print("Invalid input. Please enter 'yes' or 'no'. Try again.")

print()
print("You have entered a valid input")


You have entered a valid input


### Displaying Evaluation Metrics of chosen model

In [43]:
# Fit and evaluate the chosen model variant; the fitted classifier is kept in
# `model` so its coefficients can be inspected afterwards
model = Log_Reg(df=df, embeddings=embeddings, X=X)

Evaluation metrics of Logistic Regression WITH embeddings are as follows: 

Accuracy: 0.8055914673561733
Precision: 0.8056890113280266
Recall: 0.8055914673561733
F1-score: 0.805597636135838
Matthews Correlation Coefficient: 0.6112607449669624

Classification Report:
               precision    recall  f1-score   support

     Eastern       0.80      0.81      0.80      3058
     Western       0.81      0.80      0.81      3130

    accuracy                           0.81      6188
   macro avg       0.81      0.81      0.81      6188
weighted avg       0.81      0.81      0.81      6188


Cross-Validation Scores: [0.81281167 0.77659507 0.70505122 0.61198106 0.7307442 ]
Mean CV Accuracy: 0.7274366452006724


## Feature Importance

In [39]:
# Coefficients of the fitted logistic regression. For the two-class problem
# here (Eastern vs. Western) scikit-learn stores them in `coef_` as a single
# row, so row 0 holds one weight per feature. The attribute exists on any
# fitted model regardless of regularization, so no hasattr() check is needed.
coefficients = model.coef_[0]

# Prefer the feature names the model recorded when it was fitted; fall back to
# the current X only if the model does not expose them. This keeps names and
# coefficients aligned even if X was rebuilt after the model was trained.
feature_names = getattr(model, 'feature_names_in_', X.columns)

# Create a DataFrame to display feature names and their corresponding coefficients
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Sort features by absolute coefficient values (importance)
# NOTE(review): the model is fitted on unscaled features, so coefficient size
# also reflects each feature's scale, not only its importance.
feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False)

# Display the feature importance DataFrame
feature_importance_df

Unnamed: 0,Feature,Coefficient,Absolute_Coefficient
42,feature_31,-7.445960,7.445960
30,feature_19,7.103592,7.103592
60,feature_49,-7.077410,7.077410
52,feature_41,6.974895,6.974895
77,feature_66,6.583141,6.583141
...,...,...,...
5,nr_contradictions,0.025130,0.025130
90,feature_79,-0.013925,0.013925
1,avg_sentence_length,0.003166,0.003166
3,nr_unique_words,-0.002646,0.002646


### Feature ranking of linguistic + grammatical features

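We rank the features by how important they are for the model's binary classification task. Features higher up in the ranking have larger absolute coefficient values. Note that the model is fitted on unscaled features, so a coefficient's magnitude also depends on the scale of its feature (in the rows shown above, `nr_chars` is in the thousands while `ttr` is below 1); the ranking should be read with that in mind.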

In [40]:
# Replace the index left over from sorting with 0..n-1, so that a row's
# position plus one is its rank
feature_importance_df = feature_importance_df.reset_index(drop=True)

# Names of the linguistic + grammatical features to look up in the ranking
values_to_find = [
    'average_word_length', 'avg_sentence_length', 'ttr', 'nr_unique_words',
    'nr_chars', 'nr_contradictions', 'subjectivity', 'nr_period', 'nr_comma',
    'nr_question', 'nr_exclamation',
]

# Cell-wise membership test over the whole frame ...
mask = feature_importance_df.isin(values_to_find)

# ... collapsed to one boolean per row: True where any cell matched
result = mask.any(axis=1)

# Positions of the matching rows (index is 0..n-1 after the reset above)
row_numbers = [position for position, matched in enumerate(result) if matched]

# Print the 1-based rank together with the full row contents
for row_number in row_numbers:
    values = feature_importance_df.iloc[row_number].values.tolist()
    print(f"Row {row_number + 1}: {values}")

Row 49: ['nr_comma', -2.2895543034254904, 2.2895543034254904]
Row 53: ['subjectivity', -1.9952040714440074, 1.9952040714440074]
Row 75: ['nr_period', -1.162971792104766, 1.162971792104766]
Row 82: ['average_word_length', 0.6951716306608383, 0.6951716306608383]
Row 94: ['ttr', -0.29434231416973594, 0.29434231416973594]
Row 95: ['nr_exclamation', 0.28701172063714137, 0.28701172063714137]
Row 105: ['nr_question', -0.044363594028351994, 0.044363594028351994]
Row 107: ['nr_contradictions', 0.025130474545760444, 0.025130474545760444]
Row 109: ['avg_sentence_length', 0.003165755094896711, 0.003165755094896711]
Row 110: ['nr_unique_words', -0.002645644568672686, 0.002645644568672686]
Row 111: ['nr_chars', -0.0002876709713411503, 0.0002876709713411503]
