In [25]:
import spacy
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib as plt

# Load the English model
nlp = spacy.load('es_core_news_lg')


In [26]:
# Define a function to extract dependency tree features from a sentence
def extract_dep_features(sentence):
    # Parse the sentence
    doc = nlp(sentence)
    
    # Extract dependency features
    dep_labels = []
    dep_distances = []
    for token in doc:
        if token.dep_ != 'ROOT':
            dep_labels.append(token.dep_)
            dep_distances.append(abs(token.i - token.head.i))
        
    # Convert the features into a dictionary
    dep_features = {
        'nsubj': int('nsubj' in dep_labels),
        'dobj': int('dobj' in dep_labels),
        'dep_distance_1': int(1 in dep_distances),
        'dep_distance_2': int(2 in dep_distances),
        'dep_distance_3': int(3 in dep_distances),
        # add more features as needed
    }
    
    return dep_features


In [27]:
# Load the labeled data
df = pd.read_csv('C:\\Users\\Jerem\\Desktop\\counterfactuals.csv', encoding='utf8')

df_label_0 = df[df['Label'] == 0]  # select only rows with label == 0
df_label_0_sample = df_label_0.sample(n=653, random_state=42)  # randomly sample 653 rows

df_label_1 = df[df['Label'] == 1]  # select only rows with label == 0

sample_df = pd.concat([df_label_0_sample, df_label_1])

X = sample_df['Text'].to_list()
Y = sample_df['Label'].to_list()

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, shuffle=True)

In [31]:
# Extract features from the sentences
all_features = [extract_dep_features(sentence) for sentence in X_train]

# Convert the features into a matrix
vec = DictVectorizer()
X = vec.fit_transform(all_features)

# Train a logistic regression model
clf = LogisticRegression()
clf.fit(X, y_train)

# Predict the labels for new sentences

new_features = [extract_dep_features(sentence) for sentence in X_test]
new_X = vec.transform(new_features)
new_labels = clf.predict(new_X)

              precision    recall  f1-score   support

           0       0.89      0.19      0.31       133
           1       0.54      0.98      0.69       129

    accuracy                           0.58       262
   macro avg       0.72      0.58      0.50       262
weighted avg       0.72      0.58      0.50       262



In [33]:
accuracy = accuracy_score(y_test, new_labels)
print('Accuracy:', accuracy)
print(classification_report(y_test, new_labels))


Accuracy: 0.5763358778625954
              precision    recall  f1-score   support

           0       0.89      0.19      0.31       133
           1       0.54      0.98      0.69       129

    accuracy                           0.58       262
   macro avg       0.72      0.58      0.50       262
weighted avg       0.72      0.58      0.50       262

