In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/final_labels.csv', index_col=0)

## Log Reg over lemmatized text

In [None]:
df

In [None]:
df_dehumanization = df[['Dehumanization', 'text']].reset_index(drop=True)

In [None]:
df_dehumanization = df_dehumanization[df_dehumanization['Dehumanization']!='не можу визначитись з правильною відповіддю']

In [None]:
len(df_dehumanization)

In [None]:
df_dehumanization['label'] = df_dehumanization['Dehumanization'].apply(lambda x: 0 if x=='ні' else 1)

In [None]:
df_dehumanization

## Preprocessing

In [None]:
import spacy
nlp = spacy.load('ru_core_news_md',disable=['ner', 'attribute_ruler'])

def lemmatize_spacy(text):
    doc = nlp(text)
    result = " ".join([token.lemma_ for token in doc])
    return result

In [None]:
cyrillic_letters = u"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ "

def clean_text(string, allowed_symbols):
    getVals = list(filter(lambda x: x in allowed_symbols, string))
    result = "".join(getVals)
    return result

In [None]:
def preprocess_df(df, col):
    df['text_clean'] = df[col].apply(lambda x: clean_text(x.lower(), cyrillic_letters))
    df['text_lemmatized'] = df['text_clean'].apply(lambda x: lemmatize_spacy(x))
    df=df[df['text_clean']!='']
    df.reset_index(inplace=True)
    return df

In [None]:
%%time
df_dehumanization = preprocess_df(df_dehumanization, 'text').copy()

## Logreg

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train_, X_test_, y_train, y_test = train_test_split(df_dehumanization["text_clean"], df_dehumanization["label"], test_size=0.2, random_state=42)

In [None]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_)
X_test = vectorizer.transform(X_test_)

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

### Gridsearch

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear', random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

logreg_best = grid_search.best_estimator_

logreg_best.fit(X_train, y_train)

In [None]:
y_pred = logreg_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
best_params = logreg_best.get_params()
print("Best hyperparameters for the logistic regression model:")
for param, value in best_params.items():
    print(f"{param}: {value}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
original_data = df_dehumanization.loc[X_test_.index]

In [None]:
original_data['Predicted Label'] = y_pred
original_data['Prediction Status'] = original_data['label'] == original_data['Predicted Label']
incorrect_predictions = original_data[~original_data['Prediction Status']]

In [None]:
print("Incorrectly predicted samples:")
for i, (index, row) in enumerate(incorrect_predictions.iterrows()):
    print(f"Index: {index}\nOriginal Text: {row['text']}\nTrue Label: {row['label']}\nPredicted Label: {row['Predicted Label']}\n")
    if i>=15:
        break

### Preliminary error analysis

Укрвояки сприймається системою як дегуманізація

Рядок 2562, 2210, 1288, 1188, 291, 2486 - сумнівний лейбл, цілком можливо що система права

Хохлофейки тригерить систему

Чомусь тригерить коронавірус (93, 33)

Не впізнає місцями свинорейх (як в 3419, 170)

Майже весь неонацизм постійно тригерить систему - 2775, 1011, 289 (але от в 2320 - неонацизм как раковая опухоль - ні, 3193 - )

1070 - система тригериться на чубатих

Система гірше зчитує subtle cues - 554 (накапливаются), 2335 (зондероотряд), 486 (не бандероукропианці, а держава = навоз), 2646 (опис а не фразеологізм)

Не всі атрибути нацизма впізнає - 678 (бандерюгенд), 2335 (зондероотряд), 2293 (бандерофашисти)

2086 - укропские мартішки




## Log Reg over lemmatized text plus (or exclusively) collocations (with concatenaction)

In [None]:
from collocation_extraction import collect_verb_obl_obj, collect_core, collect_nmod, collect_amod, collect_comp, collect_appos, show_dependency, lst_to_str

In [None]:
df_dehumanization['core_noun_verb'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_core(x)))
df_dehumanization['verb_obl_obj'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_verb_obl_obj(x)))
df_dehumanization['nmod'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_nmod(x)))
df_dehumanization['amod'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_amod(x)))
df_dehumanization['comp'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_comp(x)))

In [None]:
df_dehumanization.drop(columns='index', inplace=True)

In [None]:
df_dehumanization.head(10)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def get_logreg_best(col_list = ['text_lemmatized'], df=df_dehumanization, vectorizer = TfidfVectorizer(), random_state=42):
    df.loc[:, 'merged_col'] = df[col_list].apply(lambda x: ' '.join(x.astype(str)), axis=1)
    X_train_, X_test_, y_train, y_test = train_test_split(df["merged_col"], df["label"], test_size=0.2, random_state=random_state)
    X_train = vectorizer.fit_transform(X_train_)
    X_test = vectorizer.transform(X_test_)
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
    grid_search = GridSearchCV(LogisticRegression(solver='liblinear', random_state=random_state), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    logreg_best = grid_search.best_estimator_
    logreg_best.fit(X_train, y_train)
    y_pred = logreg_best.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    best_params = logreg_best.get_params()
    print("Best hyperparameters for the logistic regression model:")
    for param, value in best_params.items():
        print(f"{param}: {value}")

In [None]:
get_logreg_best()

In [None]:
get_logreg_best(col_list=['text_clean'])

In [None]:
get_logreg_best(col_list=['core_noun_verb'])

In [None]:
get_logreg_best(col_list=['verb_obl_obj'])

In [None]:
get_logreg_best(col_list=['nmod'])


In [None]:
get_logreg_best(col_list=['amod'])

In [None]:
get_logreg_best(col_list=['nmod', 'verb_obl_obj'])

In [None]:
get_logreg_best(col_list=['nmod', 'verb_obl_obj', 'core_noun_verb'])

In [None]:
get_logreg_best(col_list=['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod'])


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
get_logreg_best(col_list=['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_clean'])

In [None]:
get_logreg_best(col_list=['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean'])

In [None]:
df_dehumanization.columns

## Log Reg over lemmatized text plus (or exclusively) collocations (as separate features)

In [None]:
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class MultiColumnTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, **kwargs):
        self.columns = columns
        self.vectorizers = [TfidfVectorizer(**kwargs) for _ in columns]
        self.fitted_models = []

    def fit(self, X, y=None):
        for col, vec in zip(self.columns, self.vectorizers):
            vec.fit(X[col])
            self.fitted_models.append(vec.fit(X[col]))
        return self

    def transform(self, X):
        features = []
        for col, vec in zip(self.columns, self.vectorizers):
            features.append(vec.transform(X[col]))
        return hstack(features)


def train_logistic_regression(data, text_columns, label_column, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=0.2, random_state=random_state)
    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('regressor', LogisticRegression(solver='liblinear', random_state=random_state, max_iter=1000))
    ])

    param_grid = {
        'regressor__fit_intercept': [True, False],
        'regressor__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
        'regressor__penalty': ['l1', 'l2']
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train.values.ravel())
    logreg_best = grid_search.best_estimator_
    logreg_best.fit(X_train, y_train)

    y_pred = logreg_best.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    return logreg_best

In [None]:
text_columns = ['nmod', 'verb_obl_obj']
label_column = ['label']

In [None]:
%%time
trained_pipeline = train_logistic_regression(df_dehumanization, text_columns, label_column)

In [None]:
%%time
text_columns = ['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean']

trained_pipeline = train_logistic_regression(df_dehumanization, text_columns, label_column)

## Feature importance

### Feature importance of unique vectors

In [None]:
def get_feature_names(vectorizer):
    feature_names = []
    for col, vec in zip(vectorizer.columns, vectorizer.vectorizers):
        feature_names.extend([f"{col}_{f}" for f in vec.get_feature_names_out()])
    return feature_names

In [None]:
import numpy as np
def display_feature_importance(pipeline, n=10):
    vectorizer = pipeline.named_steps['vectorizer']
    classifier = pipeline.named_steps['regressor']
    feature_names = get_feature_names(vectorizer)
    coefficients = classifier.coef_[0]
    sorted_indices = np.argsort(np.abs(coefficients))[::-1]
    print(f"Top {n} important features:")
    for i in sorted_indices[:n]:
        print(f"{feature_names[i]}: {coefficients[i]}")

In [None]:
display_feature_importance(trained_pipeline)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd

tfidf_vectorizer = TfidfVectorizer(max_features=10000)

top_features = pd.DataFrame()

# looping over each text column and apply TF-IDF vectorization followed by chi-squared test
for col in text_columns:
    tfidf_features = tfidf_vectorizer.fit_transform(df_dehumanization[col])

    # chi-squared test to select the top-k features with the lowest p-values
    selector = SelectKBest(chi2, k=100)
    selector.fit(tfidf_features, df_dehumanization['label'])
    feature_scores = pd.DataFrame({
        'feature': tfidf_vectorizer.get_feature_names_out(),
        'p_value': selector.pvalues_,
    })

    # sorting the features by p-value and add the top-k features to the top_features
    top_k_features = feature_scores.sort_values(by='p_value').head(100)['feature']
    top_features[col] = top_k_features

final_features = pd.concat([top_features[col] for col in top_features.columns]).unique().tolist()

In [None]:
final_features[:20]

In [None]:
df = pd.DataFrame(final_features, columns=['final_features'])
df.to_json('most_important_features.json')

## Importance of columns (ie collocations and versions of pre-processing)


### Averaged importance

In [None]:
def get_average_importance(trained_pipeline):
    multi_column_tfidf_vectorizer = trained_pipeline.named_steps['vectorizer']
    logistic_regression = trained_pipeline.named_steps['regressor']
    coef = logistic_regression.coef_

    column_importance = pd.DataFrame()
    start = 0
    for i, col in enumerate(multi_column_tfidf_vectorizer.columns):
        vec = multi_column_tfidf_vectorizer.fitted_models[i]
        end = start + len(vec.get_feature_names_out())
        feature_scores = pd.DataFrame({
            'feature': vec.get_feature_names_out(),
            'importance': abs(coef[0][start:end])
        })
        column_importance[col] = feature_scores.set_index('feature')['importance']
        start = end

    mean_importance = column_importance.mean(axis=0).sort_values(ascending=False)
    return mean_importance

In [None]:
mean_importance = get_average_importance(trained_pipeline)
print(mean_importance)

### Importance of columns by permutation

In [None]:
from itertools import combinations, chain

text_columns = ['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean']

all_col_variations = list(chain.from_iterable(combinations(text_columns, r) for r in range(1, len(text_columns)+1)))
all_col_variations = [list(p) for p in all_col_variations]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MultiColumnTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, **kwargs):
        self.columns = columns
        self.vectorizers = [TfidfVectorizer(**kwargs) for _ in columns]
        self.fitted_models = []

    def fit(self, X, y=None):
        for col, vec in zip(self.columns, self.vectorizers):
            vec.fit(X[col])
            self.fitted_models.append(vec.fit(X[col]))
        return self

    def transform(self, X):
        features = []
        for col, vec in zip(self.columns, self.vectorizers):
            features.append(vec.transform(X[col]))
        return hstack(features)


def train_logistic_regression(data, text_columns, label_column, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=0.2, random_state=random_state)
    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('regressor', LogisticRegression(solver='liblinear', random_state=random_state, max_iter=2000))
    ])

    param_grid = {
        'regressor__fit_intercept': [True, False],
        'regressor__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
        'regressor__penalty': ['l1', 'l2']
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train.values.ravel())
    logreg_best = grid_search.best_estimator_
    logreg_best.fit(X_train, y_train)

    y_pred = logreg_best.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return logreg_best, [accuracy, precision, recall, f1]

In [None]:
%%time
all_trained_pipelines = []
all_results = []
for col_set in all_col_variations:
    current_pipe, results = train_logistic_regression(df_dehumanization, col_set, label_column)
    all_trained_pipelines.append(current_pipe)
    all_results.append(results)

In [None]:
idx_largest = max(range(len(all_results)), key=lambda i: all_results[i][-1])

next_largest = max([all_results[i][-1] for i in range(len(all_results)) if all_results[i][-1] < all_results[idx_largest][-1]])
idx_next_largest = max([i for i in range(len(all_results)) if all_results[i][-1] == next_largest])

print(idx_next_largest)

In [None]:
all_results[idx_largest]

In [None]:
all_trained_pipelines[idx_largest]

In [None]:
all_trained_pipelines[idx_next_largest]

In [None]:
all_results[idx_next_largest]

## Importance of columns (ie collocations and versions of pre-processing)


### Averaged importance

In [None]:
def get_average_importance(trained_pipeline):
    multi_column_tfidf_vectorizer = trained_pipeline.named_steps['vectorizer']
    logistic_regression = trained_pipeline.named_steps['regressor']
    coef = logistic_regression.coef_

    column_importance = pd.DataFrame()
    start = 0
    for i, col in enumerate(multi_column_tfidf_vectorizer.columns):
        vec = multi_column_tfidf_vectorizer.fitted_models[i]
        end = start + len(vec.get_feature_names_out())
        feature_scores = pd.DataFrame({
            'feature': vec.get_feature_names_out(),
            'importance': abs(coef[0][start:end])
        })
        column_importance[col] = feature_scores.set_index('feature')['importance']
        start = end

    mean_importance = column_importance.mean(axis=0).sort_values(ascending=False)
    return mean_importance

In [None]:
mean_importance = get_average_importance(trained_pipeline)
print(mean_importance)

### Importance of columns by permutation

In [None]:
from itertools import combinations, chain

text_columns = ['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean']

all_col_variations = list(chain.from_iterable(combinations(text_columns, r) for r in range(1, len(text_columns)+1)))
all_col_variations = [list(p) for p in all_col_variations]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MultiColumnTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, **kwargs):
        self.columns = columns
        self.vectorizers = [TfidfVectorizer(**kwargs) for _ in columns]
        self.fitted_models = []

    def fit(self, X, y=None):
        for col, vec in zip(self.columns, self.vectorizers):
            vec.fit(X[col])
            self.fitted_models.append(vec.fit(X[col]))
        return self

    def transform(self, X):
        features = []
        for col, vec in zip(self.columns, self.vectorizers):
            features.append(vec.transform(X[col]))
        return hstack(features)


def train_logistic_regression(data, text_columns, label_column, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=0.2, random_state=random_state)
    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('regressor', LogisticRegression(solver='liblinear', random_state=random_state, max_iter=2000))
    ])

    param_grid = {
        'regressor__fit_intercept': [True, False],
        'regressor__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
        'regressor__penalty': ['l1', 'l2']
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train.values.ravel())
    logreg_best = grid_search.best_estimator_
    logreg_best.fit(X_train, y_train)

    y_pred = logreg_best.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return logreg_best, [accuracy, precision, recall, f1]

In [None]:
%%time
all_trained_pipelines = []
all_results = []
for col_set in all_col_variations:
    current_pipe, results = train_logistic_regression(df_dehumanization, col_set, label_column)
    all_trained_pipelines.append(current_pipe)
    all_results.append(results)

In [None]:
idx_largest = max(range(len(all_results)), key=lambda i: all_results[i][-1])

next_largest = max([all_results[i][-1] for i in range(len(all_results)) if all_results[i][-1] < all_results[idx_largest][-1]])
idx_next_largest = max([i for i in range(len(all_results)) if all_results[i][-1] == next_largest])

print(idx_next_largest)

In [None]:
all_results[idx_largest]

In [None]:
all_trained_pipelines[idx_largest]

In [None]:
all_trained_pipelines[idx_next_largest]

In [None]:
all_results[idx_next_largest]

## SVM

### Averaged importance

In [62]:
def get_average_importance(trained_pipeline):
    multi_column_tfidf_vectorizer = trained_pipeline.named_steps['vectorizer']
    logistic_regression = trained_pipeline.named_steps['regressor']
    coef = logistic_regression.coef_

    column_importance = pd.DataFrame()
    start = 0
    for i, col in enumerate(multi_column_tfidf_vectorizer.columns):
        vec = multi_column_tfidf_vectorizer.fitted_models[i]
        end = start + len(vec.get_feature_names_out())
        feature_scores = pd.DataFrame({
            'feature': vec.get_feature_names_out(),
            'importance': abs(coef[0][start:end])
        })
        column_importance[col] = feature_scores.set_index('feature')['importance']
        start = end

    mean_importance = column_importance.mean(axis=0).sort_values(ascending=False)
    return mean_importance

In [63]:
mean_importance = get_average_importance(trained_pipeline)
print(mean_importance)

core_noun_verb     0.259424
text_lemmatized    0.249534
amod               0.235747
nmod               0.207353
verb_obl_obj       0.189647
text_clean         0.162578
dtype: float64


### Importance of columns by permutation

In [70]:
from itertools import combinations, chain

text_columns = ['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean']

all_col_variations = list(chain.from_iterable(combinations(text_columns, r) for r in range(1, len(text_columns)+1)))
all_col_variations = [list(p) for p in all_col_variations]

In [71]:
from sklearn.base import BaseEstimator, TransformerMixin

class MultiColumnTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, **kwargs):
        self.columns = columns
        self.vectorizers = [TfidfVectorizer(**kwargs) for _ in columns]
        self.fitted_models = []

    def fit(self, X, y=None):
        for col, vec in zip(self.columns, self.vectorizers):
            vec.fit(X[col])
            self.fitted_models.append(vec.fit(X[col]))
        return self

    def transform(self, X):
        features = []
        for col, vec in zip(self.columns, self.vectorizers):
            features.append(vec.transform(X[col]))
        return hstack(features)


def train_logistic_regression(data, text_columns, label_column, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=0.2, random_state=random_state)
    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('regressor', LogisticRegression(solver='liblinear', random_state=random_state, max_iter=2000))
    ])

    param_grid = {
        'regressor__fit_intercept': [True, False],
        'regressor__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
        'regressor__penalty': ['l1', 'l2']
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train.values.ravel())
    logreg_best = grid_search.best_estimator_
    logreg_best.fit(X_train, y_train)

    y_pred = logreg_best.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return logreg_best, [accuracy, precision, recall, f1]

In [72]:
%%time
all_trained_pipelines = []
all_results = []
for col_set in all_col_variations:
    current_pipe, results = train_logistic_regression(df_dehumanization, col_set, label_column)
    all_trained_pipelines.append(current_pipe)
    all_results.append(results)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

CPU times: user 4h 42min 44s, sys: 44min 38s, total: 5h 27min 22s
Wall time: 3h 46min 41s


  y = column_or_1d(y, warn=True)


In [73]:
idx_largest = max(range(len(all_results)), key=lambda i: all_results[i][-1])

next_largest = max([all_results[i][-1] for i in range(len(all_results)) if all_results[i][-1] < all_results[idx_largest][-1]])
idx_next_largest = max([i for i in range(len(all_results)) if all_results[i][-1] == next_largest])

print(idx_next_largest)

36


In [74]:
all_results[idx_largest]

[0.8074712643678161,
 0.8193979933110368,
 0.7538461538461538,
 0.7852564102564102]

In [75]:
all_trained_pipelines[idx_largest]

In [76]:
all_trained_pipelines[idx_next_largest]

In [77]:
all_results[idx_next_largest]

[0.8074712643678161,
 0.8281786941580757,
 0.7415384615384616,
 0.7824675324675325]

## SVM

In [79]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV


In [80]:
label_column = ['label']

In [95]:
class MultiColumnTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, **kwargs):
        self.columns = columns
        self.vectorizers = [TfidfVectorizer(**kwargs) for _ in columns]
        self.fitted_models = []

    def fit(self, X, y=None):
        for col, vec in zip(self.columns, self.vectorizers):
            vec.fit(X[col])
            self.fitted_models.append(vec.fit(X[col]))
        return self

    def transform(self, X):
        features = []
        for col, vec in zip(self.columns, self.vectorizers):
            features.append(vec.transform(X[col]))
        return hstack(features)


def train_svm(data, text_columns, label_column, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=0.2, random_state=random_state)

    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('classifier', SVC(kernel='linear', random_state=random_state))
    ])

    param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
        # 'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train.values.ravel())
    svm_best = grid_search.best_estimator_
    svm_best.fit(X_train, y_train)

    y_pred = svm_best.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return svm_best, [accuracy, precision, recall, f1]


In [113]:
%%time
all_trained_pipelines = []
all_results = []
for col_set in all_col_variations:
    current_pipe, results = train_svm(df_dehumanization, col_set, label_column)
    all_trained_pipelines.append(current_pipe)
    all_results.append(results)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

CPU times: user 39min 39s, sys: 14.6 s, total: 39min 54s
Wall time: 40min 3s


In [114]:
idx_largest = max(range(len(all_results)), key=lambda i: all_results[i][-1])

next_largest = max([all_results[i][-1] for i in range(len(all_results)) if all_results[i][-1] < all_results[idx_largest][-1]])
idx_next_largest = max([i for i in range(len(all_results)) if all_results[i][-1] == next_largest])

print(idx_next_largest)

39


In [115]:
all_results[idx_largest]

[0.7945402298850575,
 0.8053691275167785,
 0.7384615384615385,
 0.7704654895666131]

In [116]:
all_trained_pipelines[idx_largest]

In [117]:
all_trained_pipelines[idx_largest].get_params()

{'memory': None,
 'steps': [('vectorizer',
   MultiColumnTfidfVectorizer(columns=['text_lemmatized'])),
  ('classifier', SVC(C=1, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['text_lemmatized']),
 'classifier': SVC(C=1, kernel='linear', random_state=42),
 'vectorizer__columns': ['text_lemmatized'],
 'classifier__C': 1,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

{'memory': None,
 'steps': [('vectorizer',
   MultiColumnTfidfVectorizer(columns=['text_lemmatized'])),
  ('classifier', SVC(C=1, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['text_lemmatized']),
 'classifier': SVC(C=1, kernel='linear', random_state=42),
 'vectorizer__columns': ['text_lemmatized'],
 'classifier__C': 1,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

## With augmentation

In [57]:
import pandas as pd
# df_rm = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/augmentation/augm_removed_dehumanization.csv', index_col=[0])

In [58]:
from collocation_extraction import collect_verb_obl_obj, collect_core, collect_nmod, collect_amod, collect_comp, collect_appos, show_dependency, lst_to_str
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline


In [60]:
# df_add = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/augmentation/augm_added_dehumanization.csv', index_col=[0])
df_augmentation = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/augmentation/augm_neutal_in_class_sm.csv', index_col=[0])

In [122]:
# df_augmentation = pd.concat([df_rm,df_add])
# df_augmentation = df_add.copy()

In [123]:
# def preprocess_df_(df):
#     df['text_clean'] = df['fixed_sentences'].apply(lambda x: clean_text(x.lower(), cyrillic_letters))
#     df['text_lemmatized'] = df['text_clean'].apply(lambda x: lemmatize_spacy(x))
#     df=df[df['text_clean']!='']
#     df.reset_index(inplace=True)
#     return df

In [62]:
from itertools import combinations, chain

text_columns = ['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean']

all_col_variations = list(chain.from_iterable(combinations(text_columns, r) for r in range(1, len(text_columns)+1)))
all_col_variations = [list(p) for p in all_col_variations]

In [63]:
label_column = ['label']

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
%%time

df_augmentation = preprocess_df(df_augmentation, 'replaced_neutral_sent_ukrainians').copy()

CPU times: user 2.12 s, sys: 72.1 ms, total: 2.19 s
Wall time: 2.35 s


In [68]:
df_augmentation['core_noun_verb'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_core(x)))
df_augmentation['verb_obl_obj'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_verb_obl_obj(x)))
df_augmentation['nmod'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_nmod(x)))
df_augmentation['amod'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_amod(x)))
df_augmentation['comp'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_comp(x)))

In [69]:
# df_dehumanization['core_noun_verb'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_core(x)))
# df_dehumanization['verb_obl_obj'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_verb_obl_obj(x)))
# df_dehumanization['nmod'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_nmod(x)))
# df_dehumanization['amod'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_amod(x)))
# df_dehumanization['comp'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_comp(x)))

In [70]:
df_augmentation['label']=0

In [71]:
def train_augmented_svm(data, augmentation_data, text_columns, label_column, random_state=42):
    total_data_size = len(data) + len(augmentation_data)
    desired_test_size = int(total_data_size * 0.2)
    adjusted_test_size_ratio = desired_test_size / len(data)

    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=adjusted_test_size_ratio, random_state=random_state)

    X_train_augmented = pd.concat([X_train, augmentation_data[text_columns]], axis=0, join='outer', ignore_index=True)
    y_train_augmented = pd.concat([y_train, augmentation_data[label_column]], axis=0, join='outer', ignore_index=True)

    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('classifier', SVC(kernel='linear', random_state=random_state))
    ])

    param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train_augmented, y_train_augmented.values.ravel())
    svm_best = grid_search.best_estimator_
    svm_best.fit(X_train_augmented, y_train_augmented.values.ravel())

    y_pred = svm_best.predict(X_test)
    accuracy = accuracy_score(y_test.values.ravel(), y_pred)
    precision = precision_score(y_test.values.ravel(), y_pred)
    recall = recall_score(y_test.values.ravel(), y_pred)
    f1 = f1_score(y_test.values.ravel(), y_pred)

    return svm_best, [accuracy, precision, recall, f1]


In [None]:
%%time
all_trained_pipelines_svm = []
all_results_svm = []
for col_set in all_col_variations:
    current_pipe, results = train_augmented_svm(df_dehumanization, df_augmentation, col_set, label_column)
    all_trained_pipelines_svm.append(current_pipe)
    all_results_svm.append(results)

In [None]:
idx_largest = max(range(len(all_results_svm)), key=lambda i: all_results_svm[i][-1])

next_largest = max([all_results_svm[i][-1] for i in range(len(all_results_svm)) if all_results_svm[i][-1] < all_results_svm[idx_largest][-1]])
idx_next_largest = max([i for i in range(len(all_results_svm)) if all_results_svm[i][-1] == next_largest])

print(idx_next_largest)

In [None]:
print(idx_largest)

In [None]:
all_results_svm[idx_largest]


In [None]:
all_trained_pipelines_svm[idx_largest].get_params()

## Separate metrics

In [137]:
# define function that separates the testing data
# y_target_labels_test, y_general_test
import pandas as pd
import ast
import numpy as np
from gensim.models import Word2Vec
from datetime import timedelta

In [138]:
model_path = "/Users/katerynaburovova/PycharmProjects/dehumanization/w2v_models/final_models/full_dataset_word2vec_correct.model"
gensim_model = Word2Vec.load(model_path)

In [139]:
def load_dictionary_from_file(file_name):
    with np.load(file_name) as data:
        return {key: data[key] for key in data}

In [140]:
centroids_dict = load_dictionary_from_file('centroids_dict.npz')

In [141]:
# def find_closest_words_for_single_model(model_path: str, given_vector, topn: int = 10):
#     try:
#         model = Word2Vec.load(model_path)
#         closest_words = model.wv.similar_by_vector(given_vector, topn=topn)
#         return closest_words
#     except Exception as e:
#         print(f"Error processing model {model_path}: {e}")
#         return []

In [142]:
%%time

closest_words = []

for vector in centroids_dict.values():
    closest_words_vector = gensim_model.wv.similar_by_vector(vector, topn=20)
    closest_words.append(closest_words_vector)

closest_words = [num for sublist in closest_words for num in sublist]

CPU times: user 1.99 s, sys: 1.05 s, total: 3.05 s
Wall time: 1 s


In [143]:
len(closest_words)

240

In [145]:
closest_words[:20]

[('укр', 0.9792277216911316),
 ('укро', 0.62953782081604),
 ('насмикав', 0.5467818975448608),
 ('зрадная', 0.5411729216575623),
 ('кляти', 0.5411087870597839),
 ('мюмзики', 0.5390231013298035),
 ('укроканалы', 0.526896595954895),
 ('цханоская', 0.5222957730293274),
 ('венерична', 0.5210613012313843),
 ('ефрв', 0.5089078545570374),
 ('пдрвля', 0.5057548880577087),
 ('мыкола', 0.5055790543556213),
 ('укропской', 0.5048384070396423),
 ('шароварный', 0.5024336576461792),
 ('укрорейха', 0.5013911128044128),
 ('хохлы', 0.498997300863266),
 ('двщ', 0.49575403332710266),
 ('увага', 0.49416372179985046),
 ('укрсми', 0.49142953753471375),
 ('укросми', 0.49056413769721985)]

In [37]:
dehumanizing_target_labels = ['укрорейха', 'нацистка', 'укропитеки', 'свинособаки', 'бандерлоги', 'свинорылых', 'укронацистов', 'укропитеков', 'укронацистская', 'укровермахта']

In [38]:
dehumanizing_target_sequences = ['рейх', 'нацист', 'питек', 'бандерло', 'свино', 'вермахт', 'питек']

In [35]:
import re

def contains_target_sequence(text, target_sequences):
    # text = text.decode("utf-8")
    for seq in target_sequences:
        if re.search(seq, text, re.IGNORECASE):
            return True
    return False

In [36]:
def split_test_data_by_target_sequences(X, y, text_columns, target_sequences):
    contains_seq = X.apply(lambda row: any(contains_target_sequence(row[col], target_sequences) for col in text_columns), axis=1)
    X_pos, y_pos = X[contains_seq], y[contains_seq]
    X_neg, y_neg = X[~contains_seq], y[~contains_seq]

    return X_pos, y_pos, X_neg, y_neg

## Model selection based on the combined F1

In [152]:
def train_augmented_svm(data, augmentation_data, text_columns, label_column, dehumanizing_target_sequences, random_state=42):
    total_data_size = len(data) + len(augmentation_data)
    desired_test_size = int(total_data_size * 0.2)
    adjusted_test_size_ratio = desired_test_size / len(data)

    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=adjusted_test_size_ratio, random_state=random_state)

    X_train_augmented = pd.concat([X_train, augmentation_data[text_columns]], axis=0, join='outer', ignore_index=True)
    y_train_augmented = pd.concat([y_train, augmentation_data[label_column]], axis=0, join='outer', ignore_index=True)

    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('classifier', SVC(kernel='linear', random_state=random_state))
    ])

    param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train_augmented, y_train_augmented.values.ravel())
    svm_best = grid_search.best_estimator_
    svm_best.fit(X_train_augmented, y_train_augmented.values.ravel())

    X_test_pos, y_test_pos, X_test_neg, y_test_neg = split_test_data_by_target_sequences(X_test, y_test, text_columns, dehumanizing_target_sequences)

    y_pred_pos = svm_best.predict(X_test_pos)
    accuracy_pos = accuracy_score(y_test_pos, y_pred_pos)
    precision_pos = precision_score(y_test_pos, y_pred_pos)
    recall_pos = recall_score(y_test_pos, y_pred_pos)
    f1_pos = f1_score(y_test_pos, y_pred_pos)

    y_pred_neg = svm_best.predict(X_test_neg)
    accuracy_neg = accuracy_score(y_test_neg, y_pred_neg)
    precision_neg = precision_score(y_test_neg, y_pred_neg)
    recall_neg = recall_score(y_test_neg, y_pred_neg)
    f1_neg = f1_score(y_test_neg, y_pred_neg)

    return svm_best, [(accuracy_pos, precision_pos, recall_pos, f1_pos), (accuracy_neg, precision_neg, recall_neg, f1_neg)]

In [153]:
%%time
all_trained_pipelines_svm = []
all_results_svm = []
for col_set in all_col_variations:
    current_pipe, (results_pos, results_neg) = train_augmented_svm(df_dehumanization, df_augmentation, col_set, label_column, dehumanizing_target_sequences)
    all_trained_pipelines_svm.append(current_pipe)
    all_results_svm.append({"results_pos": results_pos, "results_neg": results_neg})

CPU times: user 56min 11s, sys: 18.4 s, total: 56min 29s
Wall time: 56min 42s


In [154]:
idx_largest_pos = max(range(len(all_results_svm)), key=lambda i: all_results_svm[i]["results_pos"][-1])
idx_largest_neg = max(range(len(all_results_svm)), key=lambda i: all_results_svm[i]["results_neg"][-1])

next_largest_pos = max([all_results_svm[i]["results_pos"][-1] for i in range(len(all_results_svm)) if all_results_svm[i]["results_pos"][-1] < all_results_svm[idx_largest_pos]["results_pos"][-1]])
idx_next_largest_pos = max([i for i in range(len(all_results_svm)) if all_results_svm[i]["results_pos"][-1] == next_largest_pos])

next_largest_neg = max([all_results_svm[i]["results_neg"][-1] for i in range(len(all_results_svm)) if all_results_svm[i]["results_neg"][-1] < all_results_svm[idx_largest_neg]["results_neg"][-1]])
idx_next_largest_neg = max([i for i in range(len(all_results_svm)) if all_results_svm[i]["results_neg"][-1] == next_largest_neg])

print(idx_next_largest_pos)
print(idx_next_largest_neg)


20
40


In [59]:
idx_largest_pos

0

In [60]:
idx_largest_neg

9

In [135]:
all_results_svm[idx_largest_pos]

{'results_pos': (0.8928571428571429,
  0.9864864864864865,
  0.9012345679012346,
  0.9419354838709678),
 'results_neg': (0.6416772554002541,
  0.6742424242424242,
  0.27134146341463417,
  0.3869565217391305)}

In [155]:
all_trained_pipelines_svm[idx_largest_pos].get_params()

{'memory': None,
 'steps': [('vectorizer',
   MultiColumnTfidfVectorizer(columns=['core_noun_verb'])),
  ('classifier', SVC(C=10, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['core_noun_verb']),
 'classifier': SVC(C=10, kernel='linear', random_state=42),
 'vectorizer__columns': ['core_noun_verb'],
 'classifier__C': 10,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

In [156]:
all_results_svm[idx_largest_neg]


{'results_pos': (0.8745980707395499,
  0.9807692307692307,
  0.8823529411764706,
  0.9289617486338797),
 'results_neg': (0.85, 0.78125, 0.4166666666666667, 0.5434782608695653)}

In [157]:
all_trained_pipelines_svm[idx_largest_neg].get_params()

{'memory': None,
 'steps': [('vectorizer',
   MultiColumnTfidfVectorizer(columns=['text_lemmatized', 'text_clean'])),
  ('classifier', SVC(C=100, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['text_lemmatized', 'text_clean']),
 'classifier': SVC(C=100, kernel='linear', random_state=42),
 'vectorizer__columns': ['text_lemmatized', 'text_clean'],
 'classifier__C': 100,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

## Non augmented

In [158]:
def train_svm(data, text_columns, label_column, dehumanizing_target_sequences, random_state=42):

    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=0.2, random_state=random_state)

    pipeline = Pipeline([
        ('vectorizer', MultiColumnTfidfVectorizer(columns=text_columns)),
        ('classifier', SVC(kernel='linear', random_state=random_state))
    ])

    param_grid = {
        'classifier__C': [0.001, 0.01, 0.1, 0.5, 1, 2, 10, 100],
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train.values.ravel())
    svm_best = grid_search.best_estimator_
    svm_best.fit(X_train, y_train.values.ravel())

    X_test_pos, y_test_pos, X_test_neg, y_test_neg = split_test_data_by_target_sequences(X_test, y_test, text_columns, dehumanizing_target_sequences)

    y_pred_pos = svm_best.predict(X_test_pos)
    accuracy_pos = accuracy_score(y_test_pos, y_pred_pos)
    precision_pos = precision_score(y_test_pos, y_pred_pos)
    recall_pos = recall_score(y_test_pos, y_pred_pos)
    f1_pos = f1_score(y_test_pos, y_pred_pos)

    y_pred_neg = svm_best.predict(X_test_neg)
    accuracy_neg = accuracy_score(y_test_neg, y_pred_neg)
    precision_neg = precision_score(y_test_neg, y_pred_neg)
    recall_neg = recall_score(y_test_neg, y_pred_neg)
    f1_neg = f1_score(y_test_neg, y_pred_neg)

    return svm_best, [(accuracy_pos, precision_pos, recall_pos, f1_pos), (accuracy_neg, precision_neg, recall_neg, f1_neg)]

In [159]:
%%time
all_trained_pipelines_svm_reg = []
all_results_svm_reg = []
for col_set in all_col_variations:
    current_pipe_reg, (results_pos, results_neg) = train_svm(df_dehumanization, col_set, label_column, dehumanizing_target_sequences)
    all_trained_pipelines_svm_reg.append(current_pipe_reg)
    all_results_svm_reg.append({"results_pos": results_pos, "results_neg": results_neg})

CPU times: user 39min 42s, sys: 14.3 s, total: 39min 56s
Wall time: 40min 6s


In [160]:
idx_largest_pos = max(range(len(all_results_svm_reg)), key=lambda i: all_results_svm_reg[i]["results_pos"][-1])
idx_largest_neg = max(range(len(all_results_svm_reg)), key=lambda i: all_results_svm_reg[i]["results_neg"][-1])

next_largest_pos = max([all_results_svm_reg[i]["results_pos"][-1] for i in range(len(all_results_svm_reg)) if all_results_svm_reg[i]["results_pos"][-1] < all_results_svm_reg[idx_largest_pos]["results_pos"][-1]])
idx_next_largest_pos = max([i for i in range(len(all_results_svm_reg)) if all_results_svm_reg[i]["results_pos"][-1] == next_largest_pos])

next_largest_neg = max([all_results_svm_reg[i]["results_neg"][-1] for i in range(len(all_results_svm_reg)) if all_results_svm_reg[i]["results_neg"][-1] < all_results_svm_reg[idx_largest_neg]["results_neg"][-1]])
idx_next_largest_neg = max([i for i in range(len(all_results_svm_reg)) if all_results_svm_reg[i]["results_neg"][-1] == next_largest_neg])

print(idx_next_largest_pos)
print(idx_next_largest_neg)

7
11


In [161]:
print(idx_largest_pos)
print(idx_largest_neg)

2
6


In [162]:
all_trained_pipelines_svm[idx_largest_pos].get_params()

{'memory': None,
 'steps': [('vectorizer',
   MultiColumnTfidfVectorizer(columns=['core_noun_verb'])),
  ('classifier', SVC(C=10, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['core_noun_verb']),
 'classifier': SVC(C=10, kernel='linear', random_state=42),
 'vectorizer__columns': ['core_noun_verb'],
 'classifier__C': 10,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

In [163]:
all_results_svm[idx_largest_pos]

{'results_pos': (0.8928571428571429,
  0.9864864864864865,
  0.9012345679012346,
  0.9419354838709678),
 'results_neg': (0.6416772554002541,
  0.6742424242424242,
  0.27134146341463417,
  0.3869565217391305)}

In [164]:
all_trained_pipelines_svm[idx_largest_neg].get_params()

{'memory': None,
 'steps': [('vectorizer',
   MultiColumnTfidfVectorizer(columns=['nmod', 'verb_obl_obj'])),
  ('classifier', SVC(C=100, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['nmod', 'verb_obl_obj']),
 'classifier': SVC(C=100, kernel='linear', random_state=42),
 'vectorizer__columns': ['nmod', 'verb_obl_obj'],
 'classifier__C': 100,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

In [165]:
all_results_svm[idx_largest_neg]

{'results_pos': (0.8074534161490683,
  0.9615384615384616,
  0.8278145695364238,
  0.8896797153024911),
 'results_neg': (0.7281690140845071,
  0.7338129496402878,
  0.3953488372093023,
  0.5138539042821159)}

## Experiments with other feature extraction methods

## Using word2vec

In [17]:
from itertools import combinations, chain

text_columns = ['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean']

all_col_variations = list(chain.from_iterable(combinations(text_columns, r) for r in range(1, len(text_columns)+1)))
all_col_variations = [list(p) for p in all_col_variations]

In [18]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec

class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, word2vec_model, aggregation_func=np.mean):
        self.word2vec_model = word2vec_model
        self.aggregation_func = aggregation_func

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def document_vector(document):
            word_vectors = [self.word2vec_model.wv[word] for word in document.split() if word in self.word2vec_model.wv]
            return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(self.word2vec_model.vector_size)

        X_concatenated = X.apply(lambda x: ' '.join(x.dropna()), axis=1)
        return np.array([document_vector(doc) for doc in X_concatenated])



In [19]:
from sklearn.model_selection import train_test_split
from collocation_extraction import collect_verb_obl_obj, collect_core, collect_nmod, collect_amod, collect_comp, collect_appos, show_dependency, lst_to_str
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline

label_column = ['label']
dehumanizing_target_sequences = ['рейх', 'нацист', 'питек', 'бандерло', 'свино', 'вермахт', 'питек']

In [20]:
df_dehumanization['core_noun_verb'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_core(x)))
df_dehumanization['verb_obl_obj'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_verb_obl_obj(x)))
df_dehumanization['nmod'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_nmod(x)))
df_dehumanization['amod'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_amod(x)))
df_dehumanization['comp'] = df_dehumanization['text'].apply(lambda x: lst_to_str(collect_comp(x)))
# df_dehumanization.drop(columns='index', inplace=True)

In [21]:
word2vec_model_path = "/Users/katerynaburovova/PycharmProjects/dehumanization/w2v_models/final_models/full_dataset_word2vec_correct.model"

In [22]:
w2v_model = Word2Vec.load(word2vec_model_path)

In [23]:
def train_augmented_svm(data, augmentation_data, w2v_model, text_columns, label_column, dehumanizing_target_sequences,random_state=42):
    total_data_size = len(data) + len(augmentation_data)
    desired_test_size = int(total_data_size * 0.2)
    adjusted_test_size_ratio = desired_test_size / len(data)

    X_train, X_test, y_train, y_test = train_test_split(data[text_columns], data[label_column], test_size=adjusted_test_size_ratio, random_state=random_state)

    X_train_augmented = pd.concat([X_train, augmentation_data[text_columns]], axis=0, join='outer', ignore_index=True)
    y_train_augmented = pd.concat([y_train, augmentation_data[label_column]], axis=0, join='outer', ignore_index=True)

    # w2v_model = Word2Vec.load(word2vec_model_path)

    pipeline = Pipeline([
        ('vectorizer', Word2VecVectorizer(w2v_model)),
        ('classifier', SVC(kernel='linear', random_state=random_state))
    ])

    param_grid = {
        'classifier__C': [0.1, 1, 10, 100],
    }

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train_augmented, y_train_augmented.values.ravel())
    svm_best = grid_search.best_estimator_
    svm_best.fit(X_train_augmented, y_train_augmented.values.ravel())

    X_test_pos, y_test_pos, X_test_neg, y_test_neg = split_test_data_by_target_sequences(X_test, y_test, text_columns, dehumanizing_target_sequences)

    y_pred_pos = svm_best.predict(X_test_pos)
    accuracy_pos = accuracy_score(y_test_pos, y_pred_pos)
    precision_pos = precision_score(y_test_pos, y_pred_pos)
    recall_pos = recall_score(y_test_pos, y_pred_pos)
    f1_pos = f1_score(y_test_pos, y_pred_pos)

    y_pred_neg = svm_best.predict(X_test_neg)
    accuracy_neg = accuracy_score(y_test_neg, y_pred_neg)
    precision_neg = precision_score(y_test_neg, y_pred_neg)
    recall_neg = recall_score(y_test_neg, y_pred_neg)
    f1_neg = f1_score(y_test_neg, y_pred_neg)

    return svm_best, [(accuracy_pos, precision_pos, recall_pos, f1_pos), (accuracy_neg, precision_neg, recall_neg, f1_neg)]

In [39]:
%%time
all_trained_pipelines_svm_w2v = []
all_results_svm_w2v = []
for col_set in all_col_variations:
    current_pipe, (results_pos, results_neg) = train_augmented_svm(df_dehumanization, df_augmentation, w2v_model, col_set, label_column,dehumanizing_target_sequences)
    all_trained_pipelines_svm_w2v.append(current_pipe)
    all_results_svm_w2v.append({"results_pos": results_pos, "results_neg": results_neg})
    print(f'Done with {col_set}')

Done with ['nmod']
Done with ['verb_obl_obj']
Done with ['core_noun_verb']
Done with ['amod']
Done with ['text_lemmatized']




Done with ['text_clean']
Done with ['nmod', 'verb_obl_obj']
Done with ['nmod', 'core_noun_verb']
Done with ['nmod', 'amod']
Done with ['nmod', 'text_lemmatized']




Done with ['nmod', 'text_clean']
Done with ['verb_obl_obj', 'core_noun_verb']
Done with ['verb_obl_obj', 'amod']
Done with ['verb_obl_obj', 'text_lemmatized']
Done with ['verb_obl_obj', 'text_clean']


PicklingError: Could not pickle the task to send it to the workers.

In [None]:
idx_largest_pos = max(range(len(all_results_svm_w2v)), key=lambda i: all_results_svm_w2v[i]["results_pos"][-1])
idx_largest_neg = max(range(len(all_results_svm_w2v)), key=lambda i: all_results_svm_w2v[i]["results_neg"][-1])

next_largest_pos = max([all_results_svm_w2v[i]["results_pos"][-1] for i in range(len(all_results_svm_w2v)) if all_results_svm_w2v[i]["results_pos"][-1] < all_results_svm_w2v[idx_largest_pos]["results_pos"][-1]])
idx_next_largest_pos = max([i for i in range(len(all_results_svm_w2v)) if all_results_svm_w2v[i]["results_pos"][-1] == next_largest_pos])

next_largest_neg = max([all_results_svm_w2v[i]["results_neg"][-1] for i in range(len(all_results_svm_w2v)) if all_results_svm_w2v[i]["results_neg"][-1] < all_results_svm_w2v[idx_largest_neg]["results_neg"][-1]])
idx_next_largest_neg = max([i for i in range(len(all_results_svm_w2v)) if all_results_svm_w2v[i]["results_neg"][-1] == next_largest_neg])

print(idx_next_largest_pos)
print(idx_next_largest_neg)

## Rerunnig best models with 2nd batch

## Augmented SVM

In [13]:
df_2nd = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/annotation/final_labels_2nd_batch.csv', index_col=0)

In [14]:
df_dehumanization_2nd = df_2nd[['Dehumanization', 'text']].reset_index(drop=True)

In [19]:
%%time

df_dehumanization_2nd = df_dehumanization_2nd[df_dehumanization_2nd['Dehumanization']!='не можу визначитись з правильною відповіддю']
df_dehumanization_2nd['label'] = df_dehumanization_2nd['Dehumanization'].apply(lambda x: 0 if x=='ні' else 1)
df_dehumanization_2nd = preprocess_df(df_dehumanization_2nd)

CPU times: user 10.6 s, sys: 272 ms, total: 10.9 s
Wall time: 11.2 s


In [22]:
df_dehumanization_full = pd.concat([df_dehumanization, df_dehumanization_2nd], ignore_index=True)

In [26]:
from collocation_extraction import collect_verb_obl_obj, collect_core, collect_nmod, collect_amod, collect_comp, collect_appos, show_dependency, lst_to_str

df_dehumanization_full['core_noun_verb'] = df_dehumanization_full['text'].apply(lambda x: lst_to_str(collect_core(x)))
df_dehumanization_full['verb_obl_obj'] = df_dehumanization_full['text'].apply(lambda x: lst_to_str(collect_verb_obl_obj(x)))
df_dehumanization_full['nmod'] = df_dehumanization_full['text'].apply(lambda x: lst_to_str(collect_nmod(x)))
df_dehumanization_full['amod'] = df_dehumanization_full['text'].apply(lambda x: lst_to_str(collect_amod(x)))
df_dehumanization_full['comp'] = df_dehumanization_full['text'].apply(lambda x: lst_to_str(collect_comp(x)))

In [25]:
df_augmentation = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/augmentation/augm_neutal_in_class_sm.csv', index_col=[0])

In [26]:
from itertools import combinations, chain
text_columns = ['nmod', 'verb_obl_obj', 'core_noun_verb', 'amod', 'text_lemmatized', 'text_clean']

all_col_variations = list(chain.from_iterable(combinations(text_columns, r) for r in range(1, len(text_columns)+1)))
all_col_variations = [list(p) for p in all_col_variations]

In [27]:
%%time

df_augmentation = preprocess_df(df_augmentation, 'replaced_neutral_sent_ukrainians').copy()
df_augmentation['core_noun_verb'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_core(x)))
df_augmentation['verb_obl_obj'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_verb_obl_obj(x)))
df_augmentation['nmod'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_nmod(x)))
df_augmentation['amod'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_amod(x)))
df_augmentation['comp'] = df_augmentation['text_clean'].apply(lambda x: lst_to_str(collect_comp(x)))

df_augmentation['label']=0

CPU times: user 36.5 s, sys: 352 ms, total: 36.8 s
Wall time: 37 s


In [28]:
label_column = ['label']

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV


dehumanizing_target_sequences = ['рейх', 'нацист', 'питек', 'бандерло', 'свино', 'вермахт', 'питек']

In [48]:
%%time
all_trained_pipelines_svm = []
all_results_svm = []
for col_set in all_col_variations:
    current_pipe, (results_pos, results_neg) = train_augmented_svm(df_dehumanization_full, df_augmentation, col_set, label_column, dehumanizing_target_sequences)
    all_trained_pipelines_svm.append(current_pipe)
    all_results_svm.append({"results_pos": results_pos, "results_neg": results_neg})

CPU times: user 1h 30min 23s, sys: 32.9 s, total: 1h 30min 56s
Wall time: 4h 53min 47s


In [49]:
idx_largest_pos = max(range(len(all_results_svm)), key=lambda i: all_results_svm[i]["results_pos"][-1])
idx_largest_neg = max(range(len(all_results_svm)), key=lambda i: all_results_svm[i]["results_neg"][-1])

next_largest_pos = max([all_results_svm[i]["results_pos"][-1] for i in range(len(all_results_svm)) if all_results_svm[i]["results_pos"][-1] < all_results_svm[idx_largest_pos]["results_pos"][-1]])
idx_next_largest_pos = max([i for i in range(len(all_results_svm)) if all_results_svm[i]["results_pos"][-1] == next_largest_pos])

next_largest_neg = max([all_results_svm[i]["results_neg"][-1] for i in range(len(all_results_svm)) if all_results_svm[i]["results_neg"][-1] < all_results_svm[idx_largest_neg]["results_neg"][-1]])
idx_next_largest_neg = max([i for i in range(len(all_results_svm)) if all_results_svm[i]["results_neg"][-1] == next_largest_neg])

print(idx_next_largest_pos)
print(idx_next_largest_neg)


40
4


In [61]:
all_trained_pipelines_svm[idx_largest_pos].get_params()

{'memory': None,
 'steps': [('vectorizer', MultiColumnTfidfVectorizer(columns=['nmod'])),
  ('classifier', SVC(C=10, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['nmod']),
 'classifier': SVC(C=10, kernel='linear', random_state=42),
 'vectorizer__columns': ['nmod'],
 'classifier__C': 10,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

In [62]:
all_trained_pipelines_svm[idx_largest_neg].get_params()


{'memory': None,
 'steps': [('vectorizer',
   MultiColumnTfidfVectorizer(columns=['nmod', 'text_lemmatized'])),
  ('classifier', SVC(C=10, kernel='linear', random_state=42))],
 'verbose': False,
 'vectorizer': MultiColumnTfidfVectorizer(columns=['nmod', 'text_lemmatized']),
 'classifier': SVC(C=10, kernel='linear', random_state=42),
 'vectorizer__columns': ['nmod', 'text_lemmatized'],
 'classifier__C': 10,
 'classifier__break_ties': False,
 'classifier__cache_size': 200,
 'classifier__class_weight': None,
 'classifier__coef0': 0.0,
 'classifier__decision_function_shape': 'ovr',
 'classifier__degree': 3,
 'classifier__gamma': 'scale',
 'classifier__kernel': 'linear',
 'classifier__max_iter': -1,
 'classifier__probability': False,
 'classifier__random_state': 42,
 'classifier__shrinking': True,
 'classifier__tol': 0.001,
 'classifier__verbose': False}

In [50]:
all_results_svm[idx_largest_pos]

{'results_pos': (0.873015873015873,
  0.972972972972973,
  0.8925619834710744,
  0.9310344827586208),
 'results_neg': (0.6911421911421911,
  0.5655737704918032,
  0.24555160142348753,
  0.3424317617866005)}

In [51]:
all_results_svm[idx_largest_neg]

{'results_pos': (0.799373040752351,
  0.9836734693877551,
  0.8006644518272426,
  0.8827838827838828),
 'results_neg': (0.8751879699248121,
  0.6323529411764706,
  0.42574257425742573,
  0.5088757396449705)}