# Vectorizer Selection

In [None]:
import numpy as np
import torch
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
from sklearn import svm
from sklearn.linear_model import LogisticRegression

# Turn off pandas' chained-assignment check for the whole session so that
# "SettingWithCopyWarning" is not emitted.
pd.set_option("mode.chained_assignment", None)

In [None]:
# Load the cleaned affiliation table from parquet using the fastparquet engine.
# Later cells read its 'city' and 'affiliation' columns.
clean_spacy_mapaffil = pd.read_parquet("data/clean_spacy_mapaffil.parquet", engine="fastparquet") 

In [None]:
# Number of leading rows to take from the loaded table. The name is updated
# after filtering so that it then holds the number of rows that remain.
num_affiliations = 15000

In [None]:
# Work only on the first `num_affiliations` rows of the loaded table.
df = clean_spacy_mapaffil.head(num_affiliations)

# Cities that occur exactly once are dropped (presumably because the
# stratified split performed later needs at least two rows per class).
city_counts = df['city'].value_counts()
single_instance_cities = city_counts[city_counts == 1].index.tolist()

# Take an explicit copy: later cells assign columns on `filtered_df`, and doing
# that on a slice of `df` is what raises SettingWithCopyWarning.
filtered_df = df[~df['city'].isin(single_instance_cities)].copy()

# Record how many rows are actually left. Counting the rows directly stays
# correct even when the table holds fewer rows than were requested above,
# whereas subtracting from the requested size would over-report.
num_affiliations = len(filtered_df)

In [None]:
# Convert the city column to a categorical once, then reuse it both as the new
# 'city' column and as the source of the integer class codes in 'label'.
city_as_category = filtered_df['city'].astype('category')
filtered_df['city'] = city_as_category
filtered_df['label'] = city_as_category.cat.codes

In [None]:
# Fraction of rows equal to (number of distinct cities / number of rows).
calculated_test_size = filtered_df['city'].nunique() / num_affiliations

# Stratified split of affiliation text (features) and city (target); the test
# fraction is never allowed to drop below 0.1.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    filtered_df["affiliation"],
    filtered_df["city"],
    test_size=max(calculated_test_size, 0.1),
    stratify=filtered_df['label'],
    random_state=42,
)

In [None]:
# TF-IDF Vectorizer
# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    decode_error="ignore",
    stop_words="english",
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_texts)
X_test_tfidf = tfidf_vectorizer.transform(X_test_texts)

In [None]:
# Word2Vec Vectorizer
# Tokenise every affiliation string with gensim's simple_preprocess.
tokenize = gensim.utils.simple_preprocess
X_train_texts_processed = X_train_texts.apply(tokenize)
X_test_texts_processed = X_test_texts.apply(tokenize)

# Build the vocabulary from the training tokens only, then train on them.
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)
model.build_vocab(X_train_texts_processed, progress_per=100)
model.train(
    X_train_texts_processed,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

def vectorize_text(text, model):
    """Return the mean word vector of the tokens in *text*.

    Args:
        text: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is found (including when *text* is empty).
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[token] for token in text if token in keyed_vectors]
    return np.mean(known, axis=0) if known else np.zeros(model.vector_size)

# One averaged Word2Vec vector per document, stacked into an array for the
# training corpus and for the test corpus.
X_train_w2v, X_test_w2v = (
    np.array([vectorize_text(tokens, model) for tokens in corpus])
    for corpus in (X_train_texts_processed, X_test_texts_processed)
)

In [None]:
# BERT Vectorizer
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
# NOTE: this rebinds `model`, which previously referred to the Word2Vec model.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def embed_texts_bert(texts, batch_size=32):
    """Embed *texts* with the module-level BERT ``tokenizer`` and ``model``.

    The texts are encoded in mini-batches so that memory use is bounded by
    ``batch_size`` rather than by the size of the whole corpus.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.
            Must be at least 1.

    Returns:
        A 2-D NumPy array with one row per input text, holding the last hidden
        state at sequence position 0 (the [CLS] position for BERT tokenizers).
        For empty input an array of shape ``(0, hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # Nothing to encode; keep the 2-D shape so callers can still stack it.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    batch_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # padding=True pads only up to the longest text of this batch.
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():  # inference only, no gradients needed
            model_output = model(**encoded_input)
        # .cpu() is a no-op on CPU tensors and makes .numpy() valid otherwise.
        batch_embeddings.append(model_output.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(batch_embeddings, axis=0)

# Embed the raw (untokenised) affiliation strings of both splits with BERT.
X_train_bert, X_test_bert = (
    embed_texts_bert(list(split_texts))
    for split_texts in (X_train_texts, X_test_texts)
)

In [None]:
def train_and_evaluate(X_train, X_test, y_train, y_test, vectorizer_name, clf):
    """Fit *clf* on the training data and return its accuracy on the test data.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training targets.
        y_test: Test targets.
        vectorizer_name: Label of the vectorizer being evaluated. It is not
            used inside the function.
        clf: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))

In [None]:
# Evaluate TF-IDF
# Same TF-IDF features, two classifiers: logistic regression and linear SVM.
accuracy_tfidf_LogisticRegression = train_and_evaluate(
    X_train_tfidf, X_test_tfidf, y_train, y_test,
    vectorizer_name='TF-IDF', clf=LogisticRegression(),
)
accuracy_tfidf_LinearSVC = train_and_evaluate(
    X_train_tfidf, X_test_tfidf, y_train, y_test,
    vectorizer_name='TF-IDF', clf=svm.LinearSVC(dual=True),
)


In [None]:
# Evaluate Word2Vec
# Same averaged Word2Vec features, two classifiers.
accuracy_w2v_LogisticRegression = train_and_evaluate(
    X_train_w2v, X_test_w2v, y_train, y_test,
    vectorizer_name='Word2Vec', clf=LogisticRegression(),
)
accuracy_w2v_LinearSVC = train_and_evaluate(
    X_train_w2v, X_test_w2v, y_train, y_test,
    vectorizer_name='Word2Vec', clf=svm.LinearSVC(dual=True),
)

In [None]:
# Evaluate BERT
# Same BERT embeddings, two classifiers.
accuracy_bert_LogisticRegression = train_and_evaluate(
    X_train_bert, X_test_bert, y_train, y_test,
    vectorizer_name='BERT', clf=LogisticRegression(),
)
accuracy_bert_LinearSVC = train_and_evaluate(
    X_train_bert, X_test_bert, y_train, y_test,
    vectorizer_name='BERT', clf=svm.LinearSVC(dual=True),
)

In [None]:
# Pair every label with its accuracy so the two cannot drift out of order.
result_rows = [
    ('TF-IDF (Logistic Regression)', accuracy_tfidf_LogisticRegression),
    ('Word2Vec (Logistic Regression)', accuracy_w2v_LogisticRegression),
    ('BERT (Logistic Regression)', accuracy_bert_LogisticRegression),
    ('TF-IDF (LinearSVC)', accuracy_tfidf_LinearSVC),
    ('Word2Vec (LinearSVC)', accuracy_w2v_LinearSVC),
    ('BERT (LinearSVC)', accuracy_bert_LinearSVC),
]
results = pd.DataFrame(result_rows, columns=['Vectorizer', 'Accuracy'])

display(results)

# The split used max(calculated_test_size, 0.1), so report that capped value.
effective_test_size = max(calculated_test_size, 0.1)

print(f"Total number of affiliations in current dataset: {num_affiliations}")
print(f"Test Size: {effective_test_size}")
# Report the sizes of the actual split rather than re-deriving them from the
# uncapped fraction, which disagrees with the split whenever the 0.1 floor
# applied or the product was not a whole number.
print(f"# of training affiliations: {len(y_train)}")
print(f"# of test affiliations: {len(y_test)}")