# Classifier Selection

In [None]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Disable pandas' chained-assignment check so that "SettingWithCopyWarning"
# is not emitted when later cells assign columns on a filtered frame.
pd.set_option("mode.chained_assignment", None)

In [None]:
# Load the affiliation records from disk (presumably the spaCy-cleaned
# MapAffil data, judging by the file name). The cells below read its
# "city" and "affiliation" columns.
clean_spacy_mapaffil = pd.read_parquet(
    path="data/clean_spacy_mapaffil.parquet",
    engine="fastparquet",
)

In [None]:
# Number of leading rows of the dataset to use for the classifier comparison.
num_affiliations = 15_000

In [None]:
# Work on the first `num_affiliations` rows only.
df = clean_spacy_mapaffil.head(num_affiliations)

# Cities that occur exactly once cannot be placed in both the training and
# the test partition of the stratified split performed later, so drop them.
city_counts = df['city'].value_counts()
single_instance_cities = city_counts[city_counts == 1].index.tolist()

# Take an explicit copy: later cells add/overwrite columns on `filtered_df`,
# and writing into a slice of `df` is what triggers SettingWithCopyWarning.
filtered_df = df[~df['city'].isin(single_instance_cities)].copy()

# Record how many rows were actually kept. Reading it from the frame (rather
# than subtracting from the requested count) stays correct when the source
# holds fewer rows than requested, because `head()` then returns fewer rows.
# NOTE: this rebinds `num_affiliations`; re-run the cell that sets it before
# re-running this one, otherwise a smaller sample is taken.
num_affiliations = len(filtered_df)

In [None]:
# Convert the city names to a pandas categorical once, then store both the
# categorical column and its integer category codes (the codes are used as
# the stratification key by the train/test split below).
city_as_category = filtered_df['city'].astype('category')
filtered_df['city'] = city_as_category
filtered_df['label'] = city_as_category.cat.codes

In [None]:
# A stratified split needs at least one test sample per class, so the test
# fraction must be no smaller than (number of classes / number of rows).
# The row count is read from the frame being split, so the ratio stays
# correct even if `num_affiliations` no longer matches it (e.g. after a cell
# was re-run or the dataset had fewer rows than requested).
calculated_test_size = filtered_df['city'].nunique() / len(filtered_df)

# Hold out at least 10% of the rows, or more when the class count demands it.
# Texts are the "affiliation" strings, targets are the city names, and the
# integer `label` codes drive the stratification.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    filtered_df["affiliation"],
    filtered_df["city"],
    test_size=max(calculated_test_size, 0.1),
    stratify=filtered_df['label'],
    random_state=42,
)

In [None]:
# Turn the affiliation strings into TF-IDF feature vectors, dropping the
# built-in English stop words and ignoring undecodable bytes.
tfidf_vectorizer = TfidfVectorizer(
    decode_error="ignore",
    stop_words="english",
)

# Learn the vocabulary and IDF weights from the training texts only, then
# reuse that fitted vectorizer to encode the test texts.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_texts)
X_test_tfidf = tfidf_vectorizer.transform(X_test_texts)

In [None]:
# Candidate models to compare on the same TF-IDF features.
# RandomForestClassifier and LinearSVC(dual=True) draw random numbers while
# fitting, so they are seeded with the same value as the train/test split;
# otherwise the accuracy table changes from run to run.
candidate_classifiers = (
    svm.LinearSVC(dual=True, random_state=42),
    RandomForestClassifier(random_state=42),
    LogisticRegression(),
    MultinomialNB(),
)

# One [classifier description, accuracy] row per candidate.
accuracy_data = []

for classifier in candidate_classifiers:
    # Train on the training matrix, then score predictions on the held-out set.
    classifier.fit(X_train_tfidf, y_train)
    y_pred = classifier.predict(X_test_tfidf)
    # str(classifier) shows the estimator with its non-default parameters,
    # which labels the row with the exact configuration that was evaluated.
    accuracy_data.append([str(classifier), accuracy_score(y_test, y_pred)])

accuracy_results = pd.DataFrame(accuracy_data, columns=['Classifier', 'Accuracy'])

display(accuracy_results)