In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import cross_validate, KFold

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from utils import train_test

## Preprocess data

In [None]:
# Load the preprocessed dataset from the Feather file sitting next to the notebook.
feather_path = 'processed.feather'
df = pd.read_feather(feather_path)

# Quick sanity check on the number of rows and columns that were read.
print(f"Data frame shape: {df.shape}")

In [None]:
# Drop incomplete rows first so the per-category counts below are computed on
# the rows that can actually be used for training and evaluation.
df = df.dropna()

# Categories with fewer than 500 remaining samples are excluded from the task.
category_counts = df['Category'].value_counts()
exclude_cols = [k for k, v in category_counts.items() if v < 500]
data = df[~df['Category'].isin(exclude_cols)].copy()

# 80/20 shuffled split; the fixed random_state keeps it reproducible across runs.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

print(f"train_df shape: {train_df.shape}")
print(f"test_df shape: {test_df.shape}")

## Train - Test

In [None]:
# Text column fed to the vectorizer; labels are read from the 'Category' column.
feature_col = 'Content'

X_train, y_train = train_df[feature_col], train_df['Category']
X_test, y_test = test_df[feature_col], test_df['Category']

# The TF-IDF vocabulary and IDF weights are learned from the training text only;
# the test text is then projected onto that same fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Both matrices must share the same number of columns (vocabulary size).
print(X_train_vectorized.shape, X_test_vectorized.shape)

### Experiments

In [None]:
# Multinomial Naive Bayes: 3-fold cross-validation on the TF-IDF training features.
model = MultinomialNB()
kfold = KFold(n_splits=3)

scoring_metrics = ['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy']
scores = cross_validate(
    model,
    X_train_vectorized,
    y_train,
    cv=kfold,
    scoring=scoring_metrics,
    n_jobs=-1,
    verbose=2,
)

# Transposed so each row is one entry returned by cross_validate and each
# column is one fold; a final column holds the mean across folds.
scores_df = pd.DataFrame(scores).transpose()
scores_df = scores_df.assign(mean=scores_df.mean(axis=1))
scores_df

In [None]:
# Decision tree: 3-fold cross-validation on the TF-IDF training features.
# The estimator is seeded because tree construction is randomized when
# random_state is left unset, which would make the scores vary between runs.
model = DecisionTreeClassifier(random_state=42)

kfold = KFold(n_splits=3)

scores = cross_validate(
    model,
    X_train_vectorized,
    y_train,
    cv=kfold,
    scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'],
    n_jobs=-1,
    verbose=2,
)

# Transposed so each row is one entry returned by cross_validate and each
# column is one fold; the extra column holds the mean across folds.
scores_df = pd.DataFrame(scores).T
scores_df['mean'] = scores_df.mean(axis=1)
scores_df

In [None]:
# Random forest: 3-fold cross-validation on the TF-IDF training features.
# The estimator is seeded because bootstrapping and feature sub-sampling are
# random; without a fixed random_state the scores change from run to run.
model = RandomForestClassifier(random_state=42)

kfold = KFold(n_splits=3)

scores = cross_validate(
    model,
    X_train_vectorized,
    y_train,
    cv=kfold,
    scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'],
    n_jobs=-1,
    verbose=2,
)

# Transposed so each row is one entry returned by cross_validate and each
# column is one fold; the extra column holds the mean across folds.
scores_df = pd.DataFrame(scores).T
scores_df['mean'] = scores_df.mean(axis=1)
scores_df

In [None]:
# Gradient boosting: 3-fold cross-validation on the TF-IDF training features.
# The estimator is seeded so that its internal randomness does not make the
# reported scores differ between otherwise identical runs.
model = GradientBoostingClassifier(random_state=42)

kfold = KFold(n_splits=3)

scores = cross_validate(
    model,
    X_train_vectorized,
    y_train,
    cv=kfold,
    scoring=['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'],
    n_jobs=-1,
    verbose=2,
)

# Transposed so each row is one entry returned by cross_validate and each
# column is one fold; the extra column holds the mean across folds.
scores_df = pd.DataFrame(scores).T
scores_df['mean'] = scores_df.mean(axis=1)
scores_df

In [None]:
# Logistic regression: 3-fold cross-validation on the TF-IDF training features.
# max_iter is raised to 1000 to give the solver more iterations than the default.
model = LogisticRegression(max_iter=1000)
kfold = KFold(n_splits=3)

scoring_metrics = ['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy']
scores = cross_validate(
    model,
    X_train_vectorized,
    y_train,
    cv=kfold,
    scoring=scoring_metrics,
    n_jobs=-1,
    verbose=2,
)

# Transposed so each row is one entry returned by cross_validate and each
# column is one fold; a final column holds the mean across folds.
scores_df = pd.DataFrame(scores).transpose()
scores_df = scores_df.assign(mean=scores_df.mean(axis=1))
scores_df

In [None]:
# Held-out evaluation of logistic regression. max_iter=1000 matches the
# configuration used in the cross-validation cell, so the model evaluated here
# is the same one that was validated (the default iteration cap is lower and
# the solver may stop before converging).
# NOTE(review): train_test is imported from the local utils module; presumably
# it fits on the training split and reports metrics on the test split - confirm.
model = LogisticRegression(max_iter=1000)
train_test(model, X_train_vectorized, X_test_vectorized, y_train, y_test)

In [None]:
# Support vector classifier (default settings): 3-fold cross-validation on the
# TF-IDF training features.
model = SVC()
kfold = KFold(n_splits=3)

scoring_metrics = ['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy']
scores = cross_validate(
    model,
    X_train_vectorized,
    y_train,
    cv=kfold,
    scoring=scoring_metrics,
    n_jobs=-1,
    verbose=2,
)

# Transposed so each row is one entry returned by cross_validate and each
# column is one fold; a final column holds the mean across folds.
scores_df = pd.DataFrame(scores).transpose()
scores_df = scores_df.assign(mean=scores_df.mean(axis=1))
scores_df