## Step 2: Apply classical ML models

In [1]:
from config import models_isot_path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import joblib



In [2]:
# Read the news table from disk; the 'text' column is the model input and
# the 'label' column is the target.
news = pd.read_csv("data/isot_news.csv")

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    news['text'],
    news['label'],
    test_size=0.2,
    random_state=42,
)


In [3]:
# Logistic-regression baseline: TF-IDF features (English stop words removed)
# feeding a LogisticRegression classifier with default settings.
# Pipeline.fit returns the fitted pipeline itself, so the call can be chained.
lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression()),
]).fit(X_train, y_train)

# Persist the fitted pipeline, then read it back from disk so the report
# below is produced by the saved artifact rather than the in-memory object.
lr_model_file = f'{models_isot_path}/logistic_regression.pkl'
joblib.dump(lr, lr_model_file)
pipeline = joblib.load(lr_model_file)

# Evaluate the reloaded pipeline on the held-out split.
lr_test_pred = pipeline.predict(X_test)
print(classification_report(y_test, lr_test_pred, digits=5))


              precision    recall  f1-score   support

           0    0.99139   0.98724   0.98931      4546
           1    0.98660   0.99095   0.98877      4308

    accuracy                        0.98904      8854
   macro avg    0.98899   0.98909   0.98904      8854
weighted avg    0.98905   0.98904   0.98905      8854



In [4]:
# Naive Bayes baseline: TF-IDF features (English stop words removed)
# feeding a MultinomialNB classifier with default settings.
# Pipeline.fit returns the fitted pipeline itself, so the call can be chained.
nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

# Save the fitted pipeline (vectorizer + classifier) as a single artifact.
nb_model_file = f'{models_isot_path}/naive_bayes.pkl'
joblib.dump(nb, nb_model_file)

# Score the held-out split and print per-class precision/recall/F1.
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred, digits=5))


              precision    recall  f1-score   support

           0    0.95812   0.93599   0.94692      4546
           1    0.93406   0.95682   0.94530      4308

    accuracy                        0.94613      8854
   macro avg    0.94609   0.94641   0.94611      8854
weighted avg    0.94641   0.94613   0.94614      8854



In [5]:
# Random-forest baseline: TF-IDF features (English stop words removed)
# feeding a RandomForestClassifier.
# Forest training is randomized, so random_state is pinned (same seed as the
# train/test split) to make the fitted model, the saved pickle, and the
# metrics below reproducible under Restart & Run All.
rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', RandomForestClassifier(random_state=42))
])

rf.fit(X_train, y_train)

# Save the fitted pipeline (vectorizer + classifier) as a single artifact.
joblib.dump(rf, f'{models_isot_path}/random_forest.pkl')

# Score the held-out split and print per-class precision/recall/F1.
y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred, digits=5))


              precision    recall  f1-score   support

           0    0.99227   0.98812   0.99019      4546
           1    0.98752   0.99188   0.98969      4308

    accuracy                        0.98995      8854
   macro avg    0.98989   0.99000   0.98994      8854
weighted avg    0.98996   0.98995   0.98995      8854



In [6]:
# Support-vector baseline: TF-IDF features (English stop words removed)
# feeding an SVC with a linear kernel.
# Pipeline.fit returns the fitted pipeline itself, so the call can be chained.
svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', SVC(kernel='linear')),
]).fit(X_train, y_train)

# Save the fitted pipeline (vectorizer + classifier) as a single artifact.
svm_model_file = f'{models_isot_path}/svc_linear_kernel.pkl'
joblib.dump(svm, svm_model_file)

# Score the held-out split and print per-class precision/recall/F1.
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred, digits=5))


              precision    recall  f1-score   support

           0    0.99713   0.99428   0.99570      4546
           1    0.99398   0.99698   0.99548      4308

    accuracy                        0.99560      8854
   macro avg    0.99556   0.99563   0.99559      8854
weighted avg    0.99560   0.99560   0.99560      8854

