In [2]:
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import joblib

#NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

#Loading dataset
df = pd.read_csv('../dataset/domain_classification_dataset.csv')

print(f"Loaded {len(df)} samples")
print("Distribution:\n", df['category'].value_counts())

#Cleaning text function
def clean_text(text):
    if isinstance(text, float) or text is None:
        text = ""
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

#Applying cleaning
print("Cleaning text...")
df['cleaned_text'] = df['text'].apply(clean_text)


#TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=30000,
    stop_words='english'
)

X = vectorizer.fit_transform(df['cleaned_text'])
y = df['category']

print(f"TF-IDF matrix shape: {X.shape}")

#Spliting data into train[80%] and test[20%]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y 
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

#Train Logistic Regression
print("Training model...")
base_svc = LinearSVC(C=0.5, class_weight='balanced', max_iter=2000)
model = CalibratedClassifierCV(base_svc, cv=3)
model.fit(X_train, y_train)

#Prediction on test set
y_pred = model.predict(X_test)

#Accuracy Calculation
accuracy = accuracy_score(y_test, y_pred)
print("\n" + "="*50)
print(f"MODEL ACCURACY: {accuracy * 100:.2f}%")
print("="*50)

#Saving model and vectorizer
save_path_model = r'C:\Users\ishfa\OneDrive\Documents\MU 6TH SEM\AI\AI Final Project\News_Domain_Classification\app\domain_classifier_model.pkl'
save_path_vectorizer = r'C:\Users\ishfa\OneDrive\Documents\MU 6TH SEM\AI\AI Final Project\News_Domain_Classification\app\tfidf_vectorizer.pkl'

joblib.dump(model, save_path_model)
joblib.dump(vectorizer, save_path_vectorizer)


print(f"\nModel saved to: {save_path_model}")
print(f"Vectorizer saved to: {save_path_vectorizer}")
print("Training complete!")

Loaded 250000 samples
Distribution:
 category
ENTERTAINMENT     25000
POLITICS          25000
BUSINESS          25000
TECH              25000
TRAVEL            25000
SCIENCE           25000
FOOD & DRINK      25000
SPORTS            25000
WORLD NEWS        25000
HEALTHY LIVING    25000
Name: count, dtype: int64
Cleaning text...
TF-IDF matrix shape: (250000, 30000)
Training samples: 200000
Test samples: 50000
Training model...

MODEL ACCURACY: 85.69%

Model saved to: C:\Users\ishfa\OneDrive\Documents\MU 6TH SEM\AI\AI Final Project\News_Domain_Classification\app\domain_classifier_model.pkl
Vectorizer saved to: C:\Users\ishfa\OneDrive\Documents\MU 6TH SEM\AI\AI Final Project\News_Domain_Classification\app\tfidf_vectorizer.pkl
Training complete!
