In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _, file_names in os.walk(input_root):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

# Function to prepare data for scikit-learn


In [2]:
def prepare_sklearn_data(dataset_split):
    """Pull the text and label columns out of one dataset split.

    Args:
        dataset_split: An object that supports lookup by the keys
            ``'text'`` and ``'label'``.

    Returns:
        A 2-tuple ``(texts, labels)`` containing whatever is stored under
        ``dataset_split['text']`` and ``dataset_split['label']``, returned
        as-is without copying or conversion.
    """
    return dataset_split['text'], dataset_split['label']

# Load the data

In [3]:
from datasets import load_dataset

# Load the text classification dataset identified by the name "ag_news".
# In the run recorded below this produced a 120,000-example train split and a
# 7,600-example test split; both are accessed later by the keys 'train' and 'test'.
text_classification_dataset = load_dataset("ag_news")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 8.07k/8.07k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 18.6M/18.6M [00:13<00:00, 1.37MB/s]
Downloading data: 100%|██████████| 1.23M/1.23M [00:01<00:00, 925kB/s]
Generating train split: 100%|██████████| 120000/120000 [00:00<00:00, 673802.72 examples/s]
Generating test split: 100%|██████████| 7600/7600 [00:00<00:00, 499956.25 examples/s]


# Get the data


In [4]:
# Pull each split out by name, then unpack it into its texts and labels.
train_split = text_classification_dataset['train']
train_texts, train_labels = prepare_sklearn_data(train_split)

test_split = text_classification_dataset['test']
test_texts, test_labels = prepare_sklearn_data(test_split)

# Create a pipeline


In [5]:
# Text features: TF-IDF over unigrams and bigrams, with the vocabulary capped at 50,000 entries.
tfidf_step = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))

# Linear SVM classifier with C=1, configured with dual=False.
svm_step = LinearSVC(C=1, dual=False)

# Chain the two steps so raw text goes in and class predictions come out.
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('classifier', svm_step),
])

# Train the model


In [6]:
# Announce the start of training so the (potentially slow) fit is visible in the output.
print("Training TF-IDF + SVM model...")
# Fit the whole pipeline (vectorizer, then classifier) on the training texts and labels.
pipeline.fit(train_texts, train_labels)

Training TF-IDF + SVM model...


# Evaluate


In [7]:
print("Evaluating model...")
predictions = pipeline.predict(test_texts)
accuracy = accuracy_score(test_labels, predictions)

# Human-readable class names taken from the dataset's label feature.
# Assumes integer label i corresponds to class_names[i] — the original call
# relied on the same alignment.
class_names = text_classification_dataset['train'].features['label'].names

# Pin the label ids explicitly. Without `labels`, the report infers the label
# set from the values present in test_labels/predictions, which stops matching
# `target_names` if any class is absent from both.
report = classification_report(
    test_labels,
    predictions,
    labels=list(range(len(class_names))),
    target_names=class_names,
)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Evaluating model...
Accuracy: 0.9238
Classification Report:
              precision    recall  f1-score   support

       World       0.94      0.91      0.93      1900
      Sports       0.96      0.98      0.97      1900
    Business       0.89      0.89      0.89      1900
    Sci/Tech       0.90      0.91      0.91      1900

    accuracy                           0.92      7600
   macro avg       0.92      0.92      0.92      7600
weighted avg       0.92      0.92      0.92      7600



# Save the model

In [8]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) to the working directory.
# The path is kept in one variable so the file written and the message printed cannot drift apart.
model_path = 'tfidf_svm_model.pkl'
joblib.dump(pipeline, model_path)
print(f"Model saved as '{model_path}'")

Model saved as 'tfidf_svm_model.pkl'
