In [None]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Load the raw dataset. The CSV location is named once so it is easy to spot and change.
DATA_PATH = '/content/domain_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preprocessing
# Lowercasing
# pandas reads empty CSV cells as NaN (a float), which has no .lower(). Treat
# missing text as an empty document, force every value to str so the later
# regex/tokenizer steps always receive strings, and lowercase with the
# vectorized .str accessor instead of a per-row lambda.
df['text'] = df['text'].fillna('').astype(str).str.lower()


In [None]:
# Removing Punctuation
# Anything that is neither a word character nor whitespace is deleted.
punctuation_pattern = re.compile(r'[^\w\s]')
df['text'] = df['text'].map(lambda document: punctuation_pattern.sub('', document))

In [None]:
# Tokenization
# word_tokenize depends on the Punkt sentence model. Newer NLTK releases look
# it up under 'punkt_tab' while older ones use 'punkt', so request both;
# nltk.download() reports a failed download instead of raising by default.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)
# Each document becomes a list of token strings.
df['text'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Removing Stopwords
nltk.download('stopwords')
# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return the tokens that are not English stopwords, order preserved."""
    return [token for token in tokens if token not in stop_words]


df['text'] = df['text'].map(_drop_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Re-assemble each token list into one space-separated string
# (TfidfVectorizer, used later, takes raw strings as input).
df['text'] = df['text'].map(' '.join)

In [None]:
# Keep only the text and label columns and expose the label column as 'domain'.
# rename() returns a new, independent frame, so later column assignments on df
# do not trigger SettingWithCopyWarning the way assigning to a column subset can.
df = df[['text', 'labels']].rename(columns={'labels': 'domain'})

In [None]:
# Encode labels
label_dict = {'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}
encoded_domain = df['domain'].map(label_dict)

# Series.map yields NaN for every value that is not a key of label_dict (an
# unexpected class name, a missing label, or already-encoded integers when this
# cell is re-run). Fail here with the offending values instead of passing NaN
# targets on to the classifier.
unmapped_labels = df.loc[encoded_domain.isna(), 'domain'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels not present in label_dict: {list(unmapped_labels)}")

df['domain'] = encoded_domain

In [None]:
# Feature extraction and train/test split
# Split the documents first and fit TF-IDF on the training portion only, so the
# vocabulary and IDF weights are never computed from held-out documents.
# Fitting the vectorizer on the full corpus before splitting leaks test-set
# statistics into the features and inflates the evaluation.
y = df['domain']
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full-corpus matrix kept under its original name for any cell that still uses X.
X = tfidf_vectorizer.transform(df['text'])


In [None]:
# Model training (using Random Forest)
# Hyper-parameters are gathered in one place; n_estimators is the usual knob to tune.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split.
y_pred = model.predict(X_test)

# Pass the class codes explicitly together with their names. Without `labels`,
# classification_report infers the classes from y_test/y_pred and raises when a
# class is absent (name count no longer matches), and the name-to-row pairing
# silently relies on the dict order matching the sorted codes.
class_names = list(label_dict)
class_codes = [label_dict[name] for name in class_names]
print(classification_report(y_test, y_pred, labels=class_codes, target_names=class_names))

               precision    recall  f1-score   support

     business       0.92      0.86      0.89       557
entertainment       0.93      0.82      0.87       420
     politics       0.88      0.90      0.89       502
        sport       0.84      0.97      0.90       585
         tech       0.92      0.89      0.91       508

     accuracy                           0.89      2572
    macro avg       0.90      0.89      0.89      2572
 weighted avg       0.90      0.89      0.89      2572



In [None]:
# Prediction (example)
def preprocess_text(text):
    """Apply the same cleaning used on the training column to one raw string.

    Lowercases, strips punctuation, tokenizes, drops English stopwords and
    re-joins the tokens, so new text is tokenized the same way as the
    training documents (e.g. "don't" -> "dont" in both cases).
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(word for word in word_tokenize(cleaned) if word not in stop_words)


new_text = ["A new gadget was launched yesterday."]
new_text_features = tfidf_vectorizer.transform([preprocess_text(text) for text in new_text])
predicted_label = model.predict(new_text_features)

# Look the class name up by its code rather than by position in the dict, so the
# result stays correct even if label_dict's codes are not 0..n-1 in insertion order.
code_to_domain = {code: name for name, code in label_dict.items()}
print("Predicted domain:", code_to_domain[int(predicted_label[0])])

Predicted domain: tech
