In [10]:
import pandas as pd

# Load the dataset
data = pd.read_csv("C:\\Users\\hp\\Documents\\abcnews-date-text.csv")

# Print column names and unique values
print("Columns present in the dataset:")
print(data.columns)

# Identify the correct column for categories
print("\nPossible columns for categories:")
for column in data.columns:
    print(column, data[column].nunique())

# You can manually inspect the output and identify the correct column for categories.


Columns present in the dataset:
Index(['publish_date', 'headline_text'], dtype='object')

Possible columns for categories:
publish_date 6882
headline_text 1213004


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv("C:\\Users\\hp\\Documents\\abcnews-date-text.csv")

# Create a new column 'category' based on the year extracted from 'publish_date'
data['category'] = pd.to_datetime(data['publish_date']).dt.year

# Data Exploration
print("Categories present in the dataset:")
print(data['category'].unique())

print("\nDistribution of articles across different topics:")
print(data['category'].value_counts())

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data['headline_text'], data['category'], test_size=0.2, random_state=42)

# Bag-of-Words (BoW) with unigrams and bigrams
vectorizer_unigram = CountVectorizer()
X_train_unigram = vectorizer_unigram.fit_transform(train_data)
X_test_unigram = vectorizer_unigram.transform(test_data)

vectorizer_bigram = CountVectorizer(ngram_range=(2, 2))
X_train_bigram = vectorizer_bigram.fit_transform(train_data)
X_test_bigram = vectorizer_bigram.transform(test_data)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data)
X_test_tfidf = tfidf_vectorizer.transform(test_data)

# Build a Naive Bayes classifier for each representation
nb_classifier_unigram = MultinomialNB()
nb_classifier_unigram.fit(X_train_unigram, train_labels)
predictions_unigram = nb_classifier_unigram.predict(X_test_unigram)
accuracy_unigram = accuracy_score(test_labels, predictions_unigram)
print("\nAccuracy using BoW with unigrams:", accuracy_unigram)

nb_classifier_bigram = MultinomialNB()
nb_classifier_bigram.fit(X_train_bigram, train_labels)
predictions_bigram = nb_classifier_bigram.predict(X_test_bigram)
accuracy_bigram = accuracy_score(test_labels, predictions_bigram)
print("Accuracy using BoW with bigrams:", accuracy_bigram)

nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, train_labels)
predictions_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(test_labels, predictions_tfidf)
print("Accuracy using TF-IDF:", accuracy_tfidf)

# Insights and Recommendations
print("\nInsights and Recommendations:")
print("1. Bag-of-Words (BoW) with unigrams and bigrams:")
print("   - BoW with unigrams accuracy:", accuracy_unigram)
print("   - BoW with bigrams accuracy:", accuracy_bigram)
print("\n2. TF-IDF:")
print("   - TF-IDF accuracy:", accuracy_tfidf)
print("\n3. Recommendations:")
print("   - Depending on dataset characteristics, consider a combination of techniques.")
print("   - Experiment with different feature engineering strategies.")
print("   - Regularly update the model with new data to maintain relevance and accuracy.")


Categories present in the dataset:
[1970]

Distribution of articles across different topics:
category
1970    1244184
Name: count, dtype: int64

Accuracy using BoW with unigrams: 1.0
Accuracy using BoW with bigrams: 1.0
Accuracy using TF-IDF: 1.0

Insights and Recommendations:
1. Bag-of-Words (BoW) with unigrams and bigrams:
   - BoW with unigrams accuracy: 1.0
   - BoW with bigrams accuracy: 1.0

2. TF-IDF:
   - TF-IDF accuracy: 1.0

3. Recommendations:
   - Depending on dataset characteristics, consider a combination of techniques.
   - Experiment with different feature engineering strategies.
   - Regularly update the model with new data to maintain relevance and accuracy.
