In [20]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import nltk


In [34]:
print(nltk.data.path)

['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [37]:
# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# Loading dataset
file_path = '/content/blogs.csv'
data = pd.read_csv(file_path)

In [39]:
# Data Exploration
print(data.head())
print(data.info())


                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None


In [40]:
# Data Cleaning and Preprocessing
stop_words = set(stopwords.words('english'))

In [41]:
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = text.split()  # Tokenize using split
    filtered_words = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_words)


In [42]:
data['Cleaned_Text'] = data['Data'].apply(clean_text)

In [43]:
# Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Cleaned_Text'])
y = data['Labels']


In [44]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [45]:
# Naive Bayes Model
model = MultinomialNB()
model.fit(X_train, y_train)

In [46]:
# Predictions
y_pred = model.predict(X_test)


In [47]:
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.785
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.60      0.83      0.70        18
           comp.graphics       0.72      0.72      0.72        18
 comp.os.ms-windows.misc       0.75      0.95      0.84        22
comp.sys.ibm.pc.hardware       0.72      0.84      0.78        25
   comp.sys.mac.hardware       0.88      0.67      0.76        21
          comp.windows.x       1.00      0.24      0.39        25
            misc.forsale       0.74      0.78      0.76        18
               rec.autos       0.77      0.94      0.85        18
         rec.motorcycles       0.81      0.81      0.81        16
      rec.sport.baseball       0.83      0.83      0.83        18
        rec.sport.hockey       0.65      1.00      0.79        15
               sci.crypt       0.68      1.00      0.81        19
         sci.electronics       0.75      0.56      0.64        16
                 sci.med       0.88 

In [48]:
# Sentiment Analysis
def get_sentiment(text):
    analysis = TextBlob(text).sentiment
    if analysis.polarity > 0:
        return 'Positive'
    elif analysis.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [49]:
data['Sentiment'] = data['Data'].apply(get_sentiment)

In [50]:
# Sentiment Distribution by Category
sentiment_distribution = data.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack()
print("Sentiment Distribution by Category:")
print(sentiment_distribution)

Sentiment Distribution by Category:
Sentiment                 Negative  Positive
Labels                                      
alt.atheism                   0.23      0.77
comp.graphics                 0.24      0.76
comp.os.ms-windows.misc       0.22      0.78
comp.sys.ibm.pc.hardware      0.20      0.80
comp.sys.mac.hardware         0.24      0.76
comp.windows.x                0.27      0.73
misc.forsale                  0.16      0.84
rec.autos                     0.17      0.83
rec.motorcycles               0.26      0.74
rec.sport.baseball            0.29      0.71
rec.sport.hockey              0.34      0.66
sci.crypt                     0.19      0.81
sci.electronics               0.19      0.81
sci.med                       0.29      0.71
sci.space                     0.27      0.73
soc.religion.christian        0.13      0.87
talk.politics.guns            0.30      0.70
talk.politics.mideast         0.22      0.78
talk.politics.misc            0.22      0.78
talk.religion.misc 

In [51]:
# Save the processed data with sentiments
data.to_csv('/content/blogs_with_sentiments.csv', index=False)