In [153]:
# importig libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
import warnings
warnings.filterwarnings("ignore")
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [155]:
# importing the dataset
df = pd.read_csv("blogs.csv")
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [157]:
df.info() # gives the info of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [159]:
df['Labels'].value_counts() 

Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: count, dtype: int64

In [161]:
# data preprocessing
def clean_text(text):
    text=text.lower()
    text=re.sub('[\.*?/]','',text)
    text=re.sub('[%s]' %re.escape(string.punctuation),'',text)
    text=re.sub('w*\d\w*','',text)
    text=re.sub('[0-9]','',text)
    text=re.sub('[''""..,]','',text)
    return text
df['Data']=df['Data'].apply(clean_text)
df

Unnamed: 0,Data,Labels
0,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
1,newsgroups altatheism\npath cantaloupesrvcscmu...,alt.atheism
2,path cantaloupesrvcscmuedudasnewsharvardedunoc...,alt.atheism
3,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
4,xref cantaloupesrvcscmuedu altatheism talkreli...,alt.atheism
...,...,...
1995,xref cantaloupesrvcscmuedu talkabortion altath...,talk.religion.misc
1996,xref cantaloupesrvcscmuedu talkreligionmisc ta...,talk.religion.misc
1997,xref cantaloupesrvcscmuedu talkorigins talkrel...,talk.religion.misc
1998,xref cantaloupesrvcscmuedu talkreligionmisc al...,talk.religion.misc


In [162]:
# tokenizing
df['Data']=df['Data'].apply(word_tokenize)
df

Unnamed: 0,Data,Labels
0,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
1,"[newsgroups, altatheism, path, cantaloupesrvcs...",alt.atheism
2,"[path, cantaloupesrvcscmuedudasnewsharvardedun...",alt.atheism
3,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
4,"[xref, cantaloupesrvcscmuedu, altatheism, talk...",alt.atheism
...,...,...
1995,"[xref, cantaloupesrvcscmuedu, talkabortion, al...",talk.religion.misc
1996,"[xref, cantaloupesrvcscmuedu, talkreligionmisc...",talk.religion.misc
1997,"[xref, cantaloupesrvcscmuedu, talkorigins, tal...",talk.religion.misc
1998,"[xref, cantaloupesrvcscmuedu, talkreligionmisc...",talk.religion.misc


In [164]:
# removing the stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['Data']=df['Data'].apply(lambda tokens:[word for word in tokens if word not in stop])
df

Unnamed: 0,Data,Labels
0,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
1,"[newsgroups, altatheism, path, cantaloupesrvcs...",alt.atheism
2,"[path, cantaloupesrvcscmuedudasnewsharvardedun...",alt.atheism
3,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
4,"[xref, cantaloupesrvcscmuedu, altatheism, talk...",alt.atheism
...,...,...
1995,"[xref, cantaloupesrvcscmuedu, talkabortion, al...",talk.religion.misc
1996,"[xref, cantaloupesrvcscmuedu, talkreligionmisc...",talk.religion.misc
1997,"[xref, cantaloupesrvcscmuedu, talkorigins, tal...",talk.religion.misc
1998,"[xref, cantaloupesrvcscmuedu, talkreligionmisc...",talk.religion.misc


In [166]:
# Converting list of tokens to string
df['Data'] = df['Data'].apply(lambda tokens: ' '.join(tokens))

In [169]:
# feature extraction
# Feature Extraction using TF-IDF
x= df['Data']  # dividing dependent and inependent
y = df['Labels']
tfidf = TfidfVectorizer(stop_words='english', max_features= 1000, max_df = 0.5, smooth_idf=True)  
x_tfidf = tfidf.fit_transform(x)
names_features = tfidf.get_feature_names_out()
dense = x_tfidf.todense()
denselist = dense.tolist()
df1 = pd.DataFrame(denselist, columns = names_features)
df1


Unnamed: 0,able,accept,access,according,account,act,action,actions,actually,add,...,xref,yeah,year,years,yes,york,youll,young,youre,youve
0,0.0,0.053948,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.045443,0.0,0.0,0.0,0.000000,0.057540
1,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.110066,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000
2,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.107898,0.000000,0.000000,...,0.000000,0.332519,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.082467,0.000000
3,0.0,0.025356,0.0,0.0,0.024553,0.026494,0.133537,0.108174,0.040108,0.000000,...,0.000000,0.000000,0.019753,0.017855,0.128149,0.0,0.0,0.0,0.000000,0.000000
4,0.0,0.226967,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.038665,0.000000,0.000000,0.053275,0.000000,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.000000,0.0,0.0,0.000000,0.042786,0.000000,0.000000,0.032387,0.000000,...,0.020927,0.000000,0.000000,0.028835,0.068985,0.0,0.0,0.0,0.033380,0.087348
1996,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.071049,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000
1997,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.095749,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000
1998,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.178642,0.112933,...,0.057717,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000


In [171]:
# Splitting the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=100)

In [173]:
# Naive Bayes Model
nb= MultinomialNB()
nb.fit(x_train, y_train)

In [175]:
# predicting and evaluating on test data
# Predictions
y_pred = nb.predict(x_test)
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.76      0.74        17
           comp.graphics       0.95      0.82      0.88        22
 comp.os.ms-windows.misc       0.87      0.87      0.87        23
comp.sys.ibm.pc.hardware       0.75      0.75      0.75        20
   comp.sys.mac.hardware       0.94      0.89      0.91        18
          comp.windows.x       0.85      0.85      0.85        20
            misc.forsale       0.78      0.90      0.84        20
               rec.autos       0.83      0.83      0.83        24
         rec.motorcycles       0.94      1.00      0.97        15
      rec.sport.baseball       0.85      1.00      0.92        17
        rec.sport.hockey       0.96      0.85      0.90        26
               sci.crypt       0.90      0.95      0.93        20
         sci.electronics       0.56      0.67      0.61        15
                 sci.med       0.89      0.89      0.89        19
         

In [None]:
# The TF-IDF vectorizer was effective in transforming textual data into numerical features.
#The classifier  Naive Bayes  performed well in categories with clear sentiment polarity.

In [177]:
#3. Sentiment Analysis
sent = SentimentIntensityAnalyzer()
sentiments = []
for post in df['Data']:
    sentiment_score = sent.polarity_scores(post)
    if sentiment_score['compound'] > 0:
        sentiment = 'positive'
    elif sentiment_score['compound'] < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    sentiments.append(sentiment)

df['Sentiment'] = sentiments

In [179]:
# Evaluating sentiment distribution across different categories
sentiment_category_distribution = df.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print(sentiment_category_distribution)

Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                     40        1        59
comp.graphics                   10        2        88
comp.os.ms-windows.misc         21        2        77
comp.sys.ibm.pc.hardware        19        1        80
comp.sys.mac.hardware           19        2        79
comp.windows.x                  19        2        79
misc.forsale                     9        8        83
rec.autos                       26        1        73
rec.motorcycles                 33        1        66
rec.sport.baseball              24        1        75
rec.sport.hockey                29        1        70
sci.crypt                       24        0        76
sci.electronics                 12        3        85
sci.med                         28        1        71
sci.space                       29        3        68
soc.religion.christian          29        0        71
talk.politics.guns          

In [None]:
# Most categories show a positive sentiment majority.
# Only politics-related categories exhibit higher negative sentiment.