In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
# Load the uploaded dataset
file_path = '/content/blogs.csv'
blogs_data = pd.read_csv(file_path)

In [3]:
# Display the first few rows of the dataset to understand its structure and content
blogs_data.head()


Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


# Data Exploration and Preprocessing ########################

In [4]:
# Define a text cleaning function #
def clean_text_alternate(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text by splitting on spaces
    tokens = text.split()
    # Remove stopwords using sklearn's ENGLISH_STOP_WORDS
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    return ' '.join(tokens)


In [5]:
# Apply text cleaning to the 'Data' column
blogs_data['Cleaned_Data'] = blogs_data['Data'].apply(clean_text_alternate)

In [6]:
########## Display cleaned data ###########
blogs_data[['Data', 'Cleaned_Data']]


Unnamed: 0,Data,Cleaned_Data
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,path cantaloupesrvcscmuedumagnesiumclubcccmued...
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,newsgroups altatheism path cantaloupesrvcscmue...
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,path cantaloupesrvcscmuedudasnewsharvardedunoc...
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,path cantaloupesrvcscmuedumagnesiumclubcccmued...
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,xref cantaloupesrvcscmuedu altatheism53485 tal...
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,xref cantaloupesrvcscmuedu talkabortion120945 ...
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,xref cantaloupesrvcscmuedu talkreligionmisc837...
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,xref cantaloupesrvcscmuedu talkorigins41030 ta...
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,xref cantaloupesrvcscmuedu talkreligionmisc836...


In [7]:
######### Feature Extraction ############
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limiting to 1000


In [8]:
# Apply TF-IDF transformation to the cleaned text
tfidf_features = tfidf_vectorizer.fit_transform(blogs_data['Cleaned_Data'])


In [10]:
# Convert the sparse matrix to a DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())



In [11]:
# Display a sample of the TF-IDF features
tfidf_df

Unnamed: 0,0400,10,100,11,12,13,14,15,16,17,...,xnewsreader,xref,year,years,yes,york,youll,young,youre,youve
0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.045189,0.0,0.0,0.0,0.000000,0.057218
1,0.0,0.0,0.0,0.10896,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.126042,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000
2,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.090253,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.094594,0.000000
3,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.019805,0.017902,0.128485,0.0,0.0,0.0,0.000000,0.000000
4,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.038468,0.000000,0.053003,0.000000,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.020852,0.000000,0.028730,0.068735,0.0,0.0,0.0,0.033259,0.087031
1996,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.070910,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000
1997,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.093107,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000
1998,0.0,0.0,0.0,0.00000,0.0,0.0,0.093481,0.0,0.0,0.000000,...,0.000000,0.057058,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000


# Text Classification #######################

In [12]:
# Split the data into training and test sets
X = tfidf_df
y = blogs_data['Labels']


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()


In [15]:
# Train the model on the training set
nb_classifier.fit(X_train, y_train)

In [16]:
# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

In [17]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [18]:
accuracy, report

(0.745,
 '                          precision    recall  f1-score   support\n\n             alt.atheism       0.50      0.78      0.61        18\n           comp.graphics       0.59      0.72      0.65        18\n comp.os.ms-windows.misc       0.78      0.82      0.80        22\ncomp.sys.ibm.pc.hardware       0.76      0.64      0.70        25\n   comp.sys.mac.hardware       0.75      0.71      0.73        21\n          comp.windows.x       0.83      0.76      0.79        25\n            misc.forsale       0.81      0.72      0.76        18\n               rec.autos       0.75      1.00      0.86        18\n         rec.motorcycles       0.65      0.94      0.77        16\n      rec.sport.baseball       0.74      0.94      0.83        18\n        rec.sport.hockey       0.83      1.00      0.91        15\n               sci.crypt       0.86      0.95      0.90        19\n         sci.electronics       0.53      0.50      0.52        16\n                 sci.med       0.76      0.76     

# Sentiment Analysis ##########################

In [19]:
from textblob import TextBlob

In [20]:
# Function definition to classify sentiment based on polarity
def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    elif polarity < 0:
        return "Negative"
    else:
        return "Neutral"

In [21]:
# Apply sentiment analysis to the 'Data' column
blogs_data['Sentiment'] = blogs_data['Data'].apply(get_sentiment)

In [22]:
# Examine the distribution of sentiments across different categories
sentiment_distribution = blogs_data.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)


In [23]:
# Dispaly Sentiment Distribution
sentiment_distribution


Sentiment,Negative,Positive
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
alt.atheism,23,77
comp.graphics,24,76
comp.os.ms-windows.misc,22,78
comp.sys.ibm.pc.hardware,20,80
comp.sys.mac.hardware,24,76
comp.windows.x,27,73
misc.forsale,16,84
rec.autos,17,83
rec.motorcycles,26,74
rec.sport.baseball,29,71


In [24]:
# Summarize findings
sentiment_summary = sentiment_distribution.sum(axis=1)
sentiment_distribution, sentiment_summary


(Sentiment                 Negative  Positive
 Labels                                      
 alt.atheism                     23        77
 comp.graphics                   24        76
 comp.os.ms-windows.misc         22        78
 comp.sys.ibm.pc.hardware        20        80
 comp.sys.mac.hardware           24        76
 comp.windows.x                  27        73
 misc.forsale                    16        84
 rec.autos                       17        83
 rec.motorcycles                 26        74
 rec.sport.baseball              29        71
 rec.sport.hockey                34        66
 sci.crypt                       19        81
 sci.electronics                 19        81
 sci.med                         29        71
 sci.space                       27        73
 soc.religion.christian          13        87
 talk.politics.guns              30        70
 talk.politics.mideast           22        78
 talk.politics.misc              22        78
 talk.religion.misc              1

# Evaluation of Performance ##############################

In [25]:
# Calculate metrics for the Naive Bayes classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')


In [26]:
# Display the evaluation metrics
accuracy, precision, recall, f1


(0.745, 0.7622665984377942, 0.745, 0.7381884650778477)