# Natural Language Processing & Naive Bayes

## Data Exploration & Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords

In [2]:
# load dataset
df = pd.read_csv('blogs.csv')
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [3]:
# structure
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [5]:
import re
import string

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
df['processed_text']=df['Data'].apply(clean_text)

In [10]:
# tf-idf vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_text'])
y = df['Labels']

## Naive Bayes model for text classification

In [12]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# naive bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
# Evaluation
y_pred = nb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.54      0.83      0.65        18
           comp.graphics       0.83      0.83      0.83        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.73      0.76      0.75        25
   comp.sys.mac.hardware       0.82      0.86      0.84        21
          comp.windows.x       0.91      0.84      0.88        25
            misc.forsale       0.78      0.78      0.78        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.94      0.94      0.94        16
      rec.sport.baseball       0.77      0.94      0.85        18
        rec.sport.hockey       0.94      1.00      0.97        15
               sci.crypt       0.95      0.95      0.95        19
         sci.electronics       0.59      0.62      0.61        16
                 sci.med       0.88      0.88      0.88        17
         

## Sentiment Analysis

In [14]:
# Sentiment Analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    scores = analyzer.polarity_scores(text)
    compound_score = scores['compound']
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [16]:
df['Sentiment'] = df['Data'].apply(analyze_sentiment)

sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

Sentiment
Positive    1334
Negative     631
Neutral       35
Name: count, dtype: int64


In [17]:
# distibutions across categories
category_sentiment_counts = df.groupby('Labels')['Sentiment'].value_counts()
print(category_sentiment_counts)

Labels                    Sentiment
alt.atheism               Positive     57
                          Negative     42
                          Neutral       1
comp.graphics             Positive     83
                          Negative     13
                          Neutral       4
comp.os.ms-windows.misc   Positive     74
                          Negative     24
                          Neutral       2
comp.sys.ibm.pc.hardware  Positive     79
                          Negative     21
comp.sys.mac.hardware     Positive     73
                          Negative     24
                          Neutral       3
comp.windows.x            Positive     78
                          Negative     20
                          Neutral       2
misc.forsale              Positive     85
                          Neutral       8
                          Negative      7
rec.autos                 Positive     72
                          Negative     27
                          Neutral       

## Evaluation



*   Accuracy of 83% suggests we have developed a good model, while precision varies throught different categories by huge percentage.

*   Clearly positives in our document outnumbered negative reviews, The only label that positives less than or equal to negatives is politics.

