In [1]:
import pandas as pd

data = pd.read_csv("preprocessed_kindle_review .csv")
df = data
df.head()

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary
0,0,5,This book was the very first bookmobile book I...,50 + years ago...
1,1,1,"When I read the description for this book, I c...",Boring! Boring! Boring!
2,2,5,I just had to edit this review. This book is a...,Wiggleliscious/new toy ready/!!
3,3,5,I don't normally buy 'mystery' novels because ...,Very good read.
4,4,5,"This isn't the kind of book I normally read, a...",Great Story!


In [2]:
df.dropna(subset=['reviewText'], inplace=True)

In [3]:
import re

def sentiment_label(rating):
    if rating >= 4:
        return 1
    elif rating == 3:
        return 0
    else:
        return -1

df['sentiment'] = df['rating'].apply(sentiment_label)

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text) 
    return text

df['cleaned_text'] = df['reviewText'].apply(clean_text)


In [5]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
def word_tokenize(text, language="english", preserve_line=False):
    df['tokens'] = df['cleaned_text'].apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/parthharpale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

df['tokens'] = df['cleaned_text'].apply(lambda x: [word for word in x if word not in stop_words])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parthharpale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

df['stemmed'] = df['cleaned_text'].apply(lambda x: [stemmer.stem(word) for word in x])
df['lemmatized'] = df['cleaned_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/parthharpale/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['cleaned_text'])

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['cleaned_text'])

In [10]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(sentences=df['cleaned_text'], vector_size=100, window=5, min_count=1)

In [11]:
from sklearn.model_selection import train_test_split

X = X_tfidf 
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.76


In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Accuracy: 0.68625


In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

          -1       0.73      0.66      0.69       799
           0       0.00      0.00      0.00       412
           1       0.67      0.94      0.78      1189

    accuracy                           0.69      2400
   macro avg       0.47      0.53      0.49      2400
weighted avg       0.57      0.69      0.62      2400



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

df['vader_score'] = df['cleaned_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/parthharpale/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=5)
topics = lda.fit_transform(X_tfidf)

In [18]:
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df['entities'] = df['cleaned_text'].apply(extract_entities)

In [19]:
import joblib
joblib.dump(lr, 'sentiment_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']