<a href="https://colab.research.google.com/github/hiteshjck/nlp/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---------------------------------------------------------------------------------------------------------**Import Libraries**------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
#import nltk
#nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

---------------------------------------------------------------------------------------------------------**Data Preprocessing**------------------------------------------------------------------------------

In [2]:
# Data preprocessing

# reading csv using pandas
from google.colab import drive
drive.mount('/content/drive/')
df = pd.read_csv('drive/MyDrive/nlp_data/AmazonReview.csv')

print(df.head())
print(len(df.index))

Mounted at /content/drive/
                                              Review  Sentiment
0  Fast shipping but this product is very cheaply...          1
1  This case takes so long to ship and it's not e...          1
2  Good for not droids. Not good for iPhones. You...          1
3  The cable was not compatible between my macboo...          1
4  The case is nice but did not have a glow light...          1
25000


In [5]:
# text cleaning
# 1. removing leading and trailing spaces
df["Review"] = df["Review"].str.strip()

# 2. Removing special characters
df["Review"] = df["Review"].str.replace("[\"$&+,:;=?@#|'<>.-^*()%!]", "")

# Handling missing values
# no missing values
df = df.dropna()

# tokenization
#df["tokenized review"] = df.apply(lambda row: nltk.word_tokenize(row["Review"]), axis=1)

# Lowercasing
df["Review"] = df["Review"].str.lower()

# Map scores to sentiments (e.g., positive, neutral, negative)
df['Sentiment value'] = df['Sentiment'].apply(lambda score: 'positive' if score > 3 else ('negative' if score < 3 else 'neutral'))
print(df['Sentiment value'].value_counts())

print(df.head())
print(len(df.index))

  df["Review"] = df["Review"].str.replace("[\"$&+,:;=?@#|'<>.-^*()%!]", "")


negative    10000
positive     9999
neutral      5000
Name: Sentiment value, dtype: int64
                                              Review  Sentiment  \
0  ast shipping but this product is very cheaply ...          1   
1  his case takes so long to ship and its not eve...          1   
2  ood for not droids ot good for ihones ou canno...          1   
3  he cable was not compatible between my macbook...          1   
4  he case is nice but did not have a glow light ...          1   

  Sentiment value  
0        negative  
1        negative  
2        negative  
3        negative  
4        negative  
24999


--------------------------------------------------------------------------------------------------------- **Naive Bayes** -------------------------------------------------------------------------------------------------------

In [4]:
# sentiment analysis implementation
# 1. Naive bayes algorithm

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Use TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Create TF-IDF matrices for training and testing data
X_train = vectorizer.fit_transform(train_data['Review'])
X_test = vectorizer.transform(test_data['Review'])

# Use a simple model (Naive Bayes) as a starting point
model = make_pipeline(MultinomialNB())
model.fit(X_train, train_data['Sentiment value'])

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(test_data['Sentiment value'], predictions))
print("\nClassification Report:\n", classification_report(test_data['Sentiment value'], predictions))

negative    10000
positive     9999
neutral      5000
Name: Sentiment value, dtype: int64
Accuracy: 0.6776

Classification Report:
               precision    recall  f1-score   support

    negative       0.69      0.82      0.75      2021
     neutral       0.45      0.04      0.07       985
    positive       0.67      0.84      0.75      1994

    accuracy                           0.68      5000
   macro avg       0.60      0.57      0.52      5000
weighted avg       0.64      0.68      0.62      5000



---------------------------------------------------------------------------------------- **Logistic Regression** ----------------------------------------------------------------------------------------------------

In [5]:
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(df['Review'] ).toarray()

df.loc[df['Sentiment']<=3,'Sentiment number'] = 0
df.loc[df['Sentiment']>3,'Sentiment number'] = 1

x_train ,x_test,y_train,y_test=train_test_split(X, df['Sentiment number'],
                                                test_size=0.25, random_state=42)

model=LogisticRegression()

#Model fitting
model.fit(x_train,y_train)

#testing the model
pred=model.predict(x_test)

#model accuracy
print(accuracy_score(y_test,pred))

df.head()

0.82256


Unnamed: 0,Review,Sentiment,Sentiment value,Sentiment number
0,ast shipping but this product is very cheaply ...,1,negative,0.0
1,his case takes so long to ship and its not eve...,1,negative,0.0
2,ood for not droids ot good for ihones ou canno...,1,negative,0.0
3,he cable was not compatible between my macbook...,1,negative,0.0
4,he case is nice but did not have a glow light ...,1,negative,0.0


--------------------------------------------------------------------------------- **Support Vector Machines** ----------------------------------------------------------------------------------------(slow)

In [6]:
newdf = df.copy()
drop_indices = np.random.choice(newdf.index, 10000, replace=False)
newdf = newdf.drop(drop_indices)
print(newdf['Sentiment value'].value_counts())
print(newdf.index)
newdf.head()

positive    5988
negative    5973
neutral     3038
Name: Sentiment value, dtype: int64
Int64Index([    0,     1,     3,     5,     6,     7,     8,     9,    11,
               14,
            ...
            24981, 24982, 24983, 24984, 24986, 24988, 24989, 24990, 24991,
            24999],
           dtype='int64', length=14999)


Unnamed: 0,Review,Sentiment,Sentiment value
0,ast shipping but this product is very cheaply ...,1,negative
1,his case takes so long to ship and its not eve...,1,negative
3,he cable was not compatible between my macbook...,1,negative
5,he cable keeps coming up with message that thi...,1,negative
6,his pos broke off in my phone after uses and ...,1,negative


In [7]:
# Step 1: Load and preprocess the dataset
text = newdf['Review'].values
labels = newdf['Sentiment value'].values

In [8]:
# Step 2: Split the dataset into training and testing sets
text_train, text_test, labels_train, labels_test = train_test_split(text, labels, test_size=0.2, random_state=42)

# Step 3: Convert text data into numerical feature vectors
vectorizer = CountVectorizer()
features_train = vectorizer.fit_transform(text_train)
features_test = vectorizer.transform(text_test)

# Step 4: Train the SVM model
svm = SVC(kernel='linear')
svm.fit(features_train, labels_train)

"""Step 6: Predict sentiment on new data
new_text = ["I love this movie!", "This product is terrible.", "The food was delicious."]
new_features = vectorizer.transform(new_text)
new_predictions = svm.predict(new_features)
print(new_predictions)"""

# Step 7: Generate the classification report to evaluate the model
predictions = svm.predict(features_test)
print("Accuracy:", accuracy_score(labels_test, predictions))
print(classification_report(labels_test, predictions))

Accuracy: 0.6466666666666666
              precision    recall  f1-score   support

    negative       0.68      0.74      0.71      1217
     neutral       0.34      0.30      0.32       600
    positive       0.75      0.73      0.74      1183

    accuracy                           0.65      3000
   macro avg       0.59      0.59      0.59      3000
weighted avg       0.64      0.65      0.64      3000



---------------------------------------------------------------------------------------- **Logistic Regression** ----------------------------------------------------------------------------------------------------