<a href="https://colab.research.google.com/github/hiteshjck/nlp/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---------------------------------------------------------------------------------------------------------**Data Preprocessing**------------------------------------------------------------------------------

In [None]:
# Data preprocessing

# reading csv using pandas
from google.colab import drive
drive.mount('/content/drive/')


import pandas as pd
df = pd.read_csv('drive/MyDrive/nlp_data/AmazonReview.csv')

print(df.head())
print(len(df.index))

Mounted at /content/drive/
                                              Review  Sentiment
0  Fast shipping but this product is very cheaply...          1
1  This case takes so long to ship and it's not e...          1
2  Good for not droids. Not good for iPhones. You...          1
3  The cable was not compatible between my macboo...          1
4  The case is nice but did not have a glow light...          1
25000


In [None]:
# text cleaning
# 1. removing leading and trailing spaces
df["Review"] = df["Review"].str.strip()

# 2. Removing special characters
df["Review"] = df["Review"].str.replace("[\"$&+,:;=?@#|'<>.-^*()%!]", "")

# Handling missing values
# no missing values
df = df.dropna()

# tokenization
#import nltk
#nltk.download('punkt')
#df["tokenized review"] = df.apply(lambda row: nltk.word_tokenize(row["Review"]), axis=1)

# Lowercasing
df["Review"] = df["Review"].str.lower()

print(df.head())
print(len(df.index))

  df["Review"] = df["Review"].str.replace("[\"$&+,:;=?@#|'<>.-^*()%!]", "")


                                              Review  Sentiment
0  ast shipping but this product is very cheaply ...          1
1  his case takes so long to ship and its not eve...          1
2  ood for not droids ot good for ihones ou canno...          1
3  he cable was not compatible between my macbook...          1
4  he case is nice but did not have a glow light ...          1
24999


--------------------------------------------------------------------------------------------------------- **Naive Bayes** -------------------------------------------------------------------------------------------------------

In [None]:
# sentiment analysis implementation
# 1. Naive bayes algorithm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline

# Map scores to sentiments (e.g., positive, neutral, negative)
df['Sentiment value'] = df['Sentiment'].apply(lambda score: 'positive' if score > 3 else ('negative' if score < 3 else 'neutral'))
print(df['Sentiment value'].value_counts())

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Use TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Create TF-IDF matrices for training and testing data
X_train = vectorizer.fit_transform(train_data['Review'])
X_test = vectorizer.transform(test_data['Review'])

# Use a simple model (Naive Bayes) as a starting point
model = make_pipeline(MultinomialNB())
model.fit(X_train, train_data['Sentiment value'])

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(test_data['Sentiment value'], predictions))
print("\nClassification Report:\n", classification_report(test_data['Sentiment value'], predictions))

negative    10000
positive     9999
neutral      5000
Name: Sentiment value, dtype: int64
Accuracy: 0.6776

Classification Report:
               precision    recall  f1-score   support

    negative       0.69      0.82      0.75      2021
     neutral       0.45      0.04      0.07       985
    positive       0.67      0.84      0.75      1994

    accuracy                           0.68      5000
   macro avg       0.60      0.57      0.52      5000
weighted avg       0.64      0.68      0.62      5000



---------------------------------------------------------------------------------------- **Logistic Regression** ----------------------------------------------------------------------------------------------------

In [None]:
cv = TfidfVectorizer(max_features=2500)
X = cv.fit_transform(df['Review'] ).toarray()

df.loc[df['Sentiment']<=3,'Sentiment number'] = 0
df.loc[df['Sentiment']>3,'Sentiment number'] = 1

x_train ,x_test,y_train,y_test=train_test_split(X, df['Sentiment number'],
                                                test_size=0.25, random_state=42)

from sklearn.linear_model import LogisticRegression
model=LogisticRegression()

#Model fitting
model.fit(x_train,y_train)

#testing the model
pred=model.predict(x_test)

#model accuracy
print(accuracy_score(y_test,pred))

df.head()

0.82256


Unnamed: 0,Review,Sentiment,Sentiment value,Sentiment number
0,ast shipping but this product is very cheaply ...,1,negative,0.0
1,his case takes so long to ship and its not eve...,1,negative,0.0
2,ood for not droids ot good for ihones ou canno...,1,negative,0.0
3,he cable was not compatible between my macbook...,1,negative,0.0
4,he case is nice but did not have a glow light ...,1,negative,0.0


---------------------------------------------------------------------------------------- **Support Vector Machine** ----------------------------------------------------------------------------------------------------

In [None]:
from sklearn.svm import SVC
svm_model = SVC()
X_train, X_test, y_train, y_test = train_test_split(df['Review'],
                                df['Sentiment value'], test_size=0.2, random_state=42)
# Create TF-IDF matrices for training and testing data
X_train = vectorizer.fit_transform(train_data['Review'])
X_test = vectorizer.transform(test_data['Review'])

svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

# Evaluate SVM
print("SVM Accuracy:", accuracy_score(y_test, svm_predictions))
print("\nClassification Report:\n", classification_report(y_test, svm_predictions))

KeyboardInterrupt: ignored