<a href="https://colab.research.google.com/github/ireddybsshr-17/ireddybsshr-17/blob/main/Social_Media_Sentiment_Analysis_ML_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTING NECESSARY LIBRARIES

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
import random

DEFINING NECESSARY CLASSES

In [20]:
class Sentiment:
  negative="NEGATIVE"
  positive="POSITIVE"
class Review:
  def __init__(self,text,score):
    self.text=text
    self.score=score
    self.sentiment=self.get_sentiment()
  def get_sentiment(self):
    if self.score<3:
      return Sentiment.negative
    else:#when score is 3 or 4 or 5
      return Sentiment.positive
class ReviewContainer:
  def __init__(self,reviews):
    self.reviews=reviews
  def evenly_distibute(self):
    negative=list(filter(lambda x:x.sentiment==Sentiment.negative,self.reviews))
    positive=list(filter(lambda x:x.sentiment==Sentiment.positive,self.reviews))
    #as data contains more positive data sets,we will reduce them into number of negative one's for better accuracy
    positive_shrunk=positive[:len(negative)]
    self.reviews=negative+positive_shrunk
    random.shuffle(self.reviews)

LOADING AND UNDERSTANDING DATA

In [21]:
reviews=[]
with open("/content/Books_small_10000.json") as f:
  for line in f:
    review_json=json.loads(line)
    review=Review(review_json['reviewText'], review_json['overall'])
    reviews.append(review)

In [22]:
reviews[7].text
reviews[7].score
reviews[7].sentiment

'POSITIVE'

In [23]:
from sklearn.model_selection import train_test_split
training,testing=train_test_split(reviews,test_size=0.33)

In [24]:
len(training)
len(testing)

3300

In [15]:
#evenly distributing data for better accuracies
traincontainer=ReviewContainer(training)
traincontainer.evenly_distibute()
testcontainer=ReviewContainer(testing)
testcontainer.evenly_distibute()

In [25]:
#splitting data
x_train=[x.text for x in traincontainer.reviews]
x_test=[x.text for x in testcontainer.reviews]
y_train=[y.sentiment for y in traincontainer.reviews]
y_test=[y.sentiment for y in testcontainer.reviews]

In [27]:
y_train.count(Sentiment.positive)
y_train.count(Sentiment.negative)


434

In [28]:
#here we can also use count vectorizer but ,tfidf vectorizer best suits the problem
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer()
x_train_vector=vectorizer.fit_transform(x_train)
x_test_vector=vectorizer.transform(x_test)

SUPPORT VECTOR MACHINES

In [29]:
from sklearn.svm import SVC
clf_svm=SVC()
clf_svm.fit(x_train_vector,y_train)
y_pred=clf_svm.predict(x_test_vector)

from sklearn.metrics import accuracy_score,f1_score
clf_svm_score=accuracy_score(y_test,y_pred)
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))

0.8142857142857143
[0.80597015 0.82191781]


DECISION TREES

In [30]:
from sklearn.tree import DecisionTreeClassifier
clf_dec=DecisionTreeClassifier()
clf_dec.fit(x_train_vector,y_train)
y_pred=clf_dec.predict(x_test_vector)

from sklearn.metrics import accuracy_score,f1_score
clf_dec_score=accuracy_score(y_test,y_pred)
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))

0.6738095238095239
[0.66666667 0.68065268]


LOGISTIC REGRESSION

In [32]:
from sklearn.linear_model import LogisticRegression
clf_log=LogisticRegression()
clf_log.fit(x_train_vector,y_train)
y_pred=clf_log.predict(x_test_vector)

from sklearn.metrics import accuracy_score,f1_score
clf_log_score=accuracy_score(y_test,y_pred)
print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))

0.7928571428571428
[0.78832117 0.7972028 ]


from the above calculations,we can clearly say that logistic regression best suits the problem

In [None]:
#predicting few random samples whether positive or negative
clf_log.predict(vectorizer.transform(["very bad","i loved it","awesome","worst one"]))

array(['NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')