                            ML based SE Project

                            "Sentimental Analysis" 
                                of text data

Import libraries

In [89]:
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Load Dataset

In [90]:
# Load the dataset
imdb_data = pd.read_csv('Dataset.csv')

In [91]:
#Print sample data
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [92]:
#Summary of the dataset
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


Cleaning Data

In [93]:
# Clean the dataset
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stemmer = PorterStemmer()

In [94]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


In [95]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [96]:
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

In [97]:
def remove_special_characters(text):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    return text

In [98]:
def simple_stemmer(text):
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text


In [99]:

def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [100]:
def clean_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    text = simple_stemmer(text)
    text = remove_stopwords(text)
    return text

Apply function on review column

In [101]:
imdb_data['clean_review'] = imdb_data['review'].apply(clean_text)

Training Data

In [None]:
# Train the machine learning model
cv = CountVectorizer(ngram_range=(1, 3))
tv = TfidfVectorizer(ngram_range=(1, 3))
lb = LabelBinarizer()

In [None]:
cv_train_reviews = cv.fit_transform(imdb_data['clean_review'][:40000])
tv_train_reviews = tv.fit_transform(imdb_data['clean_review'][:40000])
train_sentiments = lb.fit_transform(imdb_data['sentiment'][:40000])

In [None]:
lr_bow = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)
lr_tfidf = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

In [None]:
lr_bow.fit(cv_train_reviews, train_sentiments)
lr_tfidf.fit(tv_train_reviews, train_sentiments)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Predict the sentiments

In [None]:
# Define a function to predict the sentiment of the given text
def predict_sentiment(text):
    cleaned_text = clean_text(text)
    cv_text = cv.transform([cleaned_text])
    tv_text = tv.transform([cleaned_text])
    lr_bow_prediction = lr_bow.predict(cv_text)[0]
    lr_tfidf_prediction = lr_tfidf.predict(tv_text)[0]
    if lr_bow_prediction == lr_tfidf_prediction:
        return lr_bow_prediction
    else:
        # If the predictions from both models do not match, we return the prediction with higher confidence
        lr_bow_confidence = np.max(lr_bow.predict_proba(cv_text))
        lr_tfidf_confidence = np.max(lr_tfidf.predict_proba(tv_text))
        if lr_bow_confidence >= lr_tfidf_confidence:
            return lr_bow_prediction
        else:
            return lr_tfidf_prediction


Accuracy

In [None]:
# Split the data into training and testing sets
X_train = imdb_data['clean_review'][:40000]
X_test = imdb_data['clean_review'][40000:]
y_train = imdb_data['sentiment'][:40000]
y_test = imdb_data['sentiment'][40000:]

In [None]:
# Vectorize the data
cv = CountVectorizer(ngram_range=(1, 3))
cv_train = cv.fit_transform(X_train)
cv_test = cv.transform(X_test)

In [None]:
# Train the model
lr_bow = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)
lr_bow.fit(cv_train, y_train)

In [None]:
# Predict on the test data
y_pred = lr_bow.predict(cv_test)

In [None]:
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9012


Sample Test

In [None]:
text = "I loved the movie."
print(predict_sentiment(text))

1


In [None]:
text = "I hate the movie."
print(predict_sentiment(text))

0


GUI

In [None]:
class SentimentAnalysisGUI:
    
    def __init__(self):
        # Create the main window
        self.root = tk.Tk()
        self.root.title("Sentiment Analysis GUI")
        
        # Create a label widget for the input text
        self.input_label = tk.Label(self.root, text="Enter some text:")
        self.input_label.pack()
        
        # Create an entry widget for the input text
        self.input_entry = tk.Entry(self.root)
        self.input_entry.pack()
        
        # Create a button widget to analyze the sentiment
        self.analyze_button = tk.Button(self.root, text="Analyze Sentiment", command=self.analyze_sentiment)
        self.analyze_button.pack()
        
        # Create a label widget to display the sentiment result
        self.result_label = tk.Label(self.root, text="")
        self.result_label.pack()
        
        # Create a label widget to display previous input text
        self.previous_input_label = tk.Label(self.root, text="Previous input:")
        self.previous_input_label.pack()
        
        # Create a text widget to display previous input text
        self.previous_input_text = tk.Text(self.root, height=5, state="disabled")
        self.previous_input_text.pack()
        
        # Create a label widget to display previous sentiment result
        self.previous_result_label = tk.Label(self.root, text="Previous sentiment result:")
        self.previous_result_label.pack()
        
        # Create a text widget to display previous sentiment result
        self.previous_result_text = tk.Text(self.root, height=1, state="disabled")
        self.previous_result_text.pack()
        
        # Initialize an empty list to store the sentiment results
        self.sentiments = []
        
        # Run the main event loop
        self.root.mainloop()
    
    # Define the analyze_sentiment method
    def analyze_sentiment(self):
        # Get the input text from the entry widget
        input_text = self.input_entry.get()
        
        # Predict the sentiment of the input text using the predict_sentiment function
        sentiment = predict_sentiment(input_text)
        
        # Display the sentiment result in the result label widget
        self.result_label.config(text="Sentiment: {}".format(sentiment))
        
        # Display the previous input text and sentiment result in the text widgets
        self.previous_input_text.configure(state="normal")
        self.previous_input_text.insert("end", input_text + "\n")
        self.previous_input_text.configure(state="disabled")
        
        self.previous_result_text.configure(state="normal")
        self.previous_result_text.delete("1.0", "end")
        self.previous_result_text.insert("end", sentiment)
        self.previous_result_text.configure(state="disabled")
        
        # Store the sentiment result in the list
        self.sentiments.append(sentiment)


Run Application

In [None]:
gui = SentimentAnalysisGUI()