In [1]:
# Import necessary libraries
import pandas as pd
import re
import spacy
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from transformers import pipeline



In [2]:
# Load the dataset
data = pd.read_csv('IMDB_Dataset.csv')

# Data Cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

data['review'] = data['review'].apply(clean_text)

# Convert sentiment labels to binary format
data['sentiment'] = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)


In [3]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)

# Text Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [4]:
# Initialize the models

svm_model = SVC(kernel='linear')
# Train the models

svm_model.fit(X_train_tfidf, y_train)


# Predict on the test set
svm_predictions = svm_model.predict(X_test_tfidf)


# Evaluate the models

print("SVM Accuracy:", accuracy_score(y_test, svm_predictions))


# Classification reports
print("SVM Classification Report:\n", classification_report(y_test, svm_predictions))

# Testing SVM Model on the dataset
print("SVM Predictions:\n", svm_predictions)
print("SVM Accuracy:", accuracy_score(y_test, svm_predictions))
print("SVM Classification Report:\n", classification_report(y_test, svm_predictions))



SVM Accuracy: 0.8921
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

SVM Predictions:
 [0 1 0 ... 1 0 1]
SVM Accuracy: 0.8921
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [5]:
print("\nSVM Predictions and Actual Results:")
for i, (prediction, actual) in enumerate(zip(svm_predictions, y_test)):
    print(f"Review {i+1}: Prediction={prediction}, Actual={actual}")


SVM Predictions and Actual Results:
Review 1: Prediction=0, Actual=1
Review 2: Prediction=1, Actual=1
Review 3: Prediction=0, Actual=0
Review 4: Prediction=1, Actual=1
Review 5: Prediction=0, Actual=0
Review 6: Prediction=1, Actual=1
Review 7: Prediction=1, Actual=1
Review 8: Prediction=0, Actual=1
Review 9: Prediction=0, Actual=0
Review 10: Prediction=0, Actual=0
Review 11: Prediction=0, Actual=0
Review 12: Prediction=1, Actual=1
Review 13: Prediction=0, Actual=0
Review 14: Prediction=0, Actual=0
Review 15: Prediction=0, Actual=0
Review 16: Prediction=1, Actual=1
Review 17: Prediction=1, Actual=1
Review 18: Prediction=1, Actual=1
Review 19: Prediction=1, Actual=1
Review 20: Prediction=1, Actual=1
Review 21: Prediction=0, Actual=1
Review 22: Prediction=1, Actual=1
Review 23: Prediction=1, Actual=1
Review 24: Prediction=0, Actual=1
Review 25: Prediction=1, Actual=1
Review 26: Prediction=1, Actual=1
Review 27: Prediction=1, Actual=1
Review 28: Prediction=1, Actual=1
Review 29: Predictio

In [6]:
# Save the trained SVM model and TF-IDF vectorizer
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [7]:
# Aspect-Based Sentiment Analysis
# Load Spacy model
nlp = spacy.load('en_core_web_sm')

In [11]:
from transformers import pipeline

# Specify model and tokenizer explicitly
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

sentiment_pipeline = pipeline('sentiment-analysis')

# Example usage
texts = ["I love this movie!", "I hate this movie."]
results = sentiment_pipeline(texts)

for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Label: {result['label']}, Score: {result['score']}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Text: I love this movie!
Label: POSITIVE, Score: 0.9998775720596313
Text: I hate this movie.
Label: NEGATIVE, Score: 0.9996869564056396


In [10]:


# Define aspects
aspects = ['acting', 'direction', 'storyline', 'special effects']

# Function to identify aspects in a review
def identify_aspects(review):
    doc = nlp(review)
    review_aspects = []
    for token in doc:
        if token.lemma_ in aspects:
            review_aspects.append(token.lemma_)
    return review_aspects

data['aspects'] = data['review'].apply(identify_aspects)

# Load sentiment analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis')


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [16]:
# Function to get sentiment for each aspect
def aspect_sentiment(review, aspects):
    sentiments = {}
    max_length = 512  # Maximum token length for the model

    for aspect in aspects:
        if aspect in review:
            # Find sentences containing the aspect
            aspect_sentence = [sent for sent in review.split('.') if aspect in sent]
            if aspect_sentence:
                # Join sentences and truncate to max_length
                joined_sentence = ' '.join(aspect_sentence)
                if len(joined_sentence) > max_length:
                    joined_sentence = joined_sentence[:max_length]

                # Get sentiment
                sentiment_result = sentiment_pipeline(joined_sentence)
                sentiments[aspect] = sentiment_result[0]['label']
    return sentiments


# Apply sentiment analysis for each review's aspects
data['aspect_sentiments'] = data.apply(lambda x: aspect_sentiment(x['review'], x['aspects']), axis=1)

# Display some results
for i in range(10):
    print(f"Review {i+1}:")
    print(data['review'].iloc[i])
    print("Aspects Identified:", data['aspects'].iloc[i])
    print("Aspect Sentiments:", data['aspect_sentiments'].iloc[i])
    print("\n")

Review 1:
one of the other reviewers has mentioned that after watching just  oz episode youll be hooked they are right as this is exactly what happened with me  the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word  it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far away  i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget

In [None]:
# Load the trained SVM model and TF-IDF vectorizer for real-time predictions
svm_model = joblib.load('svm_model.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Define a function for real-time prediction
def preprocess_and_predict(review):
    cleaned_review = clean_text(review)
    transformed_review = tfidf.transform([cleaned_review])
    sentiment = svm_model.predict(transformed_review)
    sentiment_label = 'positive' if sentiment == 1 else 'negative'
    return sentiment_label


In [None]:
# Example usage for a new review
new_review = "The acting was fantastic but the storyline was a bit dull."
predicted_sentiment = preprocess_and_predict(new_review)
print(f"Predicted Sentiment: {predicted_sentiment}")

# Simulate receiving new reviews and predicting their sentiments
new_reviews = [
    "The special effects were stunning and the plot was engaging!",
    "I found the movie to be quite boring and predictable.",
    "Great performances by the cast but the direction was weak.",
    "An absolute masterpiece! Thoroughly enjoyed every moment."
]

for review in new_reviews:
    predicted_sentiment = preprocess_and_predict(review)
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {predicted_sentiment}\n")