In [34]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv("data/IMDB_Dataset.csv")

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
# Label encoding
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [12]:
data['sentiment'].head()

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64

In [10]:
# Split the data into target and features
X = data['review']
y = data['sentiment']
# Do a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(X_train)

In [19]:
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=200)

In [23]:
model = keras.Sequential([
    keras.layers.Embedding(input_dim=10000, output_dim=32, input_length=250),
    keras.layers.LSTM(64, return_sequences=True),
    keras.layers.LSTM(32),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [24]:
model.fit(X_train, y_train, epochs=5, batch_size=64)

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 139ms/step - accuracy: 0.7328 - loss: 0.5088
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 134ms/step - accuracy: 0.8920 - loss: 0.2740
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 132ms/step - accuracy: 0.9095 - loss: 0.2265
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 134ms/step - accuracy: 0.9247 - loss: 0.1953
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 130ms/step - accuracy: 0.9351 - loss: 0.1725


<keras.src.callbacks.history.History at 0x1f4121f6240>

In [25]:
def predict_review(review):
    encoded_review = tokenizer.texts_to_sequences([review])
    padded_review = pad_sequences(encoded_review, maxlen=250, padding='post', truncating='post')
    prediction = model.predict(padded_review)[0][0]
    return "Positive" if prediction > 0.5 else "Negative"

In [26]:
sample_review = "This movie was absolutely fantastic! The storyline was engaging and the characters were well developed."
print(predict_review(sample_review))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step
Positive


In [30]:
# gradio interface
import gradio as gr

func = gr.Interface(
    fn=predict_review, 
    inputs=gr.Textbox(
    label="Enter your review here",
    lines=3,
    max_lines=5,
    interactive=True  
), 
    outputs=gr.Textbox(label='Review')
)

In [33]:
func.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [35]:
from transformers import pipeline

# Load NER pipeline
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")

# Example review
review = "The Dark Knight was an amazing movie, and Christian Bale was outstanding."

# Get named entities
entities = ner_pipeline(review)

# Print results
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']:.2f}")


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]




model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Entity: Dark, Label: B-MISC, Score: 1.00
Entity: Knight, Label: I-MISC, Score: 1.00
Entity: Christian, Label: B-PER, Score: 1.00
Entity: Ba, Label: I-PER, Score: 1.00
Entity: ##le, Label: I-PER, Score: 0.97


In [42]:
splits = {'train': 'train.parquet', 'validation': 'validation.parquet', 'test': 'test.parquet'}
df_new = pd.read_parquet("hf://datasets/cornell-movie-review-data/rotten_tomatoes/" + splits["train"], engine='fastparquet')

In [47]:
df_new = df_new.rename(columns={'text': 'review'})


In [48]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Load tokenizer and model
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=0)  # Use GPU if available

# Process reviews in batches
batch_size = 32  # Adjust based on memory
reviews = df_new["review"].tolist()

# Apply NER in batches
all_entities = []
for i in tqdm(range(0, len(reviews), batch_size)):
    batch = reviews[i : i + batch_size]
    results = ner_pipeline(batch)
    all_entities.extend(results)

df_new["named_entities"] = all_entities
df_new.to_csv("movie_reviews_with_entities.csv", index=False)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
100%|██████████| 267/267 [10:22<00:00,  2.33s/it]


In [62]:
df_reviews = pd.read_csv("movie_reviews_with_entities.csv")

In [50]:
df_reviews.head()

Unnamed: 0,review,label,named_entities
0,the rock is destined to be the 21st century's ...,1,[]
1,"the gorgeously elaborate continuation of "" the...",1,"[{'entity': 'B-PER', 'score': 0.92304903, 'ind..."
2,effective but too-tepid biopic,1,[]
3,if you sometimes like to go to the movies to h...,1,[]
4,"emerges as something rare , an issue movie tha...",1,[]


In [52]:
pd.set_option('display.max_colwidth', None)
df_reviews

Unnamed: 0,review,label,named_entities
0,"the rock is destined to be the 21st century's new "" conan "" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",1,[]
1,"the gorgeously elaborate continuation of "" the lord of the rings "" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .",1,"[{'entity': 'B-PER', 'score': 0.92304903, 'index': 31, 'word': 'pet', 'start': 155, 'end': 158}, {'entity': 'B-PER', 'score': 0.9222884, 'index': 33, 'word': 'jack', 'start': 161, 'end': 165}, {'entity': 'B-PER', 'score': 0.41237748, 'index': 47, 'word': '##lk', 'start': 204, 'end': 206}]"
2,effective but too-tepid biopic,1,[]
3,"if you sometimes like to go to the movies to have fun , wasabi is a good place to start .",1,[]
4,"emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .",1,[]
...,...,...,...
8525,any enjoyment will be hinge from a personal threshold of watching sad but endearing characters do extremely unconventional things .,0,[]
8526,"if legendary shlockmeister ed wood had ever made a movie about a vampire , it probably would look a lot like this alarming production , adapted from anne rice's novel the vampire chronicles .",0,[]
8527,"hardly a nuanced portrait of a young woman's breakdown , the film nevertheless works up a few scares .",0,[]
8528,"interminably bleak , to say nothing of boring .",0,[]


In [67]:
df_reviews = df_reviews[df_reviews['named_entities'].apply(lambda x: len(x) > 2)]
df_reviews

Unnamed: 0,review,label,named_entities
1,"the gorgeously elaborate continuation of "" the lord of the rings "" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .",1,"[{'entity': 'B-PER', 'score': 0.92304903, 'index': 31, 'word': 'pet', 'start': 155, 'end': 158}, {'entity': 'B-PER', 'score': 0.9222884, 'index': 33, 'word': 'jack', 'start': 161, 'end': 165}, {'entity': 'B-PER', 'score': 0.41237748, 'index': 47, 'word': '##lk', 'start': 204, 'end': 206}]"
9,take care of my cat offers a refreshingly different slice of asian cinema .,1,"[{'entity': 'B-MISC', 'score': 0.5999475, 'index': 15, 'word': 'as', 'start': 61, 'end': 63}, {'entity': 'B-MISC', 'score': 0.6693086, 'index': 16, 'word': '##ian', 'start': 63, 'end': 66}]"
28,"at about 95 minutes , treasure planet maintains a brisk pace as it races through the familiar story . however , it lacks grandeur and that epic quality often associated with stevenson's tale as well as with earlier disney efforts .",1,"[{'entity': 'B-PER', 'score': 0.93078464, 'index': 34, 'word': 's', 'start': 174, 'end': 175}, {'entity': 'B-PER', 'score': 0.8506029, 'index': 35, 'word': '##te', 'start': 175, 'end': 177}, {'entity': 'B-PER', 'score': 0.5502569, 'index': 36, 'word': '##ven', 'start': 177, 'end': 180}]"
37,"ms . fulford-wierzbicki is almost spooky in her sulky , calculating lolita turn .",1,"[{'entity': 'B-PER', 'score': 0.93638897, 'index': 8, 'word': 'w', 'start': 13, 'end': 14}, {'entity': 'B-PER', 'score': 0.9331462, 'index': 9, 'word': '##ier', 'start': 14, 'end': 17}, {'entity': 'B-PER', 'score': 0.86285675, 'index': 10, 'word': '##z', 'start': 17, 'end': 18}, {'entity': 'B-PER', 'score': 0.4013568, 'index': 11, 'word': '##bic', 'start': 18, 'end': 21}]"
46,gosling provides an amazing performance that dwarfs everything else in the film .,1,"[{'entity': 'B-PER', 'score': 0.96018916, 'index': 1, 'word': 'go', 'start': 0, 'end': 2}]"
...,...,...,...
8475,too bad kramer couldn't make a guest appearance to liven things up .,0,"[{'entity': 'B-PER', 'score': 0.9860117, 'index': 3, 'word': 'k', 'start': 8, 'end': 9}]"
8500,mctiernan's remake may be lighter on its feet -- the sober-minded original was as graceful as a tap-dancing rhino -- but it is just as boring and as obvious .,0,"[{'entity': 'B-PER', 'score': 0.8695813, 'index': 1, 'word': 'm', 'start': 0, 'end': 1}, {'entity': 'B-PER', 'score': 0.6427288, 'index': 2, 'word': '##ct', 'start': 1, 'end': 3}, {'entity': 'B-PER', 'score': 0.41811576, 'index': 3, 'word': '##ier', 'start': 3, 'end': 6}]"
8511,"the film doesn't really care about the thousands of americans who die hideously , it cares about how ryan meets his future wife and makes his start at the cia .",0,"[{'entity': 'B-MISC', 'score': 0.57740635, 'index': 12, 'word': 'am', 'start': 52, 'end': 54}, {'entity': 'B-PER', 'score': 0.9391809, 'index': 24, 'word': 'r', 'start': 101, 'end': 102}]"
8519,"like its title character , esther kahn is unusual but unfortunately also irritating .",0,"[{'entity': 'B-PER', 'score': 0.65784127, 'index': 6, 'word': 'est', 'start': 27, 'end': 30}]"


In [68]:
import spacy

# Load English NLP model
nlp = spacy.load("en_core_web_sm")

# List of movie reviews
reviews = df_reviews["review"].tolist()

# Process each review and extract entities
for review in reviews:
    doc = nlp(review)
    print(f"Review: {review}")
    for ent in doc.ents:
        print(f"  Entity: {ent.text}, Label: {ent.label_}")
    print("-" * 50)


Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .
  Entity: peter jackson, Label: PERSON
  Entity: tolkien, Label: PERSON
--------------------------------------------------
Review: take care of my cat offers a refreshingly different slice of asian cinema .
  Entity: asian, Label: NORP
--------------------------------------------------
Review: at about 95 minutes , treasure planet maintains a brisk pace as it races through the familiar story . however , it lacks grandeur and that epic quality often associated with stevenson's tale as well as with earlier disney efforts .
  Entity: about 95 minutes, Label: TIME
  Entity: stevenson, Label: PERSON
--------------------------------------------------
Review: ms . fulford-wierzbicki is almost spooky in her sulky , calculating lolita turn .
  Entity: fulford-wier