In [13]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import spacy

In [3]:
data = pd.read_csv("data/IMDB_Dataset.csv")

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Label encoding
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [12]:
data['sentiment'].head()

0    1
1    1
2    1
3    0
4    1
Name: sentiment, dtype: int64

In [5]:
# Split the data into target and features
X = data['review']
y = data['sentiment']
# Do a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(X_train)

In [7]:
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=200)

In [8]:
model = keras.Sequential([
    keras.layers.Embedding(input_dim=10000, output_dim=32, input_length=250),
    keras.layers.LSTM(64, return_sequences=True),
    keras.layers.LSTM(32),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
model.fit(X_train, y_train, epochs=5, batch_size=64)

Epoch 1/5
[1m 89/625[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:26[0m 161ms/step - accuracy: 0.5570 - loss: 0.6741

KeyboardInterrupt: 

In [25]:
def predict_review(review):
    encoded_review = tokenizer.texts_to_sequences([review])
    padded_review = pad_sequences(encoded_review, maxlen=250, padding='post', truncating='post')
    prediction = model.predict(padded_review)[0][0]
    return "Positive" if prediction > 0.5 else "Negative"

In [26]:
sample_review = "This movie was absolutely fantastic! The storyline was engaging and the characters were well developed."
print(predict_review(sample_review))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499ms/step
Positive


In [30]:
# gradio interface
import gradio as gr

func = gr.Interface(
    fn=predict_review, 
    inputs=gr.Textbox(
    label="Enter your review here",
    lines=3,
    max_lines=5,
    interactive=True  
), 
    outputs=gr.Textbox(label='Review')
)

In [None]:
func.launch(share=True)

In [None]:
from transformers import pipeline

# Load NER pipeline
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")

# Example review
review = "The Dark Knight was an amazing movie, and Christian Bale was outstanding."

# Get named entities
entities = ner_pipeline(review)

# Print results
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Score: {entity['score']:.2f}")


In [10]:
splits = {'train': 'train.parquet', 'validation': 'validation.parquet', 'test': 'test.parquet'}
df_new = pd.read_parquet("hf://datasets/cornell-movie-review-data/rotten_tomatoes/" + splits["train"], engine='fastparquet')

In [11]:
df_new = df_new.rename(columns={'text': 'review'})


In [12]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Load tokenizer and model
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=0)  # Use GPU if available

# Process reviews in batches
batch_size = 32  # Adjust based on memory
reviews = df_new["review"].tolist()

# Apply NER in batches
all_entities = []
for i in tqdm(range(0, len(reviews), batch_size)):
    batch = reviews[i : i + batch_size]
    results = ner_pipeline(batch)
    all_entities.extend(results)

df_new["named_entities"] = all_entities
df_new.to_csv("movie_reviews_with_entities.csv", index=False)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'pipeline' is not defined

In [17]:
df_reviews = pd.read_csv("movie_reviews_with_entities.csv")

In [50]:
df_reviews.head()

Unnamed: 0,review,label,named_entities
0,the rock is destined to be the 21st century's ...,1,[]
1,"the gorgeously elaborate continuation of "" the...",1,"[{'entity': 'B-PER', 'score': 0.92304903, 'ind..."
2,effective but too-tepid biopic,1,[]
3,if you sometimes like to go to the movies to h...,1,[]
4,"emerges as something rare , an issue movie tha...",1,[]


In [52]:
pd.set_option('display.max_colwidth', None)
df_reviews

Unnamed: 0,review,label,named_entities
0,"the rock is destined to be the 21st century's new "" conan "" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .",1,[]
1,"the gorgeously elaborate continuation of "" the lord of the rings "" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .",1,"[{'entity': 'B-PER', 'score': 0.92304903, 'index': 31, 'word': 'pet', 'start': 155, 'end': 158}, {'entity': 'B-PER', 'score': 0.9222884, 'index': 33, 'word': 'jack', 'start': 161, 'end': 165}, {'entity': 'B-PER', 'score': 0.41237748, 'index': 47, 'word': '##lk', 'start': 204, 'end': 206}]"
2,effective but too-tepid biopic,1,[]
3,"if you sometimes like to go to the movies to have fun , wasabi is a good place to start .",1,[]
4,"emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .",1,[]
...,...,...,...
8525,any enjoyment will be hinge from a personal threshold of watching sad but endearing characters do extremely unconventional things .,0,[]
8526,"if legendary shlockmeister ed wood had ever made a movie about a vampire , it probably would look a lot like this alarming production , adapted from anne rice's novel the vampire chronicles .",0,[]
8527,"hardly a nuanced portrait of a young woman's breakdown , the film nevertheless works up a few scares .",0,[]
8528,"interminably bleak , to say nothing of boring .",0,[]


In [16]:
df_reviews = df_reviews[df_reviews['named_entities'].apply(lambda x: len(x) > 2)]
df_reviews

NameError: name 'df_reviews' is not defined

In [18]:
import spacy

# Load English NLP model
nlp = spacy.load("en_core_web_sm")

# List of movie reviews
reviews = df_reviews["review"].tolist()

In [None]:
# Process each review and extract entities
for review in reviews:
    doc = nlp(review)
    print(f"Review: {review}")
    for ent in doc.ents:
        print(f"  Entity: {ent.text}, Label: {ent.label_}")
    print("-" * 50)

In [None]:
reviews_imdb = data["review"].tolist()
# Process each review and extract entities
for review in reviews_imdb:
    doc = nlp(review)
    print(f"Review: {review}")
    for ent in doc.ents:
        print(f"  Entity: {ent.text}, Label: {ent.label_}")
    print("-" * 50)

In [27]:
df_reviews['count'] = None

In [42]:
for review in reviews:
    count = 0
    doc = nlp(review)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            count += 1
    df_reviews['count'][review] = count

In [44]:
# Iterate through the DataFrame using `iterrows()`
for idx, row in df_reviews.iterrows():
    count = 0
    doc = nlp(row["review"])  # Process the review text
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            count += 1
    df_reviews.at[idx, "count"] = count  # Assign count correctly
print(df_reviews)

                                                 review  label count
0     the rock is destined to be the 21st century's ...      1     2
1     the gorgeously elaborate continuation of " the...      1     2
2                        effective but too-tepid biopic      1     0
3     if you sometimes like to go to the movies to h...      1     0
4     emerges as something rare , an issue movie tha...      1     0
...                                                 ...    ...   ...
8525  any enjoyment will be hinge from a personal th...      0     0
8526  if legendary shlockmeister ed wood had ever ma...      0     2
8527  hardly a nuanced portrait of a young woman's b...      0     0
8528    interminably bleak , to say nothing of boring .      0     0
8529  things really get weird , though not particula...      0     0

[8530 rows x 3 columns]


In [49]:
df_reviews_positive = df_reviews.drop(df_reviews[df_reviews['label'] == 0].index)
df_reviews_positive['count'].value_counts()

count
0    3626
1     528
2      94
3      13
4       3
5       1
Name: count, dtype: int64

In [50]:
df_reviews_negative = df_reviews.drop(df_reviews[df_reviews['label'] == 1].index)
df_reviews_negative['count'].value_counts()

count
0    3710
1     489
2      55
3      10
4       1
Name: count, dtype: int64

In [51]:
df_reviews['all_entities_count'] = None

In [52]:
for idx, row in df_reviews.iterrows():
    count = 0
    doc = nlp(row["review"])
    for ent in doc.ents:
        count += 1
    df_reviews.at[idx, "all_entities_count"] = count
print(df_reviews)

                                                 review  label count  \
0     the rock is destined to be the 21st century's ...      1     2   
1     the gorgeously elaborate continuation of " the...      1     2   
2                        effective but too-tepid biopic      1     0   
3     if you sometimes like to go to the movies to h...      1     0   
4     emerges as something rare , an issue movie tha...      1     0   
...                                                 ...    ...   ...   
8525  any enjoyment will be hinge from a personal th...      0     0   
8526  if legendary shlockmeister ed wood had ever ma...      0     2   
8527  hardly a nuanced portrait of a young woman's b...      0     0   
8528    interminably bleak , to say nothing of boring .      0     0   
8529  things really get weird , though not particula...      0     0   

     all_entities_count  
0                     3  
1                     2  
2                     0  
3                     0  
4    

In [53]:
df_reviews_positive2 = df_reviews.drop(df_reviews[df_reviews['label'] == 0].index)
df_reviews_positive2['all_entities_count'].value_counts()

all_entities_count
0    2639
1    1108
2     381
3     104
4      22
5       8
6       2
7       1
Name: count, dtype: int64

In [54]:
df_reviews_negative2 = df_reviews.drop(df_reviews[df_reviews['label'] == 1].index)
df_reviews_negative2['all_entities_count'].value_counts()

all_entities_count
0    2850
1    1017
2     286
3      83
4      24
5       4
6       1
Name: count, dtype: int64