In [None]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

# Download necessary resources
nltk.download('punkt_tab')

def tokenize_text(text):
    """Split *text* into word-level tokens via NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Load the dataset from CSV (replace the path below with the location of your file).
df = pd.read_csv('/content/IMDB Dataset.csv')

# Tokenize every entry of the 'review' column (the column must exist in the CSV)
# and store the result in a new 'tokenized_review' column.
df['tokenized_review'] = df['review'].apply(tokenize_text)

# Show the first rows of the raw and tokenized reviews side by side.
print(df[['review', 'tokenized_review']].head())


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                    tokenized_review  
0  [One, of, the, other, reviewers, has, mentione...  
1  [A, wonderful, little, production, ., <, br, /...  
2  [I, thought, this, was, a, wonderful, way, to,...  
3  [Basically, there, 's, a, family, where, a, li...  
4  [Petter, Mattei, 's, ``, Love, in, the, Time, ...  


In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK data packages used in this cell:
# 'stopwords' for remove_stopwords(); 'wordnet' and 'omw-1.4' for the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Single shared lemmatizer instance, used by lemmatize_tokens() below.
lemmatizer = WordNetLemmatizer()

def remove_stopwords(tokens):
    """Remove English stopwords from a list of tokens.

    The lookup is case-insensitive: each token is lower-cased before it is
    checked against the stopword set, but surviving tokens are returned with
    their original casing and in their original order.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list containing only the tokens that are not stopwords.
    """
    # Build the stopword set once and cache it on the function object.
    # This function is applied row by row to a DataFrame, so rebuilding the
    # set from the corpus on every call is loop-invariant work.
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize_tokens(tokens):
    """Return a list with each token replaced by its lemma.

    Uses the module-level ``lemmatizer`` with its default arguments.
    """
    return list(map(lemmatizer.lemmatize, tokens))

def tokens_to_text(tokens):
    """Join a sequence of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(tokens)

# Fail early with a clear message if the DataFrame lacks the tokenized reviews
# (created in the previous cell) or the 'sentiment' label column.
if 'tokenized_review' not in df.columns or 'sentiment' not in df.columns:
    raise ValueError("Dataset must contain 'tokenized_review' and 'sentiment' columns.")

# Preprocessing, each step stored in its own column:
#   tokenized_review -> filtered_review   (stopwords removed)
#   filtered_review  -> lemmatized_review (tokens lemmatized)
#   lemmatized_review -> processed_text   (tokens joined back into one string)
df['filtered_review'] = df['tokenized_review'].apply(remove_stopwords)
df['lemmatized_review'] = df['filtered_review'].apply(lemmatize_tokens)
df['processed_text'] = df['lemmatized_review'].apply(tokens_to_text)

# Split into train and test sets (80% training, 20% testing).
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['sentiment'], test_size=0.2, random_state=42)

### MODEL TRAINING ###

# Logistic Regression Classifier: TF-IDF features feeding a logistic regression
# (max_iter raised to 1000). Fitted on the training split, then used to predict
# the test split.
pipeline_lr = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
pipeline_lr.fit(X_train, y_train)
y_pred_lr = pipeline_lr.predict(X_test)

# Naive Bayes Classifier: same setup with its own TfidfVectorizer instance and
# a multinomial naive Bayes model.
pipeline_nb = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipeline_nb.fit(X_train, y_train)
y_pred_nb = pipeline_nb.predict(X_test)

### MODEL EVALUATION ###
# Report accuracy and the per-class classification report on the test split
# for each model.
print("\n=== Logistic Regression Model ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

print("\n=== Naive Bayes Model ===")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

### PREDICT SENTIMENT FOR WHOLE DATASET ###
# Uses the logistic regression pipeline.
# NOTE: this covers every row of df, including the rows the pipeline was
# trained on, so agreement with 'sentiment' here is not a held-out measure.
df['predicted_sentiment'] = pipeline_lr.predict(df['processed_text'])

# Display sample predictions next to the true labels
print(df[['processed_text', 'sentiment', 'predicted_sentiment']].head())

# Save the full DataFrame (all columns, including the intermediate
# preprocessing columns) to CSV without the index.
df.to_csv("sentiment_predictions.csv", index=False)
print("Predictions saved to 'sentiment_predictions.csv'")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



=== Logistic Regression Model ===
Accuracy: 0.8964
              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000


=== Naive Bayes Model ===
Accuracy: 0.8665
              precision    recall  f1-score   support

    negative       0.85      0.88      0.87      4961
    positive       0.88      0.85      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

                                      processed_text sentiment  \
0  One reviewer mentioned watching 1 Oz episode '...  positive   
1  wonderful little production . < br / > < br / ...  positive   
2  thought wonderful way spend time hot summer we...