In [None]:
pip install pandas numpy scikit-learn nltk spacy tensorflow torch transformers fastapi uvicorn
python -m spacy download en_core_web_sm


In [None]:
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the IMDB reviews CSV into a DataFrame
df = pd.read_csv("IMDB Dataset.csv")

# Fetch the NLTK corpora this notebook asks for (stop-word lists, WordNet)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Build the spaCy pipeline and the NLTK lemmatizer
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text, nlp_model=None):
    """Normalize one review into a space-joined string of lemmas.

    Steps: strip HTML tags, lowercase, replace non-word characters with
    spaces, run the text through the NLP pipeline, drop stop words and join
    the remaining lemmas.

    Args:
        text: Raw review text. Non-string values (e.g. NaN coming from an
            empty CSV cell) yield an empty string instead of raising.
        nlp_model: Optional callable mapping a string to an iterable of
            tokens that expose ``lemma_`` and ``is_stop``. Defaults to the
            notebook-level ``nlp`` pipeline.

    Returns:
        str: The kept lemmas joined with single spaces.
    """
    if not isinstance(text, str):
        return ''
    pipeline = nlp if nlp_model is None else nlp_model

    # Replace tags with a space (not ''), otherwise "good<br />movie" fuses
    # into the single bogus token "goodmovie".
    text = re.sub(r'<.*?>', ' ', text)
    # Collapse each run of non-word characters into one space and trim the
    # ends, so the pipeline never sees runs of whitespace.
    text = re.sub(r'\W+', ' ', text.lower()).strip()

    doc = pipeline(text)
    words = [token.lemma_ for token in doc if not token.is_stop]  # Lemmatization & stopword removal
    return ' '.join(words)

# Apply preprocessing to reviews.
# preprocess_text only reads token.lemma_ and token.is_stop, so the dependency
# parser and named-entity recognizer are switched off for this pass; running them
# on every review is by far the most expensive part of the pipeline. They are
# restored automatically when the `with` block exits.
unused_pipes = [name for name in ("parser", "ner") if name in nlp.pipe_names]
with nlp.select_pipes(disable=unused_pipes):
    df['review'] = df['review'].apply(preprocess_text)

# Split data into training and testing sets (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)

# Apply TF-IDF vectorization: the vocabulary (top 5000 terms) is learned from the
# training split only and then reused, unchanged, for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Linear-kernel SVM trained on the TF-IDF features (fit() returns the estimator itself)
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred_svm = svm_model.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy}")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters and a fixed seed, trained on the
# TF-IDF features (fit() returns the estimator itself)
rf_model = RandomForestClassifier(random_state=42).fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred_rf = rf_model.predict(X_test_tfidf)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy}")


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize and pad sequences.
# The word index is built from the training split only; every review is then
# padded/truncated to 200 token ids.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=200)
X_test_pad = pad_sequences(X_test_seq, maxlen=200)

# Keras needs numeric targets, but y_train / y_test are the untouched 'sentiment'
# column from the CSV. Map each class label to an integer id in sorted label order
# (assuming the column holds the strings 'negative'/'positive', that gives 0 and 1;
# a column that is already 0/1 is left as-is).
label_ids = {label: idx for idx, label in enumerate(sorted(y_train.unique()))}
y_train_enc = y_train.map(label_ids).to_numpy(dtype="float32")
y_test_enc = y_test.map(label_ids).to_numpy(dtype="float32")

# LSTM Model: embedding -> single LSTM layer -> sigmoid output for binary sentiment
lstm_model = Sequential([
    Embedding(5000, 128, input_length=200),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE: validation_data is the test split, so the per-epoch val_* figures are
# computed on the same data as the final evaluation below.
lstm_model.fit(X_train_pad, y_train_enc, epochs=3, batch_size=64, validation_data=(X_test_pad, y_test_enc))

# Evaluate LSTM Model (evaluate returns [loss, accuracy])
lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test_enc)
print(f"LSTM Accuracy: {lstm_accuracy[1]}")


In [None]:
from tensorflow.keras.layers import GRU

# Keras needs numeric targets, but y_train / y_test are the untouched 'sentiment'
# column from the CSV. Map each class label to an integer id in sorted label order
# (assuming the column holds the strings 'negative'/'positive', that gives 0 and 1;
# a column that is already 0/1 is left as-is).
label_ids = {label: idx for idx, label in enumerate(sorted(y_train.unique()))}
y_train_enc = y_train.map(label_ids).to_numpy(dtype="float32")
y_test_enc = y_test.map(label_ids).to_numpy(dtype="float32")

# GRU Model: same layout as the LSTM network, with a GRU recurrent layer
gru_model = Sequential([
    Embedding(5000, 128, input_length=200),
    GRU(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE: validation_data is the test split, so the per-epoch val_* figures are
# computed on the same data as the final evaluation below.
gru_model.fit(X_train_pad, y_train_enc, epochs=3, batch_size=64, validation_data=(X_test_pad, y_test_enc))

# Evaluate GRU Model (evaluate returns [loss, accuracy])
gru_accuracy = gru_model.evaluate(X_test_pad, y_test_enc)
print(f"GRU Accuracy: {gru_accuracy[1]}")


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Tokenizer and classifier are both loaded from the same pre-trained checkpoint;
# the classification head is sized for two labels.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer_bert = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model_bert = TFBertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)

# Tokenize and encode the data for BERT
def encode_bert(texts):
    """Tokenize a collection of texts for BERT.

    Args:
        texts: Object exposing ``.tolist()`` (the notebook passes the
            train/test review splits).

    Returns:
        The output of ``tokenizer_bert`` as TensorFlow tensors, padded and
        truncated to at most 512 tokens.
    """
    sentences = texts.tolist()
    encoded = tokenizer_bert(
        sentences,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='tf',
    )
    return encoded

from tensorflow.keras.losses import SparseCategoricalCrossentropy

# NOTE: X_train / X_test hold the already cleaned reviews (lower-cased, stop words
# removed, lemmatized), so that is the text BERT is fine-tuned on here.
X_train_bert = encode_bert(X_train)
X_test_bert = encode_bert(X_test)

# The sparse cross-entropy loss needs integer class ids, but y_train / y_test are
# the untouched 'sentiment' column from the CSV. Map each class label to an id in
# sorted label order (assuming the strings 'negative'/'positive', that gives 0 and 1;
# a column that is already 0/1 is left as-is).
label_ids = {label: idx for idx, label in enumerate(sorted(y_train.unique()))}
y_train_bert = y_train.map(label_ids).to_numpy(dtype="int32")
y_test_bert = y_test.map(label_ids).to_numpy(dtype="int32")

# Compile the BERT model.
# The classification head outputs raw logits, not probabilities, so the loss must be
# told to apply the softmax itself (from_logits=True); the plain string loss
# 'sparse_categorical_crossentropy' would interpret the logits as probabilities.
model_bert.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Train BERT Model.
# The tokenizer output is unwrapped into plain dicts of tensors before being handed
# to Keras. validation_data is the test split, so the per-epoch val_* figures are
# computed on the same data as the final evaluation below.
model_bert.fit(
    dict(X_train_bert),
    y_train_bert,
    epochs=3,
    batch_size=8,
    validation_data=(dict(X_test_bert), y_test_bert),
)

# Evaluate BERT Model (evaluate returns [loss, accuracy])
bert_accuracy = model_bert.evaluate(dict(X_test_bert), y_test_bert)
print(f"BERT Accuracy: {bert_accuracy[1]}")
