In [None]:
import pandas as pd
import numpy as np
import joblib
import nltk
import string
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from tensorflow.keras.optimizers import Adam

# Download stopwords
nltk.download('stopwords')

# Load Amazon, Yelp, IMDB Datasets
print("Loading datasets...")

def load_data(filepath, text_column, label_column):
    df = pd.read_excel(filepath, engine='openpyxl')
    df = df[[text_column, label_column]].dropna()
    df.columns = ['Feedback', 'Label']
    return df

amazon_df = load_data("data/amazon_reviews_labelled.csv", 'reviewText', 'label')
yelp_df = load_data("data/Yelp Dataset Reduced.csv", 'text', 'label')
imdb_df = load_data("data/amazon_reviews_training.csv", 'review', 'label')

# Combine datasets
df = pd.concat([amazon_df, yelp_df, imdb_df], ignore_index=True)

# Convert labels to binary (Real=1, Fake=0)
df['Label'] = df['Label'].map({'Real': 1, 'Fake': 0})

# Preprocessing Function
def preprocess_text(text):
    text = str(text).lower()
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

print("Preprocessing text data...")
df['Feedback'] = df['Feedback'].apply(preprocess_text)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['Feedback'], df['Label'], test_size=0.2, random_state=42)

# Tokenization
MAX_VOCAB = 10000  # Large vocabulary size
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# Save tokenizer
joblib.dump(tokenizer, 'models/tokenizer.pkl')

# Define LSTM Model
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=128, input_length=MAX_LEN),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile Model
model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

# Train Model
print("Training BiLSTM model on large dataset...")
model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_test_seq, y_test))

# Save Model
model.save('models/feedback_model_large.h5')

print("LSTM model trained and saved successfully!")

In [1]:
import pandas as pd
import numpy as np
import joblib
import nltk
import string
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from tensorflow.keras.optimizers import Adam




In [2]:
# Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Load Amazon, Yelp, IMDB Datasets
print("Loading datasets...")

def load_data(filepath, text_column, label_column):
    df = pd.read_excel(filepath, engine='openpyxl')
    df = df[[text_column, label_column]].dropna()
    df.columns = ['Feedback', 'Label']
    return df

amazon_df = load_data("data/amazon_reviews_labelled.csv", 'reviewText', 'label')
yelp_df = load_data("data/Yelp Dataset Reduced.csv", 'text', 'label')
imdb_df = load_data("data/amazon_reviews_training.csv", 'review', 'label')

Loading datasets...


BadZipFile: File is not a zip file

In [4]:
# Combine datasets
df = pd.concat([amazon_df, yelp_df, imdb_df], ignore_index=True)

NameError: name 'amazon_df' is not defined