In [None]:

!pip install -q imbalanced-learn xgboost catboost lightgbm tensorflow nltk optuna nlpaug

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

import time
time.sleep(5)
import pandas as pd
import numpy as np
import string
import re
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
import lightgbm as lgb
import optuna

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nlpaug.augmenter.word as naw

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    tokens = text.split()
    stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
    tokens = [w for w in tokens if w not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

df = pd.read_csv('/content/amazon.csv')  # Update path as needed
df['clean_text'] = df['Text'].astype(str).apply(preprocess_text)

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

try:
    aug = naw.SynonymAug(aug_src='wordnet')
    aug_texts = [aug.augment(text) for text in X_train]
    aug_labels = list(y_train)
    X_train_aug = pd.Series(list(X_train) + aug_texts)
    y_train_aug = pd.Series(list(y_train) + aug_labels)
except LookupError as e:
    print("NLTK resource missing, skipping augmentation.")
    X_train_aug = X_train
    y_train_aug = y_train

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train_aug)
X_test_vec = vectorizer.transform(X_test)

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_vec, y_train_aug)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 6], 'learning_rate': [0.05, 0.1]}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring='accuracy', verbose=0)
xgb_grid.fit(X_train_res, y_train_res)
xgb_best = xgb_grid.best_estimator_

cat = CatBoostClassifier(verbose=0, random_state=42)
cat_params = {'depth': [4, 6], 'learning_rate': [0.03, 0.1], 'iterations': [100, 200]}
cat_grid = GridSearchCV(cat, cat_params, cv=3, scoring='accuracy', verbose=0)
cat_grid.fit(X_train_res, y_train_res)
cat_best = cat_grid.best_estimator_

svm = SVC(probability=True, random_state=42)
svm_params = {'C': [1, 10], 'kernel': ['linear', 'rbf']}
svm_grid = GridSearchCV(svm, svm_params, cv=3, scoring='accuracy', verbose=0)
svm_grid.fit(X_train_res, y_train_res)
svm_best = svm_grid.best_estimator_

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7)
    }
    clf = lgb.LGBMClassifier(**params)
    score = cross_val_score(clf, X_train_res, y_train_res, cv=3, scoring='accuracy').mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)
lgb_best = lgb.LGBMClassifier(**study.best_params)
lgb_best.fit(X_train_res, y_train_res)

voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_best), ('cat', cat_best), ('svm', svm_best), ('lgb', lgb_best)],
    voting='soft'
)
voting_clf.fit(X_train_res, y_train_res)
voting_acc = accuracy_score(y_test, voting_clf.predict(X_test_vec))

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=200)
X_test_pad = pad_sequences(X_test_seq, maxlen=200)

input_layer = Input(shape=(200,))
embedding = Embedding(input_dim=10000, output_dim=128, input_length=200)(input_layer)

convs = []
for size in [3, 4, 5]:
    conv = Conv1D(128, kernel_size=size, activation='relu')(embedding)
    pool = GlobalMaxPooling1D()(conv)
    convs.append(pool)

merged = Concatenate()(convs)
drop = Dropout(0.5)(merged)
dense = Dense(64, activation='relu')(drop)
output = Dense(1, activation='sigmoid')(dense)

cnn_model = Model(inputs=input_layer, outputs=output)
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_pad, y_train_enc, epochs=5, batch_size=64, validation_split=0.1, verbose=1)
cnn_acc = cnn_model.evaluate(X_test_pad, y_test_enc, verbose=0)[1]

voting_probs = voting_clf.predict_proba(X_test_vec)[:, 1]
cnn_probs = cnn_model.predict(X_test_pad).reshape(-1)
meta_features = np.vstack((voting_probs, cnn_probs)).T
meta_clf = LogisticRegression()
meta_clf.fit(meta_features, y_test_enc)
stack_preds = meta_clf.predict(meta_features)
stack_acc = accuracy_score(y_test_enc, stack_preds)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_best.predict(X_test_vec)))
print("CatBoost Accuracy:", accuracy_score(y_test, cat_best.predict(X_test_vec)))
print("SVM Accuracy:", accuracy_score(y_test, svm_best.predict(X_test_vec)))
print("LightGBM Accuracy:", accuracy_score(y_test, lgb_best.predict(X_test_vec)))
print("Voting Ensemble Accuracy:", voting_acc)
print("CNN Accuracy:", cnn_acc)
print("Stacked Model Accuracy:", stack_acc)

joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(voting_clf, 'voting_model.pkl')
joblib.dump(lgb_best, 'lightgbm_model.pkl')
cnn_model.save('cnn_model.h5')
joblib.dump(meta_clf, 'stacked_meta_model.pkl')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


NLTK resource missing, skipping augmentation.


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Epoch 1/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 384ms/step - accuracy: 0.7886 - loss: 0.4496 - val_accuracy: 0.9025 - val_loss: 0.2386
Epoch 2/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 382ms/step - accuracy: 0.9312 - loss: 0.1777 - val_accuracy: 0.8969 - val_loss: 0.2396
Epoch 3/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 389ms/step - accuracy: 0.9671 - loss: 0.0974 - val_accuracy: 0.8969 - val_loss: 0.2770
Epoch 4/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 375ms/step - accuracy: 0.9874 - loss: 0.0448 - val_accuracy: 0.9019 - val_loss: 0.3098
Epoch 5/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 379ms/step - accuracy: 0.9928 - loss: 0.0255 - val_accuracy: 0.8981 - val_loss: 0.4099




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step
XGBoost Accuracy: 0.877
CatBoost Accuracy: 0.884
SVM Accuracy: 0.88725
LightGBM Accuracy: 0.88275
Voting Ensemble Accuracy: 0.90025
CNN Accuracy: 0.9042500257492065
Stacked Model Accuracy: 0.9115




['stacked_meta_model.pkl']