In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('restaurant.csv')

# Prepare the inputs and labels
texts = df['review'].values
labels = df['label'].values

# Load the RoBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

# Extract features using RoBERT
inputs = np.zeros((len(texts), 768))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Split the data into train and test sets
train_inputs = inputs[:int(0.8 * len(inputs))]
train_labels = labels[:int(0.8 * len(labels))]
test_inputs = inputs[int(0.8 * len(inputs)):]
test_labels = labels[int(0.8 * len(labels)):]

# Train a Gradient Boosting Classifier
clf = GradientBoostingClassifier()
clf.fit(train_inputs, train_labels)

# Make predictions on the test set
predictions = clf.predict(test_inputs)

# Evaluate the model using accuracy
acc = accuracy_score(test_labels, predictions)
print('Accuracy:', acc)


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf

# Load the data
df = pd.read_csv('restaurant.csv')

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['label'], test_size=0.2, random_state=42)

# Split the training set into training and validation sets for hyperparameter tuning
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Load the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

# Tokenize the text data
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')
X_val_tokens = tokenizer(X_val.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

# Extract RoBERTa features from the text data
X_train_features = roberta_model(X_train_tokens)['pooler_output']
X_val_features = roberta_model(X_val_tokens)['pooler_output']
X_test_features = roberta_model(X_test_tokens)['pooler_output']

# Tune hyperparameters using k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

svm_params = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']}
svm_grid = GridSearchCV(SVC(probability=True), svm_params, scoring='accuracy', cv=kfold)
svm_grid.fit(X_train_features, y_train)

ada_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 1, 10]}
ada_grid = GridSearchCV(AdaBoostClassifier(), ada_params, scoring='accuracy', cv=kfold)
ada_grid.fit(X_train_features, y_train)

rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, scoring='accuracy', cv=kfold)
rf_grid.fit(X_train_features, y_train)

# Build the voting ensemble classifier with the tuned hyperparameters
svm_best = svm_grid.best_estimator_
ada_best = ada_grid.best_estimator_
rf_best = rf_grid.best_estimator_

ensemble = VotingClassifier(estimators=[('svm', svm_best), ('ada', ada_best), ('rf', rf_best)], voting='soft')
ensemble.fit(X_train_features, y_train)

# Evaluate the final model on the test set
test_preds = ensemble.predict(X_test_features)
test_acc = accuracy_score(y_test, test_preds)
print("Test accuracy:", test_acc)


2023-02-21 21:43:30.306036: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-21 21:43:31.144925: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-21 21:43:31.145018: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-21 21:43:33.103672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

Downloading (…)"tf_model.h5";:   0%|          | 0.00/657M [00:00<?, ?B/s]

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# Load the dataset
df = pd.read_csv('dataset.csv')

# Prepare the inputs and labels
texts = df['text'].values
labels = df['label'].values

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

# Extract features using RoBERTa
inputs = np.zeros((len(texts), 768))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Divide the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)
kf = KFold(n_splits=5, random_state=42)

# Define the classifiers
svc = SVC(kernel='linear')
ada = AdaBoostClassifier(n_estimators=200)
rf = RandomForestClassifier(n_estimators=200)

# Define the voting ensemble classifier
clf = VotingClassifier(estimators=[('svc', svc), ('ada', ada), ('rf', rf)], voting='hard')

# Define the hyperparameters to be tuned
params = {'svc__C': [0.1, 1, 10],
          'ada__n_estimators': [100, 200],
          'rf__max_depth': [10, 20],
          'rf__max_features': ['sqrt', 'log2']}

# Tune the hyperparameters using GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=kf, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Make predictions on the test set
predictions = grid_search.predict(X_test)

# Evaluate the model using accuracy
acc = accuracy_score(y_test, predictions)
print('Accuracy:', acc)


2023-02-21 22:00:00.450754: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-21 22:00:00.596970: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-21 22:00:00.596990: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-21 22:00:01.527499: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

NameError: name 'torch' is not defined

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
import numpy as np



# Load the dataset
df = pd.read_csv("restaurant.csv")

# Split the data into training, validation, and test sets using k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_val_idx, test_idx = next(kf.split(df))
train_idx, val_idx = next(kf.split(df.iloc[train_val_idx]))

# Split the data into features and labels
X_train_val, y_train_val = df['review'].iloc[train_val_idx], df['label'].iloc[train_val_idx]
X_train, y_train = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
X_val, y_val = X_train_val.iloc[val_idx], y_train_val.iloc[val_idx]
X_test, y_test = df['review'].iloc[test_idx], df['label'].iloc[test_idx]

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Extract features from the data using the RoBERTa model
X_train_features = []
for sentence in tqdm(X_train):
    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0].squeeze().detach().numpy()
    X_train_features.append(last_hidden_states)
X_train_features = np.array(X_train_features)

X_val_features = []
for sentence in tqdm(X_val):
    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0].squeeze().detach().numpy()
    X_val_features.append(last_hidden_states)
X_val_features = np.array(X_val_features)

X_test_features = []
for sentence in tqdm(X_test):
    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0].squeeze().detach().numpy()
    X_test_features.append(last_hidden_states)
X_test_features = np.array(X_test_features)

# Initialize the classifiers
svm = SVC(kernel='linear', probability=True)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Combine the classifiers into a voting ensemble
clf = VotingClassifier(estimators=[('svm', svm), ('ada', ada), ('rf', rf)], voting='soft')

# Fit the classifier to the training data
clf.fit(X_train_features, y_train)

# Evaluate the classifier on the validation and test sets
val_acc = clf.score(X_val_features, y_val)
test_acc = clf.score(X_test_features, y_test)

print(f"Validation accuracy: {val_acc}")
print(f"Test accuracy: {test_acc}")

# Accuracy 94....

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
import torch

# Load the dataset
df = pd.read_csv('restaurant.csv')

# Prepare the inputs and labels
texts = df['review'].values
labels = df['label'].values

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

# Extract features using RoBERTa
inputs = np.zeros((len(texts), 768))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# Define the ensemble classifiers
svm = SVC(kernel='linear', C=1, probability=True, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Define the voting ensemble classifier
voting_clf = VotingClassifier(estimators=[('svm', svm), ('ada', ada), ('rf', rf)], voting='soft')

# Train the voting ensemble classifier
voting_clf.fit(X_train, y_train)

# Make predictions on the test set
predictions = voting_clf.predict(X_test)

# Evaluate the model using accuracy
acc = accuracy_score(y_test, predictions)
print('Accuracy:', acc)


# RoBERT Large

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold
import torch

# Load the dataset
df = pd.read_csv('GrammarandProductReviews[modified].csv')

# Prepare the inputs and labels
texts = df['review'].values
labels = df['positive_review'].values

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModel.from_pretrained("roberta-large")

# Extract features using RoBERTa
inputs = np.zeros((len(texts), 1024))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# Define the ensemble classifiers
svm = SVC(kernel='linear', C=1, probability=True, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Define the voting ensemble classifier
voting_clf = VotingClassifier(estimators=[('svm', svm), ('ada', ada), ('rf', rf)], voting='soft')

# Train the voting ensemble classifier
voting_clf.fit(X_train, y_train)

# Make predictions on the test set
predictions = voting_clf.predict(X_test)

# Evaluate the model using accuracy
acc = accuracy_score(y_test, predictions)
print('Accuracy:', acc)


# Looks perfect with 95...accuracy

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import torch
import time
import tracemalloc

# get the start time
st = time.time()
tracemalloc.start()

# Load the dataset
df = pd.read_csv('restaurant.csv')

# Prepare the inputs and labels
texts = df['review'].values
labels = df['label'].values

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModel.from_pretrained("roberta-large")

# Extract features using RoBERTa
inputs = np.zeros((len(texts), 1024))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Split the data into train, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# Define the k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the classifiers for the voting ensemble
svc = SVC(kernel='linear', C=1.0, probability=True)
ada = AdaBoostClassifier(n_estimators=100)
rf = RandomForestClassifier(n_estimators=100)

# Define the voting ensemble classifier
voting_clf = VotingClassifier(estimators=[('svc', svc), ('ada', ada), ('rf', rf)], voting='soft')

# Perform the k-fold cross-validation to tune hyperparameters
for train_index, val_index in kf.split(X_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]
    y_train, y_val = y_train_val[train_index], y_train_val[val_index]
    
    # Train the voting ensemble classifier
    voting_clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the validation set
    val_pred = voting_clf.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print("Validation accuracy:", val_acc)
    
    # Reset the classifiers
    svc = SVC(kernel='linear', C=1.0, probability=True)
    ada = AdaBoostClassifier(n_estimators=100)
    rf = RandomForestClassifier(n_estimators=100)
    voting_clf = VotingClassifier(estimators=[('svc', svc), ('ada', ada), ('rf', rf)], voting='soft')

# Evaluate the voting ensemble classifier on the test set
voting_clf.fit(X_train_val, y_train_val)
test_pred = voting_clf.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)
print("Test accuracy:", test_acc)


# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")


# Bagged (LR=92.5)

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import torch
import time
import tracemalloc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# get the start time
st = time.time()
tracemalloc.start()


# Load the dataset
df = pd.read_csv('GrammarandProductReviews[modified].csv')

# Prepare the inputs and labels
texts = df['review'].values
labels = df['positive_review'].values

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModel.from_pretrained("roberta-large")

# Extract features using RoBERTa
inputs = np.zeros((len(texts), 1024))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# Define the ensemble classifiers
svm = SVC(kernel='linear', C=1, probability=True, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
LogReg_clf = LogisticRegression(max_iter=1000)
DTree_clf = DecisionTreeClassifier()


#Bagging Ensemble Method
logreg_bagging_model = BaggingClassifier(base_estimator=LogReg_clf, n_estimators=100, random_state=142)
dtree_bagging_model = BaggingClassifier(base_estimator=DTree_clf, n_estimators=100, random_state=142)
#random_forest = BaggingClassifier(base_estimator=rf, n_estimators=50, random_state=100)
#extra_trees = BaggingClassifier(base_estimator=ETree, n_estimators=50, random_state=100)

def bagging_ensemble(bag):
    k_folds = KFold(n_splits=15)
    results = cross_val_score(bag, X_train, y_train, cv=k_folds)
    print(results.mean())


bagging_ensemble(logreg_bagging_model)
bagging_ensemble(dtree_bagging_model)
#bagging_ensemble(random_forest)
#bagging_ensemble(extra_trees)


# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")


# RoBERT Large with product review and voting and bagging

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, KFold,  cross_val_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import torch
import time
import tracemalloc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# get the start time
st = time.time()
tracemalloc.start()

# Load the dataset
df = pd.read_csv('GrammarandProductReviews[modified].csv')

# Prepare the inputs and labels
texts = df['review'].values
labels = df['positive_review'].values

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModel.from_pretrained("roberta-large")

# Extract features using RoBERTa
inputs = np.zeros((len(texts), 1024))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Split the data into train, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)

# Define the k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the classifiers for the voting ensemble
svc = SVC(kernel='linear', C=1.0, probability=True)
ada = AdaBoostClassifier(n_estimators=100)
rf = RandomForestClassifier(n_estimators=100)
LogReg_clf = LogisticRegression(max_iter=1000)

# Define the voting ensemble classifier
voting_clf = VotingClassifier(estimators=[('svc', svc), ('ada', ada), ('rf', rf)], voting='soft')

# Perform the k-fold cross-validation to tune hyperparameters
for train_index, val_index in kf.split(X_train_val):
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]
    y_train, y_val = y_train_val[train_index], y_train_val[val_index]
    
    # Train the voting ensemble classifier
    voting_clf.fit(X_train, y_train)
    
    # Evaluate the classifier on the validation set
    val_pred = voting_clf.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print("Validation accuracy:", val_acc)
    
    # Reset the classifiers
    svc = SVC(kernel='linear', C=1.0, probability=True)
    ada = AdaBoostClassifier(n_estimators=100)
    rf = RandomForestClassifier(n_estimators=100)
    voting_clf = VotingClassifier(estimators=[('svc', svc), ('ada', ada), ('rf', rf)], voting='soft')

# Evaluate the voting ensemble classifier on the test set
voting_clf.fit(X_train_val, y_train_val)
test_pred = voting_clf.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)
print("Test accuracy for Voting:", test_acc)


#Bagging Ensemble Method
logreg_bagging_model = BaggingClassifier(base_estimator=LogReg_clf, n_estimators=100, random_state=142)
#dtree_bagging_model = BaggingClassifier(base_estimator=DTree_clf, n_estimators=100, random_state=142)
#random_forest = BaggingClassifier(base_estimator=rf, n_estimators=50, random_state=100)
#extra_trees = BaggingClassifier(base_estimator=ETree, n_estimators=50, random_state=100)

def bagging_ensemble(bag):
    k_folds = KFold(n_splits=20)
    results = cross_val_score(bag, X_train_val, y_train_val, cv=k_folds)
    print(results.mean())


bagging_ensemble(logreg_bagging_model)



# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")
