In [None]:
# Install via the %pip magic so the package lands in the running kernel's environment.
%pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, chi2


# Locations / identifiers used by this cell.
DATA_PATH = '/content/English.csv'
BERT_CHECKPOINT = 'bert-base-uncased'

# Read the CSV (decoded as latin-1) into a DataFrame.
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Training targets come from the 'label' column of the training split.
y_train = train_df['label']

# Pre-trained BERT tokenizer and encoder, both loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for a single text.

    The text is encoded with the module-level ``tokenizer`` (special tokens
    included), run through the module-level ``bert_model`` with gradient
    tracking disabled, and the first model output (the last hidden state) is
    averaged over the token axis.

    Texts that tokenize to more positions than the encoder supports are
    truncated to ``bert_model.config.max_position_embeddings``; without
    truncation such inputs fail inside the model instead of being embedded.

    Parameters
    ----------
    text : str
        The text to embed.

    Returns
    -------
    numpy.ndarray
        Array with one row (a batch of one) holding the averaged hidden
        state, suitable for stacking with ``np.vstack``.
    """
    # Longest sequence the encoder has position embeddings for.
    max_len = bert_model.config.max_position_embeddings
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_len,
    )
    # Wrap in a list so the model receives a batch containing one sequence.
    batch = torch.tensor([token_ids])
    with torch.no_grad():
        last_hidden_state = bert_model(batch)[0]
    # Average over the token axis (dim=1) to get one vector per text.
    return last_hidden_state.mean(dim=1).numpy()

# Get the BERT embeddings for train and test sets.
# NOTE(review): nothing further down in this cell consumes these embeddings
# (the ensemble is trained on TF-IDF features only). They are kept so the names
# remain available to other cells -- confirm whether they are still needed,
# since computing them is the most expensive step here.
train_embeddings = np.vstack(train_df['tweet'].apply(get_bert_embeddings))
test_embeddings = np.vstack(test_df['tweet'].apply(get_bert_embeddings))

# TF-IDF features: the vectorizer is fitted on the training tweets only and then
# applied unchanged to the test tweets.
tfidf_vectorizer = TfidfVectorizer()
train_tfidf_sparse = tfidf_vectorizer.fit_transform(train_df['tweet'])
test_tfidf_sparse = tfidf_vectorizer.transform(test_df['tweet'])

# Chi-square feature selection: keep at most 500 features (fewer if the
# vocabulary is smaller), chosen using the training labels only.
selector = SelectKBest(chi2, k=min(500, train_tfidf_sparse.shape[1]))

# Select and densify in one place. Dense arrays are presumably needed by
# estimators in the ensemble that do not accept sparse input -- TODO confirm.
X_train_tfidf = selector.fit_transform(train_tfidf_sparse, y_train).toarray()
X_test_tfidf = selector.transform(test_tfidf_sparse).toarray()

# Individual classifiers that vote in the ensemble.
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
nb = MultinomialNB()
ridge = RidgeClassifier()
# Seeded (like AdaBoost above) so repeated runs train the same model.
gb = GradientBoostingClassifier(random_state=0)
lda = LinearDiscriminantAnalysis()

# (name, estimator) pairs in the form VotingClassifier expects.
classifiers = [('ada', ada), ('nb', nb), ('ridge', ridge), ('gb', gb), ('lda', lda)]

# Hard voting: the ensemble is built with voting='hard'.
ensemble = VotingClassifier(classifiers, voting='hard')

# Fit the ensemble on the selected training features.
ensemble.fit(X_train_tfidf, y_train)

# Predict the held-out split.
y_test = test_df['label']
y_pred_ensemble = ensemble.predict(X_test_tfidf)

# Report the performance metrics of the ensemble model.
print("Ensemble Model")
print("Accuracy:", accuracy_score(y_test, y_pred_ensemble))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    # Each of these is computed with average='weighted'.
    print(metric_label, metric_fn(y_test, y_pred_ensemble, average='weighted'))


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Ensemble Model
Accuracy: 0.9518223056468011
Precision: 0.9493083626635272
Recall: 0.9518223056468011
F1-Score: 0.9423294468956218


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score, cohen_kappa_score, matthews_corrcoef

In [None]:
# Cohen's kappa and Matthews correlation coefficient between the true test
# labels and the ensemble's predictions.
for metric_label, metric_fn in (("Kappa:", cohen_kappa_score), ("MCC:", matthews_corrcoef)):
    print(metric_label, metric_fn(y_test, y_pred_ensemble))

Kappa: 0.5009564997886272
MCC: 0.5561235344630221


In [None]:
# NOTE(review): y_pred_ensemble holds the class labels returned by
# ensemble.predict (the ensemble uses voting='hard'), not scores or
# probabilities, so this AUC reflects a single operating point rather than a
# full ranking -- confirm that this is the intended metric.
auc_value = roc_auc_score(y_test, y_pred_ensemble)
print("AUC: ", round(auc_value, 3))


AUC:  0.683
