**Text Classification Report**

In [1]:
# Necessary library installations
!pip install -q imbalanced-learn
!pip install -q keybert transformers

# Step 1: Data Loading and Preparation
# Import necessary libraries
import pandas as pd

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Load data from CSV files
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [3]:
# Combine features and labels into DataFrames
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)


In [None]:
df_train.head()

In [4]:
# Step 2: TF-IDF Model Construction
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report


In [5]:
# Initialize TfidfVectorizer
tfidf = TfidfVectorizer()


In [6]:
# Transform the training and test data
X_train_tfidf = tfidf.fit_transform(df_train['text'])
X_test_tfidf = tfidf.transform(df_test['text'])


In [7]:
# Train a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, df_train['label'])


In [8]:
# Predict on the test data
y_pred = clf.predict(X_test_tfidf)

In [9]:
# Step 3: Evaluation with Confusion Matrix
# Confusion matrix and classification report
conf_matrix = confusion_matrix(df_test['label'], y_pred)
class_report = classification_report(df_test['label'], y_pred)


In [10]:
print(conf_matrix)
print(class_report)


[[965   0]
 [145   5]]
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       965
           1       1.00      0.03      0.06       150

    accuracy                           0.87      1115
   macro avg       0.93      0.52      0.50      1115
weighted avg       0.89      0.87      0.81      1115



In [11]:
# Step 4: Handling Class Imbalance with SMOTE
# Import necessary libraries
from imblearn.over_sampling import SMOTE


In [12]:
# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, df_train['label'])


In [13]:
# Train a Naive Bayes classifier with resampled data
clf_resampled = MultinomialNB()
clf_resampled.fit(X_train_resampled, y_train_resampled)


In [14]:
# Predict on the test data
y_pred_resampled = clf_resampled.predict(X_test_tfidf)


In [15]:
# Confusion matrix and classification report
conf_matrix_resampled = confusion_matrix(df_test['label'], y_pred_resampled)
class_report_resampled = classification_report(df_test['label'], y_pred_resampled)


In [16]:
print(conf_matrix_resampled)
print(class_report_resampled)


[[914  51]
 [ 15 135]]
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       965
           1       0.73      0.90      0.80       150

    accuracy                           0.94      1115
   macro avg       0.85      0.92      0.88      1115
weighted avg       0.95      0.94      0.94      1115



In [17]:
# Step 5: Integrating Keyword Extraction with KeyBERT
# Import necessary libraries
from keybert import KeyBERT

  from tqdm.autonotebook import tqdm, trange


In [18]:
# Initialize KeyBERT model
kw_model = KeyBERT(model='all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
# Extract keywords
keywords = kw_model.extract_keywords(df_train['text'].tolist(), keyphrase_ngram_range=(1, 2), stop_words='english', top_n=10)


In [20]:
# Flatten the list of keywords and get unique keywords
unique_keywords = set([keyword for sublist in keywords for keyword, _ in sublist])


In [21]:
# Update the tf-idf model's vocabulary
tfidf_vocabulary = set(tfidf.get_feature_names_out())
new_keywords = unique_keywords - tfidf_vocabulary


In [22]:
# Update the tf-idf vectorizer with new keywords
tfidf_updated = TfidfVectorizer(vocabulary=tfidf_vocabulary.union(new_keywords))


In [23]:
# Transform the training data with the updated vectorizer
X_train_tfidf_updated = tfidf_updated.fit_transform(df_train['text'])


In [24]:
# Apply SMOTE to balance the data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf_updated, df_train['label'])


In [25]:
# Transform the test data using the updated vectorizer
X_test_tfidf_updated = tfidf_updated.transform(df_test['text'])


In [26]:
# Train a Naive Bayes classifier with resampled data
clf_updated = MultinomialNB()
clf_updated.fit(X_train_resampled, y_train_resampled)


In [27]:
# Predict on the test data
y_pred_updated = clf_updated.predict(X_test_tfidf_updated)


In [28]:
# Confusion matrix and classification report
conf_matrix_updated = confusion_matrix(df_test['label'], y_pred_updated)
class_report_updated = classification_report(df_test['label'], y_pred_updated)


In [29]:
print(conf_matrix_updated)
print(class_report_updated)


[[903  62]
 [ 11 139]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       965
           1       0.69      0.93      0.79       150

    accuracy                           0.93      1115
   macro avg       0.84      0.93      0.88      1115
weighted avg       0.95      0.93      0.94      1115



In [30]:
# Step 6: Integrating Keyword Extraction with Multiple Models
# Keyword Extraction: Use BERT, RoBERTa, DistilBERT with KeyBERT for extraction.
# TF-IDF Update: Update vocabulary with extracted keywords.
# SMOTE for Balance: Apply SMOTE to handle class imbalances.
# Model Training: Train Naive Bayes on updated TF-IDF data.
# Evaluation: Assess impact on classification metrics.
from keybert import KeyBERT


In [31]:
# Initialize different KeyBERT models
models = ['distilbert-base-nli-mean-tokens', 'bert-base-uncased', 'roberta-base']


In [32]:
# Extract keywords for each model
keywords_dict = {}
for model_name in models:
    kw_model = KeyBERT(model=model_name)
    keywords = kw_model.extract_keywords(df_train['text'].tolist(), keyphrase_ngram_range=(1, 2), stop_words='english', top_n=10)
    keywords_dict[model_name] = set([keyword for sublist in keywords for keyword, _ in sublist])


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [33]:
# Find unique keywords across all models
combined_keywords = set.union(*keywords_dict.values())
from sklearn.feature_extraction.text import TfidfVectorizer


In [34]:
# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer()


In [35]:
# Fit TF-IDF on the original text data
X_train_tfidf = tfidf.fit_transform(df_train['text'])


In [36]:
# Get existing TF-IDF vocabulary
tfidf_vocabulary = set(tfidf.get_feature_names_out())


In [37]:
# Update TF-IDF vocabulary with combined keywords
new_keywords = combined_keywords - tfidf_vocabulary
tfidf_updated = TfidfVectorizer(vocabulary=tfidf_vocabulary.union(new_keywords))


In [38]:
# Transform the training data with updated TF-IDF vectorizer
X_train_tfidf_updated = tfidf_updated.fit_transform(df_train['text'])


In [39]:
from imblearn.over_sampling import SMOTE


In [40]:
# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf_updated, df_train['label'])
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report


In [41]:
# Train Naive Bayes classifier on original TF-IDF data
clf_original = MultinomialNB()
clf_original.fit(X_train_tfidf, df_train['label'])


In [42]:
# Predict on test data
X_test_tfidf = tfidf.transform(df_test['text'])
y_pred_original = clf_original.predict(X_test_tfidf)


In [43]:
# Evaluate original model
conf_matrix_original = confusion_matrix(df_test['label'], y_pred_original)
class_report_original = classification_report(df_test['label'], y_pred_original)


In [44]:
# Train Naive Bayes classifier on updated TF-IDF data with SMOTE
clf_updated = MultinomialNB()
clf_updated.fit(X_train_resampled, y_train_resampled)


In [45]:
# Transform test data with updated TF-IDF vectorizer
X_test_tfidf_updated = tfidf_updated.transform(df_test['text'])
y_pred_updated = clf_updated.predict(X_test_tfidf_updated)


In [46]:
# Evaluate updated model
conf_matrix_updated = confusion_matrix(df_test['label'], y_pred_updated)
class_report_updated = classification_report(df_test['label'], y_pred_updated)


In [47]:
# Print results
print("Original TF-IDF Model:")
print("Confusion Matrix:\n", conf_matrix_original)
print("Classification Report:\n", class_report_original)

print("\nUpdated TF-IDF Model with Transformer Keywords and SMOTE:")
print("Confusion Matrix:\n", conf_matrix_updated)
print("Classification Report:\n", class_report_updated)

Original TF-IDF Model:
Confusion Matrix:
 [[965   0]
 [145   5]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       965
           1       1.00      0.03      0.06       150

    accuracy                           0.87      1115
   macro avg       0.93      0.52      0.50      1115
weighted avg       0.89      0.87      0.81      1115


Updated TF-IDF Model with Transformer Keywords and SMOTE:
Confusion Matrix:
 [[901  64]
 [ 10 140]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.93      0.96       965
           1       0.69      0.93      0.79       150

    accuracy                           0.93      1115
   macro avg       0.84      0.93      0.88      1115
weighted avg       0.95      0.93      0.94      1115



In [48]:
import joblib

# Replace 'model.pkl' with your desired filename
joblib.dump(clf_updated, 'model.pkl')


['model.pkl']

In [49]:
import os
print(os.getcwd())

/content


In [50]:
# Save the tfidf vectorizer
joblib.dump(tfidf_updated, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [51]:
import os
print(os.getcwd())

/content
