<a href="https://colab.research.google.com/github/imvignesh003/SocialMediaChannalizationML/blob/main/SocialTransferLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

In [19]:
!kaggle datasets download -d moazeldsokyx/bbc-news
!pip install datasets
! unzip "bbc-news.zip"

Dataset URL: https://www.kaggle.com/datasets/moazeldsokyx/bbc-news
License(s): unknown
bbc-news.zip: Skipping, found more recently modified local copy (use --force to force download)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.wh

In [3]:
import pandas as pd
import numpy as np
import re
import string
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
df = pd.read_csv('/content/bbc-text.csv')

In [6]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [7]:
categories = df['category'].unique()
categories

array(['tech', 'business', 'sport', 'entertainment', 'politics'],
      dtype=object)

In [8]:
category_map = {
    'tech': 'productivity',
    'business': 'productivity',
    'politics': 'productivity',
    'entertainment': 'entertainment',
    'sport': 'entertainment'
}
df['label'] = df['category'].map(category_map)

In [9]:
df.drop(columns=['category'], inplace=True)

In [10]:
df.head()

Unnamed: 0,text,label
0,tv future in the hands of viewers with home th...,productivity
1,worldcom boss left books alone former worldc...,productivity
2,tigers wary of farrell gamble leicester say ...,entertainment
3,yeading face newcastle in fa cup premiership s...,entertainment
4,ocean s twelve raids box office ocean s twelve...,entertainment


In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['cleaned_text'] = df['text'].apply(preprocess_text)

In [12]:
df.head()

Unnamed: 0,text,label,cleaned_text
0,tv future in the hands of viewers with home th...,productivity,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...,productivity,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...,entertainment,tigers wary of farrell gamble leicester say ...
3,yeading face newcastle in fa cup premiership s...,entertainment,yeading face newcastle in fa cup premiership s...
4,ocean s twelve raids box office ocean s twelve...,entertainment,ocean s twelve raids box office ocean s twelve...


In [13]:
# Fine-tune BERT Model
class SocialMediaDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {"input_ids": encoding['input_ids'].squeeze(0),
                "attention_mask": encoding['attention_mask'].squeeze(0),
                "labels": torch.tensor(self.labels[idx], dtype=torch.long)}

In [14]:
# Vectorization for KNN & SVM
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label'].map({'productivity': 0, 'entertainment': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = SocialMediaDataset(df['cleaned_text'].tolist(), y.tolist(), tokenizer)
train_size = int(0.8 * len(dataset))
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,No log,0.02158
2,No log,0.022412
3,0.071200,0.030138


TrainOutput(global_step=669, training_loss=0.0553460584449483, metrics={'train_runtime': 783.0135, 'train_samples_per_second': 6.82, 'train_steps_per_second': 0.854, 'total_flos': 1405013035622400.0, 'train_loss': 0.0553460584449483, 'epoch': 3.0})

In [23]:
import torch
from sklearn.metrics import accuracy_score

# Get model predictions
predictions = trainer.predict(test_dataset)

# Extract logits and convert to predicted class labels
logits = predictions.predictions
y_pred = np.argmax(logits, axis=-1)
y_true = [test_dataset[i]['labels'].item() for i in range(len(test_dataset))]

# Compute accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.9933


In [17]:
results = trainer.evaluate()
results

{'eval_loss': 0.03013778291642666,
 'eval_runtime': 14.8101,
 'eval_samples_per_second': 30.047,
 'eval_steps_per_second': 3.781,
 'epoch': 3.0}

In [None]:
encoding = tokenizer('I am going to a code this weekend', truncation=True, padding=True, return_tensors='pt')
output = model(**encoding.to(model.device))
prediction = torch.argmax(output.logits, dim=1).item()

In [None]:
result = {"category": "Productivity" if prediction == 0 else "Entertainment"}
result

{'category': 'Entertainment'}

In [None]:
# Save the model
trainer.save_model("./saved_model")

# Save the tokenizer (if not already saved)
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [None]:
import shutil

# Compress the directory
shutil.make_archive("saved_model", "zip", "./saved_model")

'/content/saved_model.zip'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Define the path in Google Drive
save_path = "/content/drive/MyDrive/saved_model"

# Save the model
trainer.save_model(save_path)

# Save the tokenizer (if not already saved)
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/saved_model/vocab.txt',
 '/content/drive/MyDrive/saved_model/added_tokens.json')