In [None]:
import random
import pandas as pd
import torch
import transformers
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


# Vocabulary used to fill the bio / caption / comment templates for each class.
drug_keywords = ["weed", "cocaine", "mdma", "ecstasy", "pills", "drugdealer"]
neutral_keywords = ["travel", "food", "fitness", "family", "nature"]

# Hashtag pools: one entry is sampled per caption, and the whole pool for the
# sample's class is stored in the dataset's "hashtags" column.
drug_hashtags = ["#weedlife", "#drugdealer", "#mdma", "#cocaine"]
neutral_hashtags = ["#travelblog", "#naturelover", "#fitnessgoals", "#foodie"]


def generate_bio(drug_related=False):
    """Return a templated profile bio mentioning one randomly chosen keyword.

    The keyword is drawn from ``drug_keywords`` when ``drug_related`` is
    truthy and from ``neutral_keywords`` otherwise.
    """
    if drug_related:
        pool = drug_keywords
    else:
        pool = neutral_keywords
    topic = random.choice(pool)
    return f"Love {topic} and sharing my journey. DM for info!"

def generate_caption(drug_related=False):
    """Return a templated post caption with one keyword and one hashtag.

    Both the keyword and the hashtag come from the drug-related pools when
    ``drug_related`` is truthy, and from the neutral pools otherwise.
    """
    if drug_related:
        word_pool, tag_pool = drug_keywords, drug_hashtags
    else:
        word_pool, tag_pool = neutral_keywords, neutral_hashtags
    # The keyword is drawn before the hashtag, so a seeded RNG yields the
    # same caption sequence regardless of how the template is assembled.
    topic = random.choice(word_pool)
    tag = random.choice(tag_pool)
    return f"Check out my latest post about {topic}! {tag}"

def generate_comments(drug_related=False, num_comments=3):
    """Return a list of ``num_comments`` templated comments.

    Each comment mentions an independently sampled keyword from
    ``drug_keywords`` (when ``drug_related`` is truthy) or from
    ``neutral_keywords`` (otherwise).
    """
    pool = neutral_keywords if not drug_related else drug_keywords
    return [
        f"Awesome post about {random.choice(pool)}!"
        for _ in range(num_comments)
    ]


def generate_synthetic_dataset(num_samples=1000, seed=None):
    """Build a DataFrame of synthetic, template-generated posts with labels.

    Each row describes one fake user/post pair whose text fields are filled
    from the drug-related or neutral vocabularies depending on a randomly
    drawn binary label.

    Args:
        num_samples: Number of rows to generate.
        seed: Optional seed for the global ``random`` module. When given, the
            generator is re-seeded before sampling so the dataset is
            reproducible; when ``None`` the current RNG state is used as-is.

    Returns:
        A ``pandas.DataFrame`` with the columns ``user_id``, ``username``,
        ``bio``, ``post_id``, ``caption``, ``hashtags``, ``comments``,
        ``likes``, ``is_drug_related`` and ``source``. The columns are present
        even when ``num_samples`` is 0.

    Notes:
        * ``user_id``, ``username`` and ``post_id`` are drawn independently at
          random, so they are not guaranteed to be unique or consistent with
          one another.
        * ``likes`` is drawn from 50-500 for positive rows and 0-100 for
          negative rows, so it is strongly correlated with the label.
    """
    columns = [
        "user_id",
        "username",
        "bio",
        "post_id",
        "caption",
        "hashtags",
        "comments",
        "likes",
        "is_drug_related",
        "source",
    ]

    if seed is not None:
        random.seed(seed)

    rows = []
    for _ in range(num_samples):
        is_drug_related = random.choice([0, 1])  # 0 for negative, 1 for positive
        positive = bool(is_drug_related)

        user_id = f"user_{random.randint(1000, 9999)}"
        username = f"user{random.randint(1000, 9999)}"
        bio = generate_bio(drug_related=positive)

        post_id = f"post_{random.randint(10000, 99999)}"
        caption = generate_caption(drug_related=positive)
        # Copy the pool so rows do not alias the module-level list (mutating
        # one row's hashtags must not affect other rows or the constant).
        hashtags = list(drug_hashtags if positive else neutral_hashtags)
        comments = generate_comments(drug_related=positive)

        if positive:
            likes = random.randint(50, 500)
        else:
            likes = random.randint(0, 100)

        rows.append({
            "user_id": user_id,
            "username": username,
            "bio": bio,
            "post_id": post_id,
            "caption": caption,
            "hashtags": hashtags,
            "comments": comments,
            "likes": likes,
            "is_drug_related": is_drug_related,
            "source": "synthetic",
        })

    # Passing the column list keeps the schema stable for an empty dataset.
    return pd.DataFrame(rows, columns=columns)


# Generate 1000 synthetic samples and persist them to CSV without the index.
# NOTE(review): the "hashtags" and "comments" columns hold Python lists, which
# a CSV can only store as text - presumably they come back as plain strings
# when the file is re-read; confirm before relying on them downstream.
synthetic_data = generate_synthetic_dataset(1000)
synthetic_data.to_csv("synthetic_drug_dataset.csv", index=False)

print("Synthetic dataset generated and saved as 'synthetic_drug_dataset.csv'.")


import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


# Reload the dataset from the CSV file written earlier in this script.
data = pd.read_csv('synthetic_drug_dataset.csv')


# Hold out 20% of the rows for evaluation; random_state pins the split.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


# Module-level tokenizer for the 'bert-base-uncased' checkpoint; TextDataset
# reads this global when it tokenizes a caption.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text column of a DataFrame lazily.

    Args:
        data: DataFrame holding the text and label columns; rows are accessed
            positionally via ``iloc``.
        tokenizer: Callable with the Hugging Face tokenizer call signature.
            When ``None`` (the default) the module-level ``tokenizer`` is
            looked up at item-access time, which matches the previous
            behaviour of this class.
        text_column: Name of the column containing the text to tokenize.
        label_column: Name of the column containing the integer class label.
        max_length: Length every sequence is padded/truncated to.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer output with its leading batch dimension removed) and ``label``
    (a 0-d ``torch.long`` tensor).
    """

    def __init__(self, data, tokenizer=None, text_column='caption',
                 label_column='is_drug_related', max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.label_column = label_column
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` with the injected or module-level tokenizer."""
        # Fall back to the global lazily so the class can be defined (and
        # used with an explicit tokenizer) before/without that global.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        return encode(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

    def __getitem__(self, idx):
        # Fetch the row once instead of once per column.
        row = self.data.iloc[idx]
        encoded = self._encode(row[self.text_column])
        return {
            # squeeze(0) drops only the batch dimension added by
            # return_tensors='pt'; a bare squeeze() would also collapse the
            # sequence dimension whenever it has length 1.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(int(row[self.label_column]), dtype=torch.long),
        }


# Wrap the two splits; only the training loader is shuffled.
train_dataset = TextDataset(train_data)
test_dataset = TextDataset(test_data)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# Two-class sequence classifier on top of 'bert-base-uncased', moved to the
# GPU when one is available and kept on the CPU otherwise.
text_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text_model.to(device)


# Adam over all model parameters; the cross-entropy criterion is applied to
# the model's logits in the training loop.
optimizer = torch.optim.Adam(text_model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Fine-tune the classifier for 5 epochs and report the mean training loss of
# each epoch (weighted by batch size, so a short final batch is not
# over-represented).
for epoch in range(5):
    text_model.train()
    epoch_loss = 0.0
    samples_seen = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = text_model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so scale back to a per-sample sum.
        batch_size = labels.size(0)
        epoch_loss += loss.item() * batch_size
        samples_seen += batch_size

    # Guard against an empty loader, where no loss was ever computed.
    if samples_seen:
        print(f"Epoch {epoch+1}, Loss: {epoch_loss / samples_seen}")
    else:
        print(f"Epoch {epoch+1}, Loss: n/a (no training batches)")


# Evaluate on the held-out split: switch to eval mode, disable gradient
# tracking, and count how many argmax predictions match the labels.
text_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = text_model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # Running count of correct predictions after each batch.
        print("correct:", correct)

# An empty test loader leaves total at 0; avoid dividing by zero in that case.
if total:
    print(f"Accuracy: {100 * correct / total}%")
else:
    print("Accuracy: n/a (test set is empty)")


Synthetic dataset generated and saved as 'synthetic_drug_dataset.csv'.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.08067600429058075
Epoch 2, Loss: 0.016689222306013107
Epoch 3, Loss: 0.006602112203836441
