In [None]:
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Vocabulary used by the synthetic text generators below.
# The *_keywords lists fill the topic slot in bios, captions and comments;
# the *_hashtags lists supply the trailing caption tag and the per-row
# "hashtags" value.
drug_keywords = [
    "weed",
    "cocaine",
    "mdma",
    "ecstasy",
    "pills",
    "drugdealer",
]
neutral_keywords = [
    "travel",
    "food",
    "fitness",
    "family",
    "nature",
]

drug_hashtags = [
    "#weedlife",
    "#drugdealer",
    "#mdma",
    "#cocaine",
]
neutral_hashtags = [
    "#travelblog",
    "#naturelover",
    "#fitnessgoals",
    "#foodie",
]

# Step 2: Helper functions for random text generation
def generate_bio(drug_related=False):
    """Return a one-sentence profile bio built around a random keyword.

    Args:
        drug_related: When truthy the keyword is drawn from ``drug_keywords``;
            otherwise it is drawn from ``neutral_keywords``.

    Returns:
        The bio string with the chosen keyword substituted in.
    """
    if drug_related:
        pool = drug_keywords
    else:
        pool = neutral_keywords
    topic = random.choice(pool)
    return f"Love {topic} and sharing my journey. DM for info!"

def generate_caption(drug_related=False):
    """Return a post caption made of a random keyword and a random hashtag.

    Args:
        drug_related: When truthy both the keyword and the hashtag come from
            the drug-related lists; otherwise from the neutral lists.

    Returns:
        The caption string, ending with the chosen hashtag.
    """
    if drug_related:
        topics, tags = drug_keywords, drug_hashtags
    else:
        topics, tags = neutral_keywords, neutral_hashtags
    # Keyword is drawn before the hashtag so seeded runs stay reproducible.
    topic = random.choice(topics)
    tag = random.choice(tags)
    return f"Check out my latest post about {topic}! {tag}"

def generate_comments(drug_related=False, num_comments=3):
    """Return a list of short comments, each mentioning a random keyword.

    Args:
        drug_related: When truthy keywords come from ``drug_keywords``;
            otherwise from ``neutral_keywords``.
        num_comments: How many comments to produce.

    Returns:
        A list of ``num_comments`` comment strings (empty when the count is
        zero or negative).
    """
    pool = drug_keywords if drug_related else neutral_keywords
    return [
        f"Awesome post about {random.choice(pool)}!"
        for _ in range(num_comments)
    ]

# Step 3: Generate synthetic dataset
def generate_synthetic_dataset(num_samples=1000):
    """Generate a DataFrame of synthetic, labelled social-media posts.

    Each row gets a random binary label; the bio, caption, hashtags, comments
    and like count are then produced from the drug-related or the neutral
    vocabulary according to that label.

    Notes:
        * ``user_id``, ``username`` and ``post_id`` use independently drawn
          random numbers, so they are neither guaranteed unique nor
          consistent with each other within a row.
        * The like ranges of the two classes overlap (50-500 vs 0-100).

    Args:
        num_samples: Number of rows to generate. Zero or a negative value
            yields an empty frame that still carries every column.

    Returns:
        A ``pandas.DataFrame`` with the columns ``user_id``, ``username``,
        ``bio``, ``post_id``, ``caption``, ``hashtags``, ``comments``,
        ``likes`` and ``is_drug_related``.
    """
    # Passed explicitly so an empty result keeps the schema callers index by.
    columns = [
        "user_id",
        "username",
        "bio",
        "post_id",
        "caption",
        "hashtags",
        "comments",
        "likes",
        "is_drug_related",
    ]

    rows = []
    for _ in range(num_samples):
        is_drug_related = random.choice([0, 1])  # 0 for negative, 1 for positive
        drug_related = bool(is_drug_related)

        user_id = f"user_{random.randint(1000, 9999)}"
        username = f"user{random.randint(1000, 9999)}"
        bio = generate_bio(drug_related=drug_related)

        post_id = f"post_{random.randint(10000, 99999)}"
        caption = generate_caption(drug_related=drug_related)
        # Copy the list: otherwise every row would share the module-level
        # object and mutating one cell would change all rows and the constant.
        hashtags = list(drug_hashtags if drug_related else neutral_hashtags)
        comments = generate_comments(drug_related=drug_related)

        if drug_related:
            likes = random.randint(50, 500)
        else:
            likes = random.randint(0, 100)

        rows.append({
            "user_id": user_id,
            "username": username,
            "bio": bio,
            "post_id": post_id,
            "caption": caption,
            "hashtags": hashtags,
            "comments": comments,
            "likes": likes,
            "is_drug_related": is_drug_related,
        })
    return pd.DataFrame(rows, columns=columns)

# Step 4: Build the synthetic dataset and persist it as CSV
file_path = "synthetic_drug_dataset.csv"
synthetic_data = generate_synthetic_dataset(num_samples=1000)
synthetic_data.to_csv(file_path, index=False)

# Read the dataset back from disk; only the caption text and the label
# column are used from here on.
data = pd.read_csv(file_path)

# Feature = caption text, target = binary drug-related flag
X, y = data['caption'], data['is_drug_related']

# Hold out 20% of the rows for evaluation (fixed seed for the split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary is learned on the training captions only and
# then reused unchanged for the test captions.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic Regression classifier (max_iter raised to 1000); fit() returns
# the estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Predict on the held-out captions
y_pred = model.predict(X_test_vec)

# Report accuracy and the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# Classify one hand-written caption as a usage example
new_caption = "Enjoying a relaxing hike in the mountains."
new_caption_vec = vectorizer.transform([new_caption])
prediction = model.predict(new_caption_vec)[0]
verdict = 'Drug-related' if prediction == 1 else 'Not drug-related'
print(f"Prediction for '{new_caption}': {verdict}")

Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        85
           1       1.00      1.00      1.00       115

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Prediction for 'Enjoying a relaxing hike in the mountains.': Not drug-related
