<a href="https://colab.research.google.com/github/liron7722/AI-Generated-Text-Detector/blob/Production/tdIdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd

# Load run parameters from environment variables, falling back to defaults.
#   FEATURE      - which branch of the pipeline below runs
#   MIN_GRAMS    - lower bound of the n-gram range
#   MAX_GRAMS    - upper bound of the n-gram range
#   MAX_FEATURES - value passed as `max_features` to the vectorizers
params = {
    "feature": os.getenv("FEATURE", "tdidf"),  # Default to "tdidf"
    "minGrams": int(os.getenv("MIN_GRAMS", 2)),  # Default to 2
    "maxGrams": int(os.getenv("MAX_GRAMS", 4)),  # Default to 4
    # MAX_FEATURES follows the naming of the other variables. The legacy
    # "maxFeatures" spelling is still honoured as a fallback so existing
    # setups keep working.
    "maxFeatures": int(
        os.getenv("MAX_FEATURES", os.getenv("maxFeatures", 250))
    ),  # Default to 250
}

# Use the parameters
feature = params["feature"]
min_grams = params["minGrams"]
max_grams = params["maxGrams"]
max_features = params["maxFeatures"]

# Example: Print parameters to confirm
print(f"Feature: {feature}, MinGrams: {min_grams}, MaxGrams: {max_grams}, max_features: {max_features}")

# Read "data.csv" (resolved against the current working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer="data.csv")


def tokenize_data(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the global ``tokenizer``.

    The global ``tokenizer`` is only assigned in the ``'bert'`` branch of this
    cell, so this function must not be called before that assignment has run.
    Inputs are truncated and padded with ``padding="max_length"``.

    Args:
        examples: A mapping with a ``"text"`` key; in this cell it is the batch
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Run the pipeline selected by the `feature` parameter:
#   "tdidf" / "bow" -> fit an n-gram TF-IDF / count vectorizer on data['text']
#                      and write the resulting matrix to "<feature>.csv"
#   "bert"          -> fine-tune distilbert-base-uncased as a 2-label
#                      classifier, logging to Weights & Biases
# Any other value matches no branch, so nothing is produced.
if feature in ("tdidf", "bow"):
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    # Both variants share the same settings and output format; only the
    # vectorizer class differs.
    vectorizer_cls = TfidfVectorizer if feature == "tdidf" else CountVectorizer
    vectorizer = vectorizer_cls(max_features=max_features, ngram_range=(min_grams, max_grams))
    transformed_data = vectorizer.fit_transform(data['text'])
    # Save the output as a dense table with one column per extracted feature
    output_file = f"{feature}.csv"
    pd.DataFrame(transformed_data.toarray(), columns=vectorizer.get_feature_names_out()).to_csv(output_file, index=False)


elif feature == 'bert':
    import wandb
    from sklearn.model_selection import train_test_split
    from datasets import Dataset
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
    from transformers.trainer_utils import get_last_checkpoint

    # Credentials must not live in source control: take the W&B API key from
    # the WANDB_API_KEY environment variable. When it is unset, key=None lets
    # wandb fall back to its own credential lookup.
    # NOTE(review): the key that used to be hard-coded here has been exposed
    # and should be revoked in the W&B account settings.
    wandb.login(key=os.getenv("WANDB_API_KEY"))
    wandb.init(project="AI-Generated-Text-Detector-Project", name="distilbert_training")

    # Keep only the text and the target column; drop the full frame to free memory
    target_column = 'generated'
    text_df = data[['text', target_column]]
    del data

    # 80/20 train/test split with a fixed seed for reproducibility
    train_df, test_df = train_test_split(text_df, test_size=0.2, random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    # Convert datasets to tokenized format.
    # `tokenizer` must be assigned before the map() calls because
    # tokenize_data() reads it as a global.
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    tokenized_train = train_dataset.map(tokenize_data, batched=True)
    tokenized_test = test_dataset.map(tokenize_data, batched=True)

    # Rename the target column ('generated') to 'labels' to match the model's expectations
    tokenized_train = tokenized_train.rename_column(target_column, "labels")
    tokenized_test = tokenized_test.rename_column(target_column, "labels")

    # Load pre-trained DistilBERT model for sequence classification
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    # Prepare data collator for padding sequences
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Define training arguments with W&B integration
    output_dir = "./results"
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        eval_steps=500,  # Evaluate every 500 steps
        logging_steps=100,  # Log metrics every 100 steps
        eval_strategy="steps",
        logging_strategy="steps",
        logging_dir="./logs",
        save_steps=500,                    # Save checkpoint every 500 steps
        save_total_limit=2,                # Keep only the last 2 checkpoints
        report_to="wandb",                 # Log metrics to W&B
        load_best_model_at_end=True,       # Load best model at end of training
    )

    # Define Trainer object for training the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Train the model, resuming if interrupted. Trainer does not read
    # TrainingArguments.resume_from_checkpoint, so the checkpoint has to be
    # passed to train() directly. The lookup yields None when there is no
    # earlier checkpoint, which starts training from scratch.
    last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
    trainer.train(resume_from_checkpoint=last_checkpoint)

    # Save the trained model
    trainer.save_model('model')

    # Finish the W&B run
    wandb.finish()