This notebook contains code developed and tested in Vertex AI Studio using Gemini 1.5 Pro (gemini-1.5-pro-002), a paid large language model by Google. It has since been transferred to Google Colab for documentation and sharing purposes.

In [None]:
!pip install --upgrade google-cloud-aiplatform

In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting
import pandas as pd
from google.cloud import aiplatform, storage
from vertexai import init
from vertexai.generative_models import GenerativeModel, SafetySetting
import csv
import time
from google.api_core.exceptions import ResourceExhausted
from tqdm import tqdm

# Connect to Vertex AI; replace the placeholders with your own project and region.
init(project="---", location="---")  # --> fill in with your details

# Input prompts: read the CSV into a DataFrame.
file_path = "/content/balanced_prompts_nov2022.csv"
data = pd.read_csv(file_path)

# Model handle plus a default sampling configuration.
# NOTE: generate_with_backoff below builds its own per-prompt-type config and
# does not read `generation_config`.
model = GenerativeModel("gemini-1.5-pro-002")
generation_config = dict(
    max_output_tokens=200,  # token size
    temperature=0.7,
    top_p=0.95,
)

# Set the block threshold to OFF for each of the four listed harm categories.
safety_settings = [
    SafetySetting(category=harm_category, threshold=SafetySetting.HarmBlockThreshold.OFF)
    for harm_category in (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
]

# Columns of the input CSV that hold the prompts to send to the model.
prompt_columns = ["Minimal Prompt", "Full-Feature Prompt"]


# Output locations: one local CSV, later copied to a GCS bucket.
local_csv_path = "/content/gemini_responses_nov2022.csv"
gcs_bucket_name = ""  # <-- Replace with your bucket name
gcs_blob_name = "gemini/gemini_responses_nov2022.csv"

def generate_with_backoff(prompt, prompt_type, retries=2, delay=10):
    """Generate a Gemini response for ``prompt``, retrying on quota errors.

    Args:
        prompt: Prompt text, sent to the model as a single-item content list.
        prompt_type: Name of the prompt variant (the caller passes the CSV
            column name). Types containing "Chain" or "CoT" use a higher
            temperature and a 200-token budget; every other type uses a
            low temperature and a 20-token budget.
        retries: Total number of attempts before giving up.
        delay: Seconds to wait after the first quota error; the wait grows
            by 5 seconds after each further failure.

    Returns:
        The concatenated text of all streamed chunks, or the string
        "Error: Quota exceeded after retries" when every attempt raised
        ``ResourceExhausted``.
    """
    # Choose generation config based on prompt type
    if "Chain" in prompt_type or "CoT" in prompt_type:
        config = {"temperature": 0.7, "top_p": 0.95, "max_output_tokens": 200}
    else:
        config = {"temperature": 0.2, "top_p": 0.8, "max_output_tokens": 20}

    for attempt in range(1, retries + 1):
        try:
            responses = model.generate_content(
                [prompt],
                generation_config=config,
                safety_settings=safety_settings,
                stream=True,
            )
            # The stream is consumed inside the try so that a quota error
            # raised while iterating is retried as well.
            # NOTE(review): `.text` may raise for a chunk that carries no
            # text (e.g. a blocked candidate) - confirm against the SDK.
            return "".join(chunk.text for chunk in responses)
        except ResourceExhausted:
            if attempt >= retries:
                # No attempt follows, so waiting here would only waste time.
                print(f"Quota exceeded. No retries left. (Attempt {attempt}/{retries})")
                break
            print(f"Quota exceeded. Retrying in {delay} seconds... (Attempt {attempt}/{retries})")
            time.sleep(delay)
            delay += 5
    return "Error: Quota exceeded after retries"

# Send every prompt to the model and write one CSV row per input row.
# Pacing: 1 second after each model call, 10 seconds after each batch.
with open(local_csv_path, mode="w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    # Identifying fields first, then one response column per prompt column.
    header = ["user_id", "tweet_id", "is_bot", *(f"{col} Response" for col in prompt_columns)]
    writer.writerow(header)

    batch_size = 24
    batch_count = -(-len(data) // batch_size)  # ceiling division
    batch_starts = range(0, len(data), batch_size)

    for batch_index, start in enumerate(tqdm(batch_starts, desc="Processing Batches"), start=1):
        batch = data.iloc[start:start + batch_size]

        # Inner progress bar advances once per processed row of this batch.
        tweet_rows = tqdm(
            batch.iterrows(),
            total=len(batch),
            desc=f"Batch {batch_index}/{batch_count} Tweets",
            leave=False,
        )
        for _, row in tweet_rows:
            responses_for_row = [row["user_id"], row["tweet_id"], row["is_bot"]]

            for column in prompt_columns:
                prompt = row[column]
                if not pd.isna(prompt):
                    response_text = generate_with_backoff(prompt, column)
                    time.sleep(1)
                else:
                    # Missing prompt cell: record a placeholder, skip the API call.
                    response_text = "No prompt available"
                responses_for_row.append(response_text)

            writer.writerow(responses_for_row)

        print(f"Batch {batch_index} completed. Pausing for 10 seconds...")
        time.sleep(10)

print(f"CSV written to: {local_csv_path}")

# Upload to Google Cloud Storage
def upload_to_gcs(local_path, bucket_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        local_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name (path) to create inside the bucket.

    Raises:
        ValueError: If ``bucket_name`` is empty, e.g. the placeholder value
            was never replaced with a real bucket name.
    """
    # Fail fast with a clear message before any client is created.
    if not bucket_name:
        raise ValueError(
            "bucket_name is empty; set it to an existing GCS bucket before uploading"
        )
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(local_path)
    print(f"Uploaded to GCS: gs://{bucket_name}/{destination_blob_name}")

# Copy the finished results CSV to the bucket; gcs_bucket_name is an empty
# placeholder by default and must be set to a real bucket name first.
upload_to_gcs(local_csv_path, gcs_bucket_name, gcs_blob_name)