# In [None]:
import torch
import os
import requests
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
from datetime import datetime
import time

# Checkpoint used for every generation call in this script.
model_name = "distilgpt2"
# Disabled override, kept for reference (presumably for environments where
# the checkpoint download fails certificate verification -- confirm):
# os.environ["CURL_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt"

# Load the tokenizer and the causal language model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def generate_response(prompt, max_length=75, **generate_kwargs):
    """Generate text from *prompt* with the module-level model and tokenizer.

    Args:
        prompt: Text used to seed generation.
        max_length: Length cap forwarded to ``model.generate``. Per the
            Hugging Face generation documentation this is measured in
            tokens and includes the prompt tokens; it is not a word count.
        **generate_kwargs: Extra options forwarded unchanged to
            ``model.generate`` (for example ``do_sample=True``). They may
            override the ``pad_token_id`` and ``num_return_sequences``
            defaults set below.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
        NOTE(review): for a decoder-only model the generated sequence is
        expected to start with the prompt tokens, so the returned text
        likely begins with the prompt itself -- confirm before treating
        the result as the reply alone.
    """
    # Calling the tokenizer (instead of ``encode``) also yields the attention
    # mask; passing it explicitly avoids relying on the mask being inferred
    # while the pad token is set to the EOS token.
    encoded = tokenizer(prompt, return_tensors="pt")

    # Same settings the function has always used, unless the caller overrides.
    generate_kwargs.setdefault("num_return_sequences", 1)
    generate_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)

    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        **generate_kwargs,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def generate_conversation(num_exchanges=50, delay_seconds=1):
    """Generate one alternating Salesman/User dialogue.

    Turns alternate between the two speakers, starting with the salesman.
    Every turn is generated from that speaker's fixed role prompt; earlier
    responses are not fed back into later prompts.

    Args:
        num_exchanges: Number of turns to generate (default 50). This only
            controls how many rows are produced; the length of each
            response is governed by the ``max_length=75`` passed to
            ``generate_response``.
        delay_seconds: Pause after each turn, in seconds (default 1). A
            value of 0 or less skips the pause entirely.

    Returns:
        A list of ``[speaker, response, timestamp]`` lists, one per turn,
        where ``timestamp`` is ``datetime.now().isoformat()`` taken right
        after the response was generated.
    """
    # Index 0 is the salesman, index 1 the user; both tuples share that order.
    speakers = ("Salesman", "User")
    prompts = (
        "Salesman: You are a salesman trying to sell a product. Engage in a conversation with a potential customer.",
        "User: You are a potential customer interested in buying a product. Engage in a conversation with the salesman.",
    )

    conversation = []
    for turn in range(num_exchanges):
        role = turn % 2
        response = generate_response(prompts[role], max_length=75)
        timestamp = datetime.now().isoformat()
        conversation.append([speakers[role], response, timestamp])
        if delay_seconds > 0:
            # Spaces out the turns (and their timestamps) as the original did.
            time.sleep(delay_seconds)

    return conversation

# Produce 100 conversations, each one a list of [speaker, text, timestamp] rows.
conversations = [generate_conversation() for _ in range(100)]

# Flatten all conversations, in generation order, into a single list of rows.
data = [row for dialogue in conversations for row in dialogue]

# One CSV row per turn; the DataFrame index is not written out.
df = pd.DataFrame(data, columns=["Speaker", "Text", "Timestamp"])
df.to_csv("sales_conversations.csv", index=False)

# README.md content. The install command lists the third-party packages this
# script imports; the length wording reflects the 75-token generation cap.
readme_content = """
# Sales Conversation Dataset

## Description
This dataset contains 100 sets of sales conversations generated using distilgpt2 from Hugging Face. Each conversation consists of exchanges between a salesman and a user, with each response targeted at 50-75 words (generation is capped at 75 tokens, prompt included).

## Files
- `sales_conversations.csv`: Contains the generated conversations.
- `run.py`: Script to generate the conversations.
- `README.md`: This file.

## Instructions
1. Ensure you have the necessary libraries installed:
   `pip install torch transformers pandas requests`
2. Run the script to generate the dataset:
   `python run.py`

## Evaluation Criteria
- Contextual Relevance and Understanding
- Coherence, Fluency, and Readability
- Creativity and Engagement
- Toxicity and Bias Mitigation
- Number of Products Sold
- Accuracy and Completeness of Information
- Compute time to generate data

"""

# Write the README into the current working directory. The encoding is given
# explicitly so the output does not depend on the platform's default.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

print("Dataset and README.md generated successfully!")