In [1]:
import random
from tqdm import tqdm
import pandas as pd
from datasets import Dataset
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, GenerationConfig

#device = "cpu"
#print(f"Using device: {device}")

In [2]:
saiga_yandexgpt_8b = "C:/Users/KseniaLuschevaExt/Documents/models/saiga_yandexgpt_8b"
rut5_base_headline_gen_telegram = "C:/Users/KseniaLuschevaExt/Documents/models/rut5_base_headline_gen_telegram"
rugpt3large_based_on_gpt2 = "C:/Users/KseniaLuschevaExt/Documents/models/rugpt3large_based_on_gpt2"

In [3]:
MODEL_POOL = [
    {
        "name": saiga_yandexgpt_8b,
        "type": "chat",
        "temperature": 0.7,
        "model_class": "causal",
        "device": "cpu"
    }
    
]

In [4]:
from transformers import (
    AutoModelForCausalLM,  # For GPT-like models
    AutoModelForSeq2SeqLM,  # For T5-like models
    AutoTokenizer,
    AutoConfig
)


class ModelWrapper:
    def __init__(self, model_info):
        self.device = "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_info["name"])
        
        # Get config first to determine architecture
        config = AutoConfig.from_pretrained(model_info["name"])
        
        # Select appropriate model class
        if model_info["model_class"] == "causal":
            self.model = AutoModelForCausalLM.from_pretrained(
                model_info["name"],
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto"
            )
        elif model_info["model_class"] == "seq2seq":
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_info["name"],
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto"
            )
        else:
            raise ValueError(f"Unknown model class: {model_info['model_class']}")
        
        self.model.eval()
        self.model_type = model_info["type"]
        self.default_params = {
            "max_new_tokens":500,
            "do_sample": True,
            "top_p": 0.9,
            "temperature": model_info["temperature"]
        }

    def generate(self, prompt, **kwargs):
        try:
            # Merge default params with any overrides
            params = {**self.default_params, **kwargs}
            
            if isinstance(self.model, AutoModelForSeq2SeqLM):
                # T5-style models
                inputs = self.tokenizer(
                    prompt, 
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512
                ).to(self.device)
                
                outputs = self.model.generate(
                    **inputs,
                    **params
                )
            else:  # Causal LM
                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                outputs = self.model.generate(
                    **inputs,
                    **params,
                    pad_token_id=self.tokenizer.eos_token_id
                )
            
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                
        except Exception as e:
            print(f"Error in {self.model.config.name_or_path}: {str(e)}")
            return None

In [5]:
topics = [
    "изменение климата", "искусственный интеллект", "исследование космоса",  
    "квантовые вычисления", "древние цивилизации", "технологии будущего",  
    "нейробиология", "криптовалюта", "древние мифы", "виртуальная реальность",  
    "синтетическая биология", "черные дыры", "сознание", "постапокалиптические общества",  
    "океанография", "путешествия во времени", "лингвистика", "нанотехнологии", "анализ сновидений",  
    "межзвездная дипломатия"  
]


tones = [
    "нейтральном", "восторженном", "скептическом", "юмористическом",  
    "аналитическом", "саркастичном", "меланхоличном", "вдохновляющем", "драматическом",  
    "причудливом", "мрачном", "оптимистичном", "пессимистичном", "загадочном",  
    "авторитетном", "неформальном", "романтическом"  
]

actions = [
    "объяснить", "описать", "обсудить", "написать рассказ о",  
    "проанализировать", "сравнить и сопоставить", "предсказать будущее",  
    "критиковать", "представить мир, где", "подвести итог",  
    "защитить", "оспаривать", "переосмыслить", "придумать новую теорию о",  
    "пародировать", "взять интервью у эксперта по", "написать новостной репортаж о",  
    "создать диалог о", "перечислить плюсы и минусы", "исследовать этические аспекты"  
]

styles = ["формальный", "деловой", "шуточный", "школьный", "творческий", "публицистический"]

In [None]:
def generate_prompts(num_prompts=10):
    prompts = []
    for _ in range(num_prompts):
        style = random.choice(styles)
        topic = random.choice(topics)
        tone = random.choice(tones)
        action = random.choice(actions)
        
        prompt = (
            f"Напиши текст в стиле {style} в тональности '{tone}'. "
            f"Текст должен {action} тему {topic}'. "
            f"Убедись, что текст выражает  {style} style appropriately."
        )
        prompts.append(prompt)
    return prompts

In [6]:
import concurrent.futures
from functools import partial

def generate_corpus(num_texts, output_csv="generated_corpus.csv", max_workers=4, batch_size=8):
    """
    Generates corpus using diverse prompts and parallel model execution.

    Args:
        num_texts (int): Total texts to generate
        output_csv (str): Output file path
        max_workers (int): Parallel threads (match GPU count)
        batch_size (int): Texts per parallel batch
    """
    # 1. Initialize models
    models = {}
    for model_info in MODEL_POOL:
        try:
            wrapper = ModelWrapper(model_info)
            models[model_info["name"]] = wrapper
            print(f"✅ Loaded {model_info['name']}")
        except Exception as e:
            print(f"❌ Failed to load {model_info['name']}: {str(e)}")

    if not models:
        raise ValueError("No models available!")

    def create_prompt():
        style = random.choice(styles)
        topic = random.choice(topics)
        tone = random.choice(tones)
        action = random.choice(actions)

        prompt = f"Ты профессиональный писатель. Требуется {action} '{topic}' в {tone} тоне, используя {style} стиль. Текст должен быть законченным произведением из 300+ слов.\n\n Текст должен начинаться сразу с раскрытия темы:\n"

        full_prompt = f"""
          [ИНСТРУКЦИИ]
          {prompt}

          [ТРЕБОВАНИЯ]
          - Начни текст сразу с содержания
          - Не повторяй инструкции
          - Сохрани указанный стиль и тон

          [НАЧАЛО ТЕКСТА]
          """

        return full_prompt, style

    all_prompts, all_styles = zip(*[create_prompt() for _ in range(num_texts)])

    def extract_topic(prompt):
        """Extract main topic from prompt"""
        match = re.search(r"тему ['\"](.+?)['\"]", prompt)
        return match.group(1) if match else None

    def validate_output(text, prompt):
        """Enhanced validation with multiple checks"""
        if not text or len(text.strip()) == 0:
            return False
        
        # 1. Length check
        word_count = len(text.split())
        if word_count < 300:
            return False
        
        # 2. Instruction contamination check
        forbidden_phrases = [
            "инструкция", "требование", "напиши текст", 
            "текст должен", "используй стиль", "в тоне"
        ]
        if any(phrase in text.lower() for phrase in forbidden_phrases):
            return False
        
        # 3. Coherence check (simple version)
        sentence_count = len(re.findall(r'[.!?]+', text))
        if sentence_count < 5:  # At least 5 sentences
            return False
        
        # 4. Topic relevance (simple version)
        topic = extract_topic(prompt)
        if topic and topic.lower() not in text.lower():
            return False
    
        return True

    def process_batch(batch_prompts, batch_styles):
        batch_results = []
        for prompt, style in zip(batch_prompts, batch_styles):
            model_name = weighted_model_choice(models, style)
            model = models[model_name]

            try:
                # Generation parameters
                gen_params = {
                    "temperature": 0.6,
                    "top_p": 0.95,
                    "repetition_penalty": 1.4,
                    "no_repeat_ngram_size": 3,
                    "max_new_tokens": 500
                }

                # Generate text
                full_prompt = prompt + "[НАЧАЛО ТЕКСТА]\n"
                raw_text = model.generate(full_prompt, **gen_params)
                
                # Post-process and validate
                if raw_text:
                    # Extract generated content
                    text = raw_text.split("[НАЧАЛО ТЕКСТА]")[-1].strip()
                    text = re.split(r"Инструкция:|Примечание:|Требования:", text)[0].strip()
                    
                    # Apply validation
                    if validate_output(text, prompt):
                        # Additional quality checks
                        topic = extract_topic(prompt)  # Implement this based on your prompt structure
                        word_count = len(text.split())
                        
                        batch_results.append({
                            "text": text,
                            "prompt": prompt,
                            "style": style,
                            "topic": topic,
                            "model": model_name,
                            "length": word_count,
                            "valid": True  # Flag for validation
                        })
                    else:
                        batch_results.append({
                            "text": text,
                            "prompt": prompt,
                            "valid": False,
                            "reason": "Failed validation"
                        })
                        
            except Exception as e:
                print(f"Generation error: {str(e)}")
                batch_results.append({
                    "prompt": prompt,
                    "valid": False,
                    "reason": f"Generation error: {str(e)}"
                })
        
        return batch_results

    corpus = []
    with tqdm(total=num_texts, desc="Generating corpus") as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for i in range(0, num_texts, batch_size):
                batch_p = all_prompts[i:i + batch_size]
                batch_s = all_styles[i:i + batch_size]
                futures.append(executor.submit(process_batch, batch_p, batch_s))

            for future in concurrent.futures.as_completed(futures):
                batch_results = future.result()
                corpus.extend(batch_results)
                pbar.update(len(batch_results))

    # 6. Create DataFrame and ensure all columns exist
    df = pd.DataFrame(corpus)

    # Ensure required columns exist
    if 'length' not in df.columns:
        if 'text' in df.columns:
            df['length'] = df['text'].str.split().str.len()
        else:
            raise ValueError("Generated corpus is missing both 'length' and 'text' columns")

    # Apply filters
    df = df.drop_duplicates(subset=['text'])
    df = df[df['length'] >= 150]  # Adjusted from 150 to match earlier check

    # Add metadata
    df['generation_date'] = pd.Timestamp.now()
    df['word_count'] = df['text'].str.split().str.len()  # Exact word count

    # Save results
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')

    print(f"\nSuccessfully generated {len(df)} texts")
    print("Columns available:", df.columns.tolist())
    return df

def weighted_model_choice(models, style):
    """Select model with preference for style matching"""
    style_preference = {
        "formal": ["IlyaGusev/saiga_yandexgpt_8b"],
        "creative": ["IlyaGusev/saiga_yandexgpt_8b"]
    }

    # Try preferred models first
    for model_name in style_preference.get(style, []):
        if model_name in models:
            return model_name

    # Fallback to any available model
    return random.choice(list(models.keys()))

In [None]:
def filter_corpus(df):
    """Remove low-quality entries"""
    # Remove duplicates
    df = df.drop_duplicates(subset=["text"])
    
    # Filter by length
    df = df[df["text"].str.split().str.len() >= 50]
    
    # Add perplexity filtering if available
    if "perplexity" in df.columns:
        df = df[df["perplexity"] < 100]
    
    return df

In [14]:
torch.cuda.empty_cache()

In [7]:
torch.cuda.is_available = lambda: False
device = torch.device("cpu")

In [None]:
corpus_df = generate_corpus(
    num_texts=4,
    batch_size=1  # Texts to generate in paralle
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


✅ Loaded C:/Users/KseniaLuschevaExt/Documents/models/saiga_yandexgpt_8b


Generating corpus:   0%|                                                                         | 0/4 [00:01<?, ?it/s]

Error in C:/Users/KseniaLuschevaExt/Documents/models/saiga_yandexgpt_8b: Tensor on device cuda:0 is not on the expected device meta!Error in C:/Users/KseniaLuschevaExt/Documents/models/saiga_yandexgpt_8b: Tensor on device cuda:0 is not on the expected device meta!



Generating corpus:   0%|                                                                         | 0/4 [00:02<?, ?it/s]

Error in C:/Users/KseniaLuschevaExt/Documents/models/saiga_yandexgpt_8b: Tensor on device cuda:0 is not on the expected device meta!
