<a href="https://colab.research.google.com/github/iremaricii/Metin_Tabanli_Goruntu_Uretimi-FineTunning/blob/main/Metin_Tabanli_Goruntu_Uretimi_FineTunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages used by the Stable Diffusion cells below.
!pip install diffusers transformers accelerate scipy

In [None]:
# Pillow provides the `PIL` module imported by the following cells.
!pip install Pillow

In [None]:
import torch
from diffusers import StableDiffusionPipeline
from PIL import Image

# Pick the device: use the GPU when one is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Half precision is meant for GPU inference; when we fall back to the CPU we
# load the weights in float32 so the pipeline can still run.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the model from the Hugging Face Hub and move it to the chosen device.
model_id = "CompVis/stable-diffusion-v1-4"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
pipe = pipe.to(device)

# Generate one image from a sample prompt.
prompt = "a portrait of a person"
result = pipe(prompt)
image = result.images[0]

# Save the generated image; the bare `image` expression on the last line makes
# the notebook render it as the cell output.
image.save("portraitex.png")
image

In [None]:
# Mount Google Drive under /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the archive from Drive into /content/dataset.
!unzip "/content/drive/MyDrive/archive (1).zip" -d "/content/dataset"

In [None]:
# List the contents of the extraction directory.
!ls "/content/dataset"

In [None]:
# Search the mounted Drive for .zip files.
!find /content/drive -type f -name "*.zip"

In [None]:
# NOTE(review): no cell visible in this notebook imports `peft` — confirm this
# install is still needed.
!pip install peft

In [None]:
import torch
from diffusers import StableDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "CompVis/stable-diffusion-v1-4"

# Load the pipeline and move it to the selected device, as the other
# pipeline-loading cells in this notebook do.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to(device)

In [None]:
# Once training has finished, convert the text encoder to FP16:
# NOTE(review): `new_token` is only assigned in the training cell that appears
# further down in this notebook, so this cell presumably has to be run after
# that one — confirm the intended execution order.
with torch.no_grad():
    pipe.text_encoder.half()

# 8. Test the trained token
pipe.unet.eval()
test_prompt = f"a photo of {new_token}"
result = pipe(test_prompt, guidance_scale=7.5)
# Bare expression: the notebook renders the first generated image as output.
result.images[0]

In [None]:
import torch
import torch.optim as optim
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from PIL import Image
import glob
from diffusers import StableDiffusionPipeline, DDPMScheduler

# 1. Load the pipeline and its model components
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "CompVis/stable-diffusion-v1-4"

# Load the model in fp16 and move it to the selected device
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to(device)

# Freeze the UNet, the VAE and every text-encoder parameter; only the token
# embedding matrix is re-enabled below.
pipe.unet.requires_grad_(False)
pipe.vae.requires_grad_(False)
pipe.text_encoder.requires_grad_(False)

# 2. Add the new token and grow the text-encoder embedding matrix to hold it
new_token = "<my_concept>"  # Placeholder token for the concept to be learned
num_added_tokens = pipe.tokenizer.add_tokens(new_token)
if num_added_tokens > 0:
    pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer))
new_token_id = pipe.tokenizer.convert_tokens_to_ids(new_token)

# Fetch the embedding layer only AFTER the resize. Depending on the installed
# transformers version, resize_token_embeddings may install a new embedding
# module, in which case a reference taken earlier would point at a weight the
# text encoder no longer uses (and that never receives gradients).
embedding_layer = pipe.text_encoder.get_input_embeddings()
embedding_layer.weight.requires_grad = True
# Keep the trainable weight in FP32 so the optimizer update is not done in fp16.
embedding_layer.weight.data = embedding_layer.weight.data.float()

# 3. Optimizer (learning rate 5e-5) over the embedding matrix only
optimizer = optim.Adam([embedding_layer.weight], lr=5e-5)

# 4. Noise scheduler built from the pipeline's scheduler configuration
noise_scheduler = DDPMScheduler.from_config(pipe.scheduler.config)

# 5. Eğitim Veri Kümesi Tanımlama
class ConceptDataset(Dataset):
    """Dataset of the image files found (recursively) under a folder.

    Args:
        folder: Root directory to search; sub-directories are included.
        transform: Optional callable applied to each loaded RGB image.
        extensions: Filename suffixes (case-insensitive) accepted as images.
            Files with any other suffix are ignored so that stray files in
            the folder (text files, archives, ...) do not break loading.

    Attributes are set in ``__init__``: ``image_paths`` is the sorted list of
    matching file paths and ``transform`` is the callable passed in.
    """

    # Default suffixes treated as image files.
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".bmp", ".webp")

    def __init__(self, folder, transform=None, extensions=IMAGE_EXTENSIONS):
        suffixes = tuple(ext.lower() for ext in extensions)
        candidates = glob.glob(folder + "/**/*.*", recursive=True)
        # Sort so that index -> file is stable between runs.
        self.image_paths = sorted(
            path for path in candidates if path.lower().endswith(suffixes)
        )
        self.transform = transform

    def __len__(self):
        """Return the number of image files that were found."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return it, transformed if requested."""
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied by convert().
        with Image.open(self.image_paths[idx]) as source:
            img = source.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        return img

# Preprocessing applied to every image: resize to 512x512, convert to a
# tensor, then normalise with mean 0.5 and std 0.5.
transform = T.Compose(
    [
        T.Resize(size=(512, 512)),
        T.ToTensor(),
        T.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# Check the dataset path: if the files live in sub-folders, point at the
# correct directory.
dataset_folder = "/content/dataset/Portraits_update/Portraits"
dataset = ConceptDataset(folder=dataset_folder, transform=transform)

# A batch size of 4 keeps memory usage down.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=4)
print(f"Toplam {len(dataset)} görsel bulundu.")

# 6. Gradient scaler for automatic mixed precision (AMP)
scaler = GradScaler()

# 7. TEXTUAL INVERSION TRAINING LOOP
num_epochs = 3  # Start with 3 epochs
print("Textual inversion eğitimi başlıyor...")

# The training prompt is the same for every sample, so build it once.
prompt = f"a photo of {new_token}"

for epoch in range(num_epochs):
    for step, images in enumerate(dataloader):
        # Move the batch to the device in fp16 (the VAE/UNet weights are fp16).
        images = images.to(device).half()
        batch_size = images.shape[0]

        # Encode the images to latents with the frozen VAE; 0.18215 is the
        # latent scaling constant used by this notebook.
        with torch.no_grad():
            latents = pipe.vae.encode(images).latent_dist.sample() * 0.18215

        # Sample noise and a random timestep per image, then noise the latents.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,), device=device).long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        tokenized = pipe.tokenizer([prompt] * batch_size, padding="max_length", truncation=True, return_tensors="pt").to(device)

        # AMP: text-encoder and UNet forward pass, MSE between predicted and
        # true noise.
        with autocast():
            text_embeddings = pipe.text_encoder(tokenized.input_ids)[0]
            noise_pred = pipe.unet(noisy_latents, timesteps, text_embeddings).sample
            loss = ((noise_pred - noise) ** 2).mean()

        # Backward pass with loss scaling, then bring gradients back to their
        # true scale before touching them.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)

        # Keep only the gradient row of the new token. This is done in place
        # (save the row, zero everything, restore the row) instead of building
        # a vocabulary-sized mask tensor on every step.
        with torch.no_grad():
            grad = embedding_layer.weight.grad
            new_token_grad = grad[new_token_id].clone()
            grad.zero_()
            grad[new_token_id] = new_token_grad

        # Clip AFTER masking so the norm is computed over the gradient that is
        # actually applied, not over rows that are discarded anyway.
        torch.nn.utils.clip_grad_norm_(embedding_layer.weight, max_norm=1.0)

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        if step % 20 == 0:  # Log every 20 steps
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1} tamamlandı.")
    # Release cached GPU memory at the end of each epoch.
    torch.cuda.empty_cache()

print("Textual inversion eğitimi tamamlandı.")

# 8. Test the trained token
# Inference setup: UNet in eval mode, text encoder converted to FP16.
pipe.unet.eval()
with torch.no_grad():
    pipe.text_encoder.half()

test_prompt = f"a photo of {new_token}"
result = pipe(test_prompt, guidance_scale=7.5)

# Take the first generated image, write it to disk, then open it.
result_image = result.images[0]
result_image.save("textual_inversion_test_result.png")
result_image.show()


In [None]:
# Re-run of the token test: switch the UNet to eval mode and make sure the
# text encoder is in FP16 before sampling.
pipe.unet.eval()
with torch.no_grad():
    pipe.text_encoder.half()

test_prompt = f"a photo of {new_token}"
result = pipe(test_prompt, guidance_scale=7.5)

# Save the resulting image and open it.
result_image = result.images[0]
result_image.save("textual_inversion_test_result.png")
result_image.show()


In [None]:
from IPython.display import display
# Render the current `result_image` inline in the notebook output.
display(result_image)


In [None]:
import torch
from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel
import numpy as np
from PIL import Image
from IPython.display import display

# The device must match the one used for the textual-inversion training.
device = "cuda" if torch.cuda.is_available() else "cpu"


def _make_inpaint_condition(image, image_mask):
    """Build the conditioning tensor for the inpaint ControlNet.

    The image is scaled to [0, 1] and every pixel selected by the mask
    (mask value above 0.5 after scaling) is set to -1 to mark it as "to be
    filled". Returns a float32 tensor of shape (1, 3, H, W).
    """
    pixels = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
    pixels[mask > 0.5] = -1.0  # flag masked pixels
    pixels = np.expand_dims(pixels, 0).transpose(0, 3, 1, 2)
    return torch.from_numpy(pixels)


# 1. Load the ControlNet model (inpainting variant) and the inpainting pipeline.
# The tokenizer and text encoder are taken from the fine-tuned `pipe`: a freshly
# downloaded text encoder has no entry for <my_concept>, so the learned
# embedding would otherwise be ignored by the prompt below.
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16)
pipe_inpaint = StableDiffusionControlNetInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
    controlnet=controlnet,
    tokenizer=pipe.tokenizer,
    text_encoder=pipe.text_encoder,
    torch_dtype=torch.float16
)
pipe_inpaint = pipe_inpaint.to(device)

# 2. Load the previously generated image (the textual-inversion result)
input_image = Image.open("textual_inversion_test_result.png").convert("RGB")
input_image.save("inpaint_input.png")

# 3. Build an inpainting mask (here: the central region of the image)
img_np = np.array(input_image)
mask_np = np.zeros_like(img_np)
h, w, _ = mask_np.shape
mask_np[h//4: 3*h//4, w//4: 3*w//4, :] = 255  # White area = region to repaint
mask_image = Image.fromarray(mask_np)
mask_image.save("inpaint_mask.png")

# The inpaint ControlNet is conditioned on the image with the masked region
# blanked out, not on the untouched input image.
control_image = _make_inpaint_condition(input_image, mask_image)

# 4. Inpainting prompt: request an edit using the trained token
inpaint_prompt = "a photo of <my_concept> with detailed enhancements in the masked area"

# 5. Run the inpainting
inpaint_result = pipe_inpaint(
    prompt=inpaint_prompt,
    image=input_image,
    mask_image=mask_image,
    control_image=control_image,
    guidance_scale=7.5
)

# 6. Save and display the result
inpaint_result.images[0].save("controlnet_inpaint_result.png")
display(inpaint_result.images[0])


In [37]:
import os

# Create both output folders; an already existing folder is not an error.
for _folder in ("generated_before", "generated_after"):
    os.makedirs(_folder, exist_ok=True)


In [None]:
# Image generated before fine-tuning (example)
# NOTE(review): this samples from whatever `pipe` is currently bound to. If the
# training cell has already been run, `pipe` is presumably the fine-tuned
# pipeline and this is not a true "before" sample — confirm the run order.
result_before = pipe("a photo of <my_concept>", guidance_scale=7.5)
result_image_before = result_before.images[0]
result_image_before.save("generated_before/image1.png")


In [None]:
# Generate and save the image produced after fine-tuning
# The prompt has no placeholders, so it is a plain string rather than an f-string.
test_prompt = "a photo of <my_concept>"  # Uses the newly trained token
result_after = pipe(test_prompt, guidance_scale=7.5)
result_image_after = result_after.images[0]
result_image_after.save("generated_after/image1.png")



In [None]:
# Install the required library
!pip install pytorch-fid

import torch
from pytorch_fid import fid_score

# Default to the CPU and switch to the GPU when one is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

dir_before = "generated_before"   # Folder with the images from before fine-tuning
dir_after = "generated_after"     # Folder with the images from after fine-tuning

# Compute the FID between the two folders and report it.
fid_value = fid_score.calculate_fid_given_paths(
    [dir_before, dir_after],
    batch_size=50,
    device=device,
    dims=2048,
)
print("FID Skoru:", fid_value)


In [None]:
# Install Gradio, which the demo UI cell below imports.
!pip install gradio


In [None]:
import gradio as gr

def generate_image(prompt: str):
    """Generate one image for ``prompt`` using the notebook-level ``pipe``.

    The pipeline is called with a guidance scale of 7.5 and the first image
    of its output is returned.
    """
    return pipe(prompt, guidance_scale=7.5).images[0]

# Sample prompts passed to `gr.Examples` in the UI below, so they can be
# loaded into the prompt box.
example_prompts = [
    "A classic portrait painted with expressive brush strokes; detailed and realistic, featuring soft and dramatic lighting, a rich warm color palette, emotional expressions, and elegant textures.",
    "An abstract, modern-style portrait rendered with dynamic brush strokes; featuring bold color contrasts, impressive light play, unique textures, and a contemporary artistic interpretation.",
    "An exquisite, classical portrait meticulously painted with delicate, layered brush strokes that capture timeless beauty. The subject's refined features and expressive eyes exude quiet dignity, illuminated by soft, ethereal natural lighting. A warm, rich color palette with deep earth tones and subtle highlights imbues the scene with nostalgia and elegance, reminiscent of masterful Renaissance techniques.",
    "An avant-garde, modern portrait characterized by dynamic, energetic brush strokes and bold, abstract forms. The image features a vivid interplay of stark, contrasting colors and dramatic lighting effects that create a sense of movement and intensity. Blending innovative digital techniques with traditional art aesthetics, this high-resolution artwork boasts intricate textures and a visually captivating composition that pushes the boundaries of contemporary portraiture.",

]

# Markdown shown in the "Process Overview" tab of the demo.
# The file names below match the files the notebook cells actually write
# (portraitex.png, textual_inversion_test_result.png, controlnet_inpaint_result.png).
# NOTE(review): the epoch/step count and the FID description were not changed;
# the visible training cell sets num_epochs = 3 and the FID cell compares the
# generated_before and generated_after folders — confirm the wording.
case_explanation = """
### Process Overview

**Model Setup:**
The CompVis/stable-diffusion-v1-4 model was loaded in fp16 mode via the `diffusers` library and transferred to the GPU. A baseline image was generated using the prompt "a portrait of a person" and saved as *portraitex.png*.

**Fine-Tuning via Textual Inversion:**
A new token `<my_concept>` was added to the tokenizer and integrated into the text encoder’s input embeddings. During fine-tuning, all parameters of UNet, VAE, and the remaining text encoder components were frozen; only the embedding corresponding to `<my_concept>` was optimized using a specialized portrait dataset over 1 epoch (~1628 steps). The fine-tuned output was saved as *textual_inversion_test_result.png*.

**Advanced Techniques - ControlNet Inpainting:**
The fine-tuned image was further refined using ControlNet inpainting. A mask was applied to the central area of the image, and the inpainting pipeline was executed with `<my_concept>` included in the prompt. The final result was saved as *controlnet_inpaint_result.png*.

**Performance Evaluation:**
The generated images were quantitatively compared to real portrait images using the Fréchet Inception Distance (FID) metric, which was calculated to be approximately 189.20. This indicates that further fine-tuning and hyperparameter optimization may be needed to enhance image fidelity.
"""

# Stylesheet passed to `gr.Blocks(css=...)` for the demo page.
# The font import requests weight 600 as well, because the `button` rule below
# uses `font-weight: 600`.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Montserrat:wght@300;400;600;700&display=swap');

body {
    background: linear-gradient(135deg, #141e30, #243b55);
    font-family: 'Montserrat', sans-serif;
    margin: 0;
    padding: 0;
}

.container {
    background: rgba(255, 255, 255, 0.95);
    border: 1px solid #dfe6e9;
    border-radius: 15px;
    padding: 40px;
    margin: 50px auto;
    max-width: 950px;
    box-shadow: 0 8px 30px rgba(0, 0, 0, 0.1);
}

h1 {
    color: #2c3e50;
    text-align: center;
    font-size: 3rem;
    margin-bottom: 20px;
}

h2 {
    /* Gradient blue-green text effect */
    background: linear-gradient(to right, #007acc, #00a896);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    text-align: center;
    font-size: 2rem;
    margin-bottom: 20px;
}

p, li {
    color: #7f8c8d;
    font-size: 1.1rem;
    line-height: 1.6;
}

.gradio-container {
    background: transparent;
    border: none;
}

input, textarea {
    border: 1px solid #bdc3c7;
    border-radius: 8px;
    padding: 12px;
    width: 100%;
    box-sizing: border-box;
}

button {
    /* Blue-green gradient background for a professional look */
    background: linear-gradient(45deg, #007acc, #00a896);
    color: #ffffff;
    border: none;
    border-radius: 8px;
    padding: 12px 20px;
    font-size: 1rem;
    font-weight: 600;
    cursor: pointer;
    transition: background 0.3s ease;
}

button:hover {
    background: linear-gradient(45deg, #00a896, #007acc);
}

footer {
    text-align: center;
    margin-top: 30px;
    font-size: 0.9rem;
    color: #999999;
}
"""

# Build the demo page: a heading area, an "Image Generation" tab wired to
# generate_image, and a "Process Overview" tab showing case_explanation.
with gr.Blocks(css=custom_css) as demo:
    # Use a real layout container carrying the `container` class. Opening a
    # <div> in one Markdown component and closing it in another does not wrap
    # the components in between, because each component renders its own HTML.
    with gr.Column(elem_classes="container"):
        gr.Markdown("<h1>Welcome to Lumina Portraits</h1>")
        gr.Markdown("<p style='text-align:center; font-size:1.2rem; color:#2c3e50;'>Experience the future of portrait generation with our state-of-the-art fine-tuned Stable Diffusion model.</p>")
        gr.Markdown("<h2>Stable Diffusion Fine-Tuned Showcase</h2>")

        with gr.Tabs():
            with gr.TabItem("Image Generation"):
                with gr.Row():
                    with gr.Column(scale=2):
                        prompt_input = gr.Textbox(lines=3, placeholder="Type your prompt here...", label="Prompt")
                        generate_btn = gr.Button("Generate Image")
                    with gr.Column(scale=3):
                        image_output = gr.Image(label="Generated Image", type="pil")
                # Clicking the button sends the prompt text to generate_image
                # and shows the returned image.
                generate_btn.click(fn=generate_image, inputs=prompt_input, outputs=image_output)
                gr.Markdown("<h2>Example Prompts</h2>")
                gr.Examples(
                    examples=example_prompts,
                    inputs=prompt_input,
                )
            with gr.TabItem("Process Overview"):
                gr.Markdown(case_explanation)
    # NOTE(review): the footer says "Aurora Portraits" while the heading says
    # "Lumina Portraits" — confirm which brand name is intended.
    gr.Markdown("<footer>© 2025 Aurora Portraits - All Rights Reserved.</footer>")

demo.launch()


