# 📝 Blog Migration Tool - Google Colab Version

Automated blog content migration with AI-powered rewriting using Gemini AI.

## Features:
- Extract content from any blog URL (RSS or web scraping)
- Rewrite content with AI to avoid plagiarism
- Publish automatically to Blogger
- Schedule posts over time (planned; this notebook version publishes in manually sized batches)

## Setup:
1. Run all cells in order
2. Configure your API keys when prompted
3. Add source blog URLs
4. Process and publish!

In [None]:
# Install required packages
!pip install -q supabase beautifulsoup4 feedparser google-api-python-client google-genai pandas requests trafilatura python-dotenv

In [None]:
# Import libraries
import os
import sys
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import json
from IPython.display import display, HTML, clear_output
import pandas as pd

In [None]:
# Configuration - credentials are taken from the environment or prompted for.
# Never hard-code API keys in a notebook: notebooks get shared and committed.
# Any key that was ever pasted into this cell should be revoked and regenerated.
from getpass import getpass


def _require_setting(name: str, prompt: str, secret: bool = True) -> str:
    """Return the setting *name*, asking the user for it when it is not set.

    The value is looked up in ``os.environ`` first. When it is missing or
    blank the user is prompted; secrets are read with ``getpass`` so they are
    not echoed into the cell output. The final value is written back to
    ``os.environ`` so later cells can read it with ``os.getenv``.

    Args:
        name: Environment variable to read and populate.
        prompt: Human-readable label shown when prompting.
        secret: When True the input is hidden while typing.

    Returns:
        The non-empty, whitespace-stripped value.

    Raises:
        ValueError: If the user supplies an empty value.
    """
    value = os.environ.get(name, '').strip()
    if not value:
        ask = getpass if secret else input
        value = ask(f'{prompt}: ').strip()
    if not value:
        raise ValueError(f'{name} is required')
    os.environ[name] = value
    return value


_require_setting('SUPABASE_URL', 'Supabase project URL', secret=False)
_require_setting('SUPABASE_ANON_KEY', 'Supabase anon key')

# Your API keys (kept as module-level names because later cells use them)
GEMINI_API_KEY = _require_setting('GEMINI_API_KEY', 'Gemini API key')
BLOGGER_API_KEY = _require_setting('BLOGGER_API_KEY', 'Blogger API key')

print('✅ Configuration loaded!')

In [None]:
# Database Module
from supabase import create_client, Client

class Database:
    """Wrapper around the Supabase ``sources`` and ``posts`` tables.

    Posts move through the ``status`` values ``extracted`` -> ``rewritten``
    -> ``published`` as the notebook processes them.
    """

    def __init__(self):
        """Create the Supabase client from environment configuration.

        Raises:
            RuntimeError: If SUPABASE_URL or SUPABASE_ANON_KEY is not set.
        """
        supabase_url = os.getenv('SUPABASE_URL')
        supabase_key = os.getenv('SUPABASE_ANON_KEY')
        if not supabase_url or not supabase_key:
            raise RuntimeError(
                'SUPABASE_URL and SUPABASE_ANON_KEY must be set before '
                'creating the Database'
            )
        self.client: Client = create_client(supabase_url, supabase_key)

    def add_source(self, url: str, name: str) -> bool:
        """Insert a source blog; return True on success, False on any failure.

        The failure reason is printed instead of being silently discarded so
        the user can tell a rejected insert from a connectivity problem.
        """
        try:
            self.client.table('sources').insert({'url': url, 'name': name}).execute()
        except Exception as exc:  # best-effort boundary: callers expect a bool
            print(f'⚠️ Could not add source: {exc}')
            return False
        return True

    def get_all_sources(self) -> List[Dict]:
        """Return every source row, newest first (by ``created_at``)."""
        response = (
            self.client.table('sources')
            .select('*')
            .order('created_at', desc=True)
            .execute()
        )
        return response.data

    def add_post(self, source_id: str, title: str, content: str, source_url: str,
                 images: Optional[List] = None, tags: Optional[List] = None) -> str:
        """Insert a post with status ``extracted`` and return its id."""
        response = self.client.table('posts').insert({
            'source_id': source_id,
            'title': title,
            'content': content,
            'source_url': source_url,
            'images': images or [],
            'tags': tags or [],
            'status': 'extracted'
        }).execute()
        return response.data[0]['id']

    def is_duplicate(self, title: str, source_url: str) -> bool:
        """Return True if a post with this title OR this source URL exists.

        Two separate ``eq`` lookups are used instead of one hand-built
        ``or_`` filter string: titles and URLs routinely contain commas,
        parentheses and dots, which are syntax in a raw PostgREST filter and
        would break or change the query when interpolated into it.
        """
        for column, value in (('title', title), ('source_url', source_url)):
            response = (
                self.client.table('posts')
                .select('id')
                .eq(column, value)
                .limit(1)
                .execute()
            )
            if response.data:
                return True
        return False

    def get_posts_by_status(self, status: str, limit: int = 100) -> List[Dict]:
        """Return up to *limit* posts with the given status, newest first."""
        response = (
            self.client.table('posts')
            .select('*')
            .eq('status', status)
            .order('created_at', desc=True)
            .limit(limit)
            .execute()
        )
        return response.data

    def update_post_rewritten(self, post_id: str, rewritten_title: str, rewritten_content: str,
                             meta_description: Optional[str] = None,
                             suggested_tags: Optional[List] = None):
        """Store the AI rewrite for a post and mark it ``rewritten``."""
        self.client.table('posts').update({
            'rewritten_title': rewritten_title,
            'rewritten_content': rewritten_content,
            'meta_description': meta_description,
            'suggested_tags': suggested_tags or [],
            'status': 'rewritten'
        }).eq('id', post_id).execute()

    def update_post_published(self, post_id: str, published_url: str):
        """Record the published URL for a post and mark it ``published``."""
        self.client.table('posts').update({
            'published_url': published_url,
            'status': 'published'
        }).eq('id', post_id).execute()

    def _count_rows(self, table: str, statuses: Optional[List[str]] = None) -> int:
        """Count rows in *table*, optionally restricted to the given statuses.

        Asks the server for an exact count so the result is not capped by the
        API's maximum page size; falls back to the number of returned rows if
        the response carries no count.
        """
        query = self.client.table(table).select('id', count='exact')
        if statuses:
            query = query.in_('status', statuses)
        response = query.execute()
        if response.count is not None:
            return response.count
        return len(response.data)

    def get_statistics(self) -> Dict:
        """Return dashboard counters.

        Keys: ``total_sources``, ``total_extracted`` (status ``extracted``),
        ``total_published`` (status ``published``) and ``total_pending``
        (status ``extracted`` or ``rewritten``).
        """
        return {
            'total_sources': self._count_rows('sources'),
            'total_extracted': self._count_rows('posts', ['extracted']),
            'total_published': self._count_rows('posts', ['published']),
            'total_pending': self._count_rows('posts', ['extracted', 'rewritten'])
        }

print('✅ Database module loaded!')

In [None]:
# Content Extractor Module
import feedparser
import trafilatura
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re

class ContentExtractor:
    """Extract blog posts from a site, trying its feed first, then scraping.

    Every extracted post is a dict with the keys ``title``, ``url``,
    ``content``, ``images`` and ``tags``.
    """

    def __init__(self, timeout: int = 10):
        """Set the request headers and the per-request timeout in seconds."""
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        self.timeout = timeout

    def extract_from_url(self, url: str, max_posts: int = 10) -> List[Dict]:
        """Return up to *max_posts* posts from *url*.

        The URL is parsed as a feed first; when that yields nothing the page
        is scraped for article links instead.
        """
        posts = self._extract_from_rss(url, max_posts)
        if not posts:
            posts = self._extract_from_webpage(url, max_posts)
        return posts

    def _extract_from_rss(self, url: str, max_posts: int) -> List[Dict]:
        """Parse *url* as a feed; return [] when it has no entries or fails."""
        try:
            feed = feedparser.parse(url)
            posts = []
            for entry in feed.entries[:max_posts]:
                # Prefer the full content element, fall back to the summary
                # when it is absent or empty.
                content_items = entry.get('content') or []
                raw_html = content_items[0].get('value', '') if content_items else ''
                if not raw_html:
                    raw_html = entry.get('summary', '')

                # A tag without a term must not abort the whole feed.
                tags = [
                    tag.get('term')
                    for tag in (entry.get('tags') or [])
                    if tag.get('term')
                ]

                posts.append({
                    'title': entry.get('title', 'Untitled'),
                    'url': entry.get('link', url),
                    'content': self._clean_html(raw_html),
                    'images': [],
                    'tags': tags
                })
            return posts
        except Exception as exc:  # best-effort: caller falls back to scraping
            print(f'⚠️ Feed extraction failed for {url}: {exc}')
            return []

    def _extract_from_webpage(self, url: str, max_posts: int) -> List[Dict]:
        """Scrape *url* for same-site article links and extract each one."""
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f'⚠️ Could not fetch {url}: {exc}')
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        posts = []
        for link in self._find_article_links(soup, url)[:max_posts]:
            post = self._scrape_single_article(link)
            if post:
                posts.append(post)
        return posts

    def _scrape_single_article(self, url: str) -> Optional[Dict]:
        """Download one article and return it as a post dict, or None.

        The page is fetched once and the same HTML is used both for the main
        text extraction and for finding the title.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            html = response.text

            content = trafilatura.extract(html)
            if not content:
                return None

            soup = BeautifulSoup(html, 'html.parser')
            return {
                'title': self._extract_title(soup),
                'url': url,
                'content': content,
                'images': [],
                'tags': []
            }
        except Exception as exc:  # best-effort: one bad article is skipped
            print(f'⚠️ Skipping {url}: {exc}')
            return None

    def _find_article_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Return unique same-host links found under common article selectors.

        Links are returned in the order they are first found, so slicing the
        result to a maximum count is deterministic.
        """
        base_host = urlparse(base_url).netloc
        links = []
        for selector in ['article a[href]', '.post a[href]', 'h2 a[href]', 'h3 a[href]']:
            for elem in soup.select(selector):
                href = elem.get('href')
                if not href:
                    continue
                full_url = urljoin(base_url, href)
                if urlparse(full_url).netloc == base_host:
                    links.append(full_url)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(links))

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Return the first <h1> text, else the <title> text, else a default."""
        h1 = soup.find('h1')
        if h1:
            return h1.get_text(strip=True)
        title_tag = soup.find('title')
        return title_tag.get_text(strip=True) if title_tag else 'Untitled Post'

    def _clean_html(self, html: str) -> str:
        """Return the text of *html* with <script> and <style> removed."""
        if not html:
            return ''
        soup = BeautifulSoup(html, 'html.parser')
        for element in soup(['script', 'style']):
            element.decompose()
        return soup.get_text(separator=' ').strip()

print('✅ Content Extractor module loaded!')

In [None]:
# AI Rewriter Module
from google import genai

class AIRewriter:
    """Rewrite blog posts with a Gemini model and parse the structured reply."""

    # Model used when the caller does not choose one.
    DEFAULT_MODEL = 'gemini-2.0-flash-exp'

    def __init__(self, api_key: str, model: str = DEFAULT_MODEL):
        """Create the Gemini client.

        Args:
            api_key: Gemini API key.
            model: Model identifier passed to ``generate_content``.
        """
        self.client = genai.Client(api_key=api_key)
        self.model = model

    def rewrite_post(self, title: str, content: str) -> Dict:
        """Ask the model to rewrite a post and return the parsed sections.

        Returns:
            Dict with keys ``title``, ``content``, ``meta_description`` and
            ``tags`` (a list of strings).

        Raises:
            ValueError: If the reply contains no rewritten title or content,
                so an unusable reply is never stored as a finished rewrite.
        """
        prompt = f"""Rewrite this blog post completely to avoid plagiarism while preserving the core message.

Original Title: {title}
Original Content: {content}

Output Format:
REWRITTEN_TITLE:
[title here]

REWRITTEN_CONTENT:
[content here]

META_DESCRIPTION:
[150-160 character description]

TAGS:
[comma-separated tags]
"""

        response = self.client.models.generate_content(model=self.model, contents=prompt)
        result = self._parse_response(response.text)
        if not result['title'] or not result['content']:
            raise ValueError('Model reply did not contain a rewritten title and content')
        return result

    @staticmethod
    def _section(text: str, start: str, end: Optional[str] = None) -> str:
        """Return the stripped text after *start* and before *end*.

        Returns '' when *start* is absent. When *end* is None or absent the
        section runs to the end of *text*.
        """
        if start not in text:
            return ''
        body = text.split(start, 1)[1]
        if end is not None:
            body = body.split(end, 1)[0]
        return body.strip()

    def _parse_response(self, text: str) -> Dict:
        """Split the model reply into its labelled sections.

        Missing sections are returned as '' (or [] for tags). An empty or
        None *text* yields the all-empty result instead of raising. Blank
        tags, e.g. from a trailing comma, are dropped.
        """
        result = {'title': '', 'content': '', 'meta_description': '', 'tags': []}
        if not text:
            return result

        result['title'] = self._section(text, 'REWRITTEN_TITLE:', 'REWRITTEN_CONTENT:')
        result['content'] = self._section(text, 'REWRITTEN_CONTENT:', 'META_DESCRIPTION:')
        result['meta_description'] = self._section(text, 'META_DESCRIPTION:', 'TAGS:')

        tags_text = self._section(text, 'TAGS:')
        result['tags'] = [tag.strip() for tag in tags_text.split(',') if tag.strip()]

        return result

print('✅ AI Rewriter module loaded!')

In [None]:
# Blogger Publisher Module
from googleapiclient.discovery import build

class BloggerPublisher:
    """Publish posts to a Blogger blog through the Blogger v3 API."""

    def __init__(self, api_key: str, credentials=None):
        """Build the Blogger service client.

        Args:
            api_key: Google API key, passed as ``developerKey``.
            credentials: Optional OAuth 2.0 credentials object forwarded to
                ``googleapiclient.discovery.build``. Inserting posts is an
                authorised operation in the Blogger API, so publishing with
                an API key alone is expected to be rejected; pass credentials
                for a user who may write to the blog.
        """
        self.service = build(
            'blogger', 'v3', developerKey=api_key, credentials=credentials
        )

    def publish_post(self, blog_id: str, title: str, content: str,
                     labels: Optional[List] = None) -> str:
        """Insert a post and return its URL ('' if the response has none).

        Args:
            blog_id: Target Blogger blog id.
            title: Post title.
            content: Post body.
            labels: Optional labels; empty or None entries are dropped.
        """
        post_body = {
            'kind': 'blogger#post',
            'blog': {'id': blog_id},
            'title': title,
            'content': content
        }
        clean_labels = [label for label in (labels or []) if label]
        if clean_labels:
            post_body['labels'] = clean_labels

        response = self.service.posts().insert(blogId=blog_id, body=post_body).execute()
        return response.get('url', '')

print('✅ Blogger Publisher module loaded!')

In [None]:
# Create the shared service objects that the remaining cells rely on:
# database access, content extraction, AI rewriting and Blogger publishing.
db = Database()
extractor = ContentExtractor()
rewriter = AIRewriter(api_key=GEMINI_API_KEY)
publisher = BloggerPublisher(api_key=BLOGGER_API_KEY)

print('🚀 All modules initialized successfully!')

In [None]:
# Dashboard: fetch the current counters and render them as four HTML cards
stats = db.get_statistics()

# Build the markup first, then hand it to the notebook display machinery.
dashboard_markup = f"""
<h2>📊 Dashboard</h2>
<div style="display: flex; gap: 20px;">
    <div style="padding: 20px; background: #f0f0f0; border-radius: 8px;">
        <h3>{stats['total_sources']}</h3>
        <p>Total Sources</p>
    </div>
    <div style="padding: 20px; background: #e3f2fd; border-radius: 8px;">
        <h3>{stats['total_extracted']}</h3>
        <p>Extracted Posts</p>
    </div>
    <div style="padding: 20px; background: #c8e6c9; border-radius: 8px;">
        <h3>{stats['total_published']}</h3>
        <p>Published Posts</p>
    </div>
    <div style="padding: 20px; background: #fff9c4; border-radius: 8px;">
        <h3>{stats['total_pending']}</h3>
        <p>Pending Posts</p>
    </div>
</div>
"""
display(HTML(dashboard_markup))

In [None]:
# Add Source Blog
# The URL is validated before it is stored so that blank input or a bare
# word never reaches the database and later fails during extraction.
from urllib.parse import urlparse

source_url = input('Enter blog homepage URL: ').strip()
source_name = input('Enter a name for this source (optional): ').strip() or source_url

parsed_source = urlparse(source_url)
if parsed_source.scheme not in ('http', 'https') or not parsed_source.netloc:
    print('❌ Please enter a full URL starting with http:// or https://')
elif db.add_source(source_url, source_name):
    print(f'✅ Added source: {source_name}')
else:
    print('❌ Source already exists or invalid')

In [None]:
# Extract Content from Sources
# Lists the stored sources, lets the user pick one, extracts its posts and
# stores every post that is not already in the database.
sources = db.get_all_sources()

if not sources:
    print('No sources available. Add sources first.')
else:
    print('Available sources:')
    for number, source in enumerate(sources, start=1):
        print(f"{number}. {source['name']} - {source['url']}")

    try:
        source_idx = int(input('Select source number: ')) - 1
        max_posts = int(input('Maximum posts to extract (default 10): ').strip() or '10')
    except ValueError:
        print('❌ Please enter whole numbers only.')
    else:
        # Reject 0/negative/out-of-range choices explicitly: a negative index
        # would otherwise silently pick a source from the end of the list.
        if not 0 <= source_idx < len(sources):
            print(f'❌ Source number must be between 1 and {len(sources)}.')
        elif max_posts < 1:
            print('❌ Maximum posts must be at least 1.')
        else:
            selected_source = sources[source_idx]

            print(f"\n🔍 Extracting content from {selected_source['name']}...")
            posts = extractor.extract_from_url(selected_source['url'], max_posts)

            added = 0
            for post in posts:
                try:
                    if db.is_duplicate(post['title'], post['url']):
                        continue
                    db.add_post(
                        source_id=selected_source['id'],
                        title=post['title'],
                        content=post['content'],
                        source_url=post['url'],
                        images=post.get('images', []),
                        tags=post.get('tags', [])
                    )
                    added += 1
                except Exception as e:
                    # One failing post must not abort the rest of the batch.
                    print(f"❌ Could not store '{post['title'][:60]}': {e}")

            print(f'\n✅ Extracted {added} new posts!')

In [None]:
# Rewrite Posts with AI
# Sends extracted posts to the rewriter and stores each successful rewrite.
posts_to_rewrite = db.get_posts_by_status('extracted')

if not posts_to_rewrite:
    print('No posts to rewrite. Extract content first.')
else:
    print(f'Found {len(posts_to_rewrite)} posts to rewrite')
    try:
        requested_count = int(input(f'How many posts to process (max {len(posts_to_rewrite)}): '))
    except ValueError:
        print('❌ Please enter a whole number.')
        requested_count = 0

    # Clamp to the valid range so a negative or oversized number cannot
    # produce a wrong slice or a wrong progress denominator.
    process_count = max(0, min(requested_count, len(posts_to_rewrite)))

    rewritten_ok = 0
    for i, post in enumerate(posts_to_rewrite[:process_count], start=1):
        print(f"\n[{i}/{process_count}] Rewriting: {(post.get('title') or '')[:60]}...")

        try:
            rewritten = rewriter.rewrite_post(post['title'], post['content'])

            db.update_post_rewritten(
                post_id=post['id'],
                rewritten_title=rewritten['title'],
                rewritten_content=rewritten['content'],
                meta_description=rewritten.get('meta_description'),
                suggested_tags=rewritten.get('tags', [])
            )

            rewritten_ok += 1
            print('✅ Rewritten successfully')
        except Exception as e:
            print(f'❌ Error: {str(e)}')

    # Report what actually succeeded, not what was requested.
    print(f'\n✅ Processed {rewritten_ok} posts!')
    if rewritten_ok < process_count:
        print(f'⚠️ {process_count - rewritten_ok} posts failed and remain in the extracted state.')

In [None]:
# Publish to Blogger
# Publishes rewritten posts to the given blog and records each published URL.
blog_id = input('Enter your Blogger Blog ID: ').strip()
posts_to_publish = db.get_posts_by_status('rewritten')

if not blog_id:
    print('❌ A Blogger Blog ID is required.')
elif not posts_to_publish:
    print('No posts ready to publish. Rewrite posts first.')
else:
    print(f'Found {len(posts_to_publish)} posts ready to publish')
    try:
        requested_count = int(input(f'How many posts to publish: '))
    except ValueError:
        print('❌ Please enter a whole number.')
        requested_count = 0

    # Clamp to the valid range so a negative or oversized number cannot
    # produce a wrong slice or a wrong progress denominator.
    publish_count = max(0, min(requested_count, len(posts_to_publish)))

    published_ok = 0
    for i, post in enumerate(posts_to_publish[:publish_count], start=1):
        # Stored values may be None; normalise before slicing or sending.
        post_title = post.get('rewritten_title') or ''
        post_content = post.get('rewritten_content') or ''
        print(f"\n[{i}/{publish_count}] Publishing: {post_title[:60]}...")

        try:
            if not post_title or not post_content:
                raise ValueError('post has no rewritten title or content')

            published_url = publisher.publish_post(
                blog_id=blog_id,
                title=post_title,
                content=post_content,
                labels=post.get('suggested_tags') or []
            )

            db.update_post_published(post_id=post['id'], published_url=published_url)

            published_ok += 1
            print(f'✅ Published: {published_url}')
        except Exception as e:
            print(f'❌ Error: {str(e)}')

    # Report what actually succeeded, not what was requested.
    print(f'\n✅ Published {published_ok} posts!')
    if published_ok < publish_count:
        print(f'⚠️ {publish_count - published_ok} posts failed and remain in the rewritten state.')

## 🎉 Complete!

You have successfully migrated blog content with AI rewriting!

### Next Steps:
- Add more source blogs
- Extract more content
- Process and publish in batches
- Monitor your published posts on Blogger