# 🚀 Ultra-Fast Startup Scraper - Colab Edition
## Optimized for 100+ GB RAM

This notebook is specifically optimized for Google Colab's massive RAM:
- **Unlimited caching** (no memory limits!)
- **100+ parallel workers** for extreme speed
- **All results in memory** for fastest access
- **Target: 100+ companies/second** 🔥
- **Silent mode** - only shows progress bar

---

## 📦 Step 1: Install Dependencies

In [None]:
%%capture
# Install required packages (silent mode)
!pip install beautifulsoup4 lxml requests fake-useragent langdetect nltk tqdm

# Post-install setup: silence Python warnings and fetch the NLTK tokenizer data.
import nltk
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Download the 'punkt' and 'punkt_tab' NLTK resources without progress output.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# NOTE(review): this cell starts with %%capture, which presumably hides this
# message as well -- confirm whether the confirmation is meant to be visible.
print("‚úÖ Setup complete!")

## 📁 Step 2: Upload Dataset

Choose one option:
- **Option A**: Upload file directly
- **Option B**: Mount Google Drive

In [None]:
# Option A: Upload file directly
# Opens the Colab upload widget and records the uploaded file's name in
# DATASET_FILE, which the scraper cell later opens as a CSV.
from google.colab import files
import os

print("üì§ Upload your dataset file...")
uploaded = files.upload()

# An empty result (upload cancelled / nothing selected) would otherwise surface
# as an opaque IndexError; fail with an actionable message instead.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select the dataset file.")

# When several files are uploaded only the first one is used.
DATASET_FILE = next(iter(uploaded))
print(f"‚úÖ File uploaded: {DATASET_FILE}")

In [None]:
# Option B: Mount Google Drive (uncomment to use)
# from google.colab import drive
# drive.mount('/content/drive')

# # Set path to your dataset in Drive
# DATASET_FILE = '/content/drive/MyDrive/big_startup_secsees_dataset.csv'
# print(f"‚úÖ Using dataset: {DATASET_FILE}")

## ⚙️ Step 3: Configuration

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide settings read by the scraper cells below.
# ---------------------------------------------------------------------------

# Row window of the dataset to process; used as slice bounds (start inclusive,
# end exclusive), so an END_ROW beyond the dataset length is harmless.
START_ROW, END_ROW = 0, 66000

# Size of the worker thread pool.
MAX_WORKERS = 100
# Number of completed companies between checkpoint writes.
CHECKPOINT_INTERVAL = 1000

# Passed to the scraper constructor as `unlimited_cache`.
UNLIMITED_CACHE = True

# Output locations: JSON-lines checkpoint and the final CSV.
CHECKPOINT_FILE = 'scraper_checkpoint.jsonl'
FINAL_OUTPUT_FILE = 'enhanced_dataset.csv'

# Echo the effective configuration.
print("\n".join([
    "‚öôÔ∏è Configuration:",
    f"   Workers: {MAX_WORKERS}",
    f"   Rows: {START_ROW} to {END_ROW}",
    "   Memory: UNLIMITED ‚ôæÔ∏è",
]))

## 🔧 Step 4: Scraper Code (Silent Mode)

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import time
import re
import json
from urllib.parse import urlparse
from typing import Dict, List, Optional, Any, Tuple, Set
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from dataclasses import dataclass
import langdetect
from fake_useragent import UserAgent
import urllib3
from threading import Lock
from tqdm.notebook import tqdm
import gc
import warnings

# SILENT MODE - Disable all warnings and logs
# These three calls are process-wide switches, not scoped to the scraper:
#   * hide every Python warning,
#   * hide urllib3 warnings (the scraper issues requests with verify=False),
#   * drop all logging calls at CRITICAL severity and below.
warnings.filterwarnings('ignore')
urllib3.disable_warnings()
logging.disable(logging.CRITICAL)

# One input row of the startup dataset, as consumed by the scraper.
# run_scraper() fills the fields from the CSV columns named in the comments.
@dataclass
class CompanyData:
    name: str  # CSV column 'name'
    url: str  # CSV column 'homepage_url'; the page that gets fetched
    category: str  # CSV column 'category_list'
    funding: str  # CSV column 'funding_total_usd' (kept as the raw string)
    status: str  # CSV column 'status'; 'ipo' / 'acquired' mark a row as success
    location: str  # CSV column 'city'
    original_data: Dict[str, Any]  # the complete CSV row as read by csv.DictReader

class FastURLProcessor:
    """Stateless helpers that normalise raw URL strings and derive fallback URLs."""

    @staticmethod
    def clean_and_validate_url(url: str) -> Optional[str]:
        """Return a fetchable URL derived from ``url``, or ``None`` if unusable.

        Whitespace is removed, ``https://`` is prepended when no http(s) scheme
        is present (only if the text contains a dot), and the result must parse
        with a network location of at least three characters.

        Args:
            url: Raw value from the dataset; may be empty, ``'-'`` or ``None``.

        Returns:
            The cleaned URL string, or ``None`` when the input cannot be used.
        """
        if not url or not isinstance(url, str):
            return None
        # Strip embedded whitespace *before* validating so that the string
        # that is checked is the string that is returned.
        url = ''.join(url.split())
        if not url or url == '-':
            return None
        # Schemes are case-insensitive: 'HTTP://x.com' must not be mistaken
        # for a scheme-less host and turned into 'https://HTTP://x.com'.
        if not url.lower().startswith(('http://', 'https://')):
            if '.' not in url:
                return None
            url = 'https://' + url
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects malformed authorities (e.g. unbalanced '[').
            return None
        if len(parsed.netloc) < 3:
            return None
        return url

    @staticmethod
    def get_url_variations(url: str) -> List[str]:
        """Return ``url`` followed by alternative URLs to try if it fails.

        The alternatives toggle the ``www.`` prefix (both https and http, path
        only -- query string and fragment are not carried over) and finally
        swap the scheme of the original URL.

        Args:
            url: An absolute http(s) URL.

        Returns:
            A list whose first element is always ``url`` itself.
        """
        variations = [url]
        try:
            parsed = urlparse(url)
        except ValueError:
            return variations

        domain = parsed.netloc
        path = parsed.path or ''
        if domain.lower().startswith('www.'):
            host = domain[4:]
        else:
            host = 'www.' + domain
        variations.append(f'https://{host}{path}')
        variations.append(f'http://{host}{path}')

        # Swap only the leading scheme; a blanket str.replace would also
        # rewrite URLs embedded in the query string.
        lowered = url.lower()
        if lowered.startswith('https://'):
            variations.append('http://' + url[len('https://'):])
        elif lowered.startswith('http://'):
            variations.append('https://' + url[len('http://'):])
        return variations

class EnhancedContentExtractor:
    """Heuristic, keyword-based feature extraction from a company homepage.

    All ``extract_*`` methods are best-effort: they never raise on missing or
    odd input and fall back to a neutral default instead.
    """

    # Canonical labels for business-model phrases whose ``str.title()`` form
    # would be wrong or inconsistent.
    _MODEL_LABELS = {
        'saas': 'SaaS',
        'software as a service': 'SaaS',
        'paas': 'PaaS',
        'platform as a service': 'PaaS',
    }

    def __init__(self):
        self.elite_schools = {
            'harvard', 'stanford', 'mit', 'yale', 'princeton', 'caltech',
            'berkeley', 'oxford', 'cambridge', 'wharton', 'columbia',
            'cornell', 'upenn', 'carnegie mellon', 'imperial', 'eth zurich'
        }
        self.tech_keywords = {
            'artificial intelligence', 'machine learning', 'deep learning',
            'blockchain', 'cryptocurrency', 'cloud computing', 'mobile app',
            'saas platform', 'api', 'microservices', 'kubernetes', 'docker',
            'react', 'angular', 'vue', 'python', 'java', 'golang', 'rust',
            'data science', 'big data', 'analytics platform', 'iot',
            'augmented reality', 'virtual reality', 'computer vision'
        }
        self.business_models = {
            'software as a service', 'saas', 'platform as a service', 'paas',
            'marketplace', 'e-commerce', 'subscription', 'freemium',
            'enterprise software', 'b2b', 'b2c', 'd2c'
        }

    @staticmethod
    def _mentions_any(text: str, terms) -> bool:
        """Return True if any term occurs in ``text`` as a whole word or phrase.

        A trailing plural ``s`` is tolerated. Whole-word matching keeps short
        tokens such as 'mit', 'cto' or 'mba' from matching inside unrelated
        words ('submit', 'director', 'combat').
        """
        ordered = sorted(terms, key=lambda t: (-len(t), t))
        if not ordered:
            return False
        pattern = r'\b(?:' + '|'.join(re.escape(t) for t in ordered) + r')s?\b'
        return re.search(pattern, text) is not None

    @staticmethod
    def _lowered(content) -> str:
        """Lower-case ``content``; anything that is not a string counts as empty."""
        return content.lower() if isinstance(content, str) else ''

    def extract_description(self, soup: BeautifulSoup, url: str = '') -> str:
        """Return a short description of the page, or ``""`` if none is found.

        Candidates come from description meta tags and a few hero/tagline
        selectors (first three matches per selector, 30-1000 characters).
        The first candidate of 80-400 characters wins; otherwise the longest
        candidate is returned, cut to 500 characters.

        Args:
            soup: Parsed page.
            url: Unused; kept for interface compatibility.
        """
        selectors = [
            ('meta[name="description"]', 'content'),
            ('meta[property="og:description"]', 'content'),
            ('meta[name="twitter:description"]', 'content'),
            ('.hero-description', 'text'),
            ('.tagline', 'text'),
            ('h1 + p', 'text'),
        ]
        descriptions = []
        for selector, attr_type in selectors:
            try:
                for elem in soup.select(selector)[:3]:
                    if attr_type == 'content':
                        desc = elem.get('content', '').strip()
                    else:
                        desc = elem.get_text().strip()
                    if desc and 30 <= len(desc) <= 1000:
                        descriptions.append(desc)
            except Exception:
                # Best effort: a selector that fails on this page is skipped.
                continue
        if not descriptions:
            return ""
        good = [d for d in descriptions if 80 <= len(d) <= 400]
        return good[0] if good else max(descriptions, key=len)[:500]

    def extract_founder_info(self, soup: BeautifulSoup, content: str) -> Dict[str, Any]:
        """Derive founder-related signals from the page text.

        Args:
            soup: Unused; kept for interface compatibility.
            content: Visible page text.

        Returns:
            Dict with ``founder_count`` (founder/CEO mentions, capped at 10),
            ``founder_education_quality`` ('Elite', 'Good' or 'Unknown'),
            ``founder_technical_background`` and
            ``founder_business_background`` (booleans).
        """
        text = self._lowered(content)
        info = {
            'founder_count': 0,
            'founder_education_quality': 'Unknown',
            'founder_technical_background': False,
            'founder_business_background': False,
        }
        if not text:
            return info

        founder_patterns = (r'\b(?:founder|co-founder|ceo)\b', r'\bfounded by\b')
        mentions = sum(len(re.findall(p, text)) for p in founder_patterns)
        info['founder_count'] = min(mentions, 10)

        if self._mentions_any(text, self.elite_schools):
            info['founder_education_quality'] = 'Elite'
        elif ('university' in text or 'college' in text
              or self._mentions_any(text, ('phd', 'mba'))):
            info['founder_education_quality'] = 'Good'

        # At least two of the three technical indicators must be present.
        tech_count = (
            int('engineer' in text)
            + int('developer' in text)
            + int(self._mentions_any(text, ('cto',)))
        )
        info['founder_technical_background'] = tech_count >= 2
        info['founder_business_background'] = (
            'consultant' in text or self._mentions_any(text, ('mba',))
        )
        return info

    def extract_business_model(self, soup: BeautifulSoup, content: str) -> str:
        """Return a label for the business model mentioned first in the text.

        Choosing the earliest mention (instead of whatever the set happens to
        yield first) makes the result reproducible between runs.

        Args:
            soup: Unused; kept for interface compatibility.
            content: Visible page text.

        Returns:
            A label such as 'SaaS' or 'Marketplace', or 'Unknown'.
        """
        text = self._lowered(content)
        hits = []
        for model in self.business_models:
            position = text.find(model)
            if position != -1:
                hits.append((position, model))
        if not hits:
            return 'Unknown'
        _, model = min(hits)
        return self._MODEL_LABELS.get(model, model.title())

    def extract_technology_stack(self, soup: BeautifulSoup, content: str) -> List[str]:
        """Return up to three technology keywords, in order of first mention.

        Keywords are matched as whole words/phrases. Ordering by position
        makes the selected three reproducible between runs.

        Args:
            soup: Unused; kept for interface compatibility.
            content: Visible page text.
        """
        text = self._lowered(content)
        hits = []
        for tech in self.tech_keywords:
            match = re.search(r'\b' + re.escape(tech) + r'\b', text)
            if match:
                hits.append((match.start(), tech))
        return [tech for _, tech in sorted(hits)[:3]]

print("‚úÖ Classes loaded")

In [None]:
class Colab_UltraFastScraper:
    """Fetch company homepages and turn them into a flat feature dict.

    One instance owns a single ``requests.Session`` plus two in-memory caches:
    ``page_cache`` (cleaned URL -> raw response body) and ``failure_cache``
    (cleaned URLs for which every URL variation failed).
    """

    # Media types that are parsed as HTML; matches the Accept header sent.
    _HTML_CONTENT_TYPES = ('text/html', 'application/xhtml+xml')

    def __init__(self, unlimited_cache=True, max_workers=100):
        """Create the HTTP session, retry policy, helpers and caches.

        Args:
            unlimited_cache: Stored on the instance; no code in this class
                consults it (the page cache is never evicted either way).
            max_workers: Stored on the instance and used to size the
                connection pool (never below 200).
        """
        self.session = requests.Session()
        self.unlimited_cache = unlimited_cache
        self.max_workers = max_workers

        try:
            self.ua = UserAgent()
        except Exception:
            # Fall back to a fixed User-Agent string when fake_useragent
            # cannot be initialised.
            self.ua = None

        retry_strategy = Retry(
            total=1,
            backoff_factor=0.2,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET"]
        )
        # Keep the pool at least as large as the number of concurrent callers.
        pool_size = max(200, int(max_workers or 0))
        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=pool_size,
            pool_maxsize=pool_size
        )
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        self.url_processor = FastURLProcessor()
        self.content_extractor = EnhancedContentExtractor()
        self.failure_cache = set()
        self.page_cache = {}

    @staticmethod
    def _parse_html(html):
        """Parse ``html`` with lxml and return ``(soup, visible_text)``.

        ``script``, ``style`` and ``noscript`` elements are removed from the
        soup before the text is extracted.
        """
        soup = BeautifulSoup(html, 'lxml')
        for element in soup(["script", "style", "noscript"]):
            element.decompose()
        return soup, soup.get_text(separator=' ', strip=True)

    def get_page_content(self, url: str, timeout: int = 5) -> Tuple[Optional[BeautifulSoup], str, Dict[str, Any]]:
        """Fetch and parse ``url``, trying URL variations until one works.

        Args:
            url: Raw URL from the dataset.
            timeout: Per-request timeout in seconds.

        Returns:
            ``(soup, text, metadata)``. On failure ``soup`` is ``None`` and
            ``text`` is ``""``. ``metadata`` has the keys ``url``,
            ``status_code``, ``success`` and ``error``; ``error`` holds a
            short reason for the last failure, or ``None``. Cache hits are
            reported with status code 200.
        """
        metadata = {'url': url, 'status_code': 0, 'success': False, 'error': None}
        try:
            clean_url = self.url_processor.clean_and_validate_url(url)
            if not clean_url:
                metadata['error'] = 'invalid url'
                return None, "", metadata
            if clean_url in self.failure_cache:
                metadata['error'] = 'previously failed'
                return None, "", metadata

            cached_html = self.page_cache.get(clean_url)
            if cached_html is not None:
                soup, content = self._parse_html(cached_html)
                metadata['success'] = True
                metadata['status_code'] = 200
                return soup, content, metadata

            for attempt_url in self.url_processor.get_url_variations(clean_url):
                try:
                    headers = {
                        'User-Agent': self.ua.random if self.ua else 'Mozilla/5.0',
                        'Accept': 'text/html,application/xhtml+xml',
                        'Connection': 'keep-alive',
                    }
                    # NOTE(security): verify=False disables TLS certificate
                    # verification for every request made here.
                    response = self.session.get(
                        attempt_url,
                        headers=headers,
                        timeout=timeout,
                        allow_redirects=True,
                        verify=False,
                    )
                    if response.status_code >= 400:
                        metadata['error'] = f'HTTP {response.status_code}'
                        continue
                    content_type = response.headers.get('content-type', '').lower()
                    if not any(kind in content_type for kind in self._HTML_CONTENT_TYPES):
                        metadata['error'] = f'unsupported content-type: {content_type}'
                        continue
                    soup, content = self._parse_html(response.content)
                    self.page_cache[clean_url] = response.content
                    metadata['success'] = True
                    metadata['status_code'] = response.status_code
                    metadata['error'] = None
                    return soup, content, metadata
                except Exception as exc:
                    # Best effort: remember why this variation failed and
                    # move on to the next one.
                    metadata['error'] = f'{type(exc).__name__}: {exc}'
                    continue
            # Every variation failed; skip this URL on later calls.
            self.failure_cache.add(clean_url)
        except Exception as exc:
            metadata['error'] = f'{type(exc).__name__}: {exc}'
        return None, "", metadata

    def scrape_company(self, company_data: CompanyData) -> Dict[str, Any]:
        """Build the feature dict for one company.

        The dict always contains the same keys. Website-derived fields keep
        their defaults when the page cannot be fetched or parsed; this method
        does not raise for such failures.

        Args:
            company_data: The dataset row to enrich.

        Returns:
            Feature dict combining dataset columns and scraped features.
            ``success`` is True when the status is 'ipo' or 'acquired'.
        """
        features = {
            'name': company_data.name,
            'homepage_url': company_data.url,
            'category_list': company_data.category,
            'funding_total_usd': company_data.funding,
            'status': company_data.status,
            'city': company_data.location,
            'website_accessible': False,
            'website_status_code': 0,
            'founder_count': 0,
            'founder_education_quality': 'Unknown',
            'founder_technical_background': False,
            'founder_business_background': False,
            'business_model_clarity': 'Unknown',
            'technology_stack': 'Unknown',
            'technology_count': 0,
            'company_description': '',
            'content_length': 0,
            'word_count': 0,
            'detected_language': 'en',
            'success': company_data.status.lower() in ['ipo', 'acquired'] if company_data.status else False
        }
        try:
            soup, content, metadata = self.get_page_content(company_data.url)
            if soup and metadata['success']:
                extractor = self.content_extractor
                features['website_accessible'] = True
                features['website_status_code'] = metadata['status_code']
                features['company_description'] = extractor.extract_description(soup, company_data.url)
                features['business_model_clarity'] = extractor.extract_business_model(soup, content)
                tech_stack = extractor.extract_technology_stack(soup, content)
                features['technology_stack'] = ', '.join(tech_stack) if tech_stack else 'Unknown'
                features['technology_count'] = len(tech_stack)
                features.update(extractor.extract_founder_info(soup, content))
                features['content_length'] = len(content)
                features['word_count'] = len(content.split())
                if len(content) > 50:
                    try:
                        features['detected_language'] = langdetect.detect(content[:2000])
                    except Exception:
                        # Keep the default 'en' when detection fails.
                        pass
        except Exception:
            # Best effort: return whatever was collected so far so that one
            # bad page never aborts the batch.
            pass
        return features

print("‚úÖ Scraper ready")

## 🚀 Step 5: Run Scraper (Silent Mode - Progress Bar Only)

In [None]:
def _append_jsonl(path, records):
    """Append each record to ``path`` as one JSON document per line."""
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(json.dumps(item) + '\n' for item in records)


def run_scraper():
    """Scrape the configured slice of the dataset and write the outputs.

    Reads rows ``START_ROW:END_ROW`` of ``DATASET_FILE``, scrapes them on a
    pool of ``MAX_WORKERS`` threads, appends results to ``CHECKPOINT_FILE``
    every ``CHECKPOINT_INTERVAL`` results, writes everything to
    ``FINAL_OUTPUT_FILE`` as CSV and prints summary statistics.

    Returns:
        List of feature dicts in completion order (not input order).
    """
    print("üöÄ Starting scraper...\n")

    scraper = Colab_UltraFastScraper(unlimited_cache=UNLIMITED_CACHE, max_workers=MAX_WORKERS)

    # Load dataset. newline='' lets the csv module handle line endings itself,
    # which matters for quoted fields containing line breaks.
    with open(DATASET_FILE, 'r', newline='', encoding='utf-8', errors='ignore') as file:
        rows_to_process = list(csv.DictReader(file))[START_ROW:END_ROW]

    companies = [
        CompanyData(
            name=row.get('name', ''),
            url=row.get('homepage_url', ''),
            category=row.get('category_list', ''),
            funding=row.get('funding_total_usd', ''),
            status=row.get('status', ''),
            location=row.get('city', ''),
            original_data=row,
        )
        for row in rows_to_process
    ]

    print(f"üìä Processing {len(companies)} companies with {MAX_WORKERS} workers\n")

    # Results are consumed only by this thread (the as_completed loop), so
    # the containers below need no locking.
    all_results = []
    checkpoint_buffer = []
    stats = {'accessible': 0, 'descriptions': 0, 'founders': 0}
    failed = 0

    start_time = time.time()

    # Progress bar only - no other output!
    with tqdm(total=len(companies), desc="Scraping", unit=" companies", ncols=100) as pbar:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = [executor.submit(scraper.scrape_company, c) for c in companies]

            for future in as_completed(futures):
                # Exactly one progress tick per future, whether it succeeded
                # or failed.
                pbar.update(1)
                try:
                    # The future is already finished here, so no timeout is
                    # needed.
                    result = future.result()
                except Exception:
                    failed += 1
                    continue

                all_results.append(result)
                checkpoint_buffer.append(result)

                if result.get('website_accessible'):
                    stats['accessible'] += 1
                if result.get('company_description'):
                    stats['descriptions'] += 1
                if result.get('founder_count', 0) > 0:
                    stats['founders'] += 1

                # Checkpoint
                if len(checkpoint_buffer) >= CHECKPOINT_INTERVAL:
                    _append_jsonl(CHECKPOINT_FILE, checkpoint_buffer)
                    checkpoint_buffer = []

                    # Update progress bar with stats
                    elapsed = time.time() - start_time
                    rate = len(all_results) / elapsed if elapsed > 0 else 0.0
                    pbar.set_postfix({
                        'rate': f'{rate:.1f}/s',
                        'ok': stats['accessible'],
                        'desc': stats['descriptions']
                    })

            # Flush buffer
            if checkpoint_buffer:
                _append_jsonl(CHECKPOINT_FILE, checkpoint_buffer)

    # Save CSV
    print("\nüíæ Saving results...")
    with open(FINAL_OUTPUT_FILE, 'w', newline='', encoding='utf-8') as file:
        if all_results:
            fieldnames = list(all_results[0].keys())
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_results)

    # Final stats
    elapsed = time.time() - start_time
    total = len(all_results)
    rate = total / elapsed if elapsed > 0 else 0.0

    def percent(count):
        # Guard against an empty result set (e.g. an empty row range).
        return count / total * 100 if total else 0.0

    print("\n" + "="*60)
    print("‚úÖ COMPLETE!")
    print("="*60)
    print(f"‚è±Ô∏è  Time: {elapsed/60:.1f} minutes")
    print(f"‚ö° Rate: {rate:.1f} companies/second")
    print(f"üìä Total: {total} companies")
    print(f"üåê Accessible: {stats['accessible']} ({percent(stats['accessible']):.1f}%)")
    print(f"üìù Descriptions: {stats['descriptions']} ({percent(stats['descriptions']):.1f}%)")
    print(f"üë• Founders: {stats['founders']} ({percent(stats['founders']):.1f}%)")
    if failed:
        print(f"Failed: {failed} companies (missing from the output)")
    print(f"\nüìÅ Output: {FINAL_OUTPUT_FILE}")
    print("="*60)

    gc.collect()
    return all_results

# RUN IT!
results = run_scraper()

## 📥 Step 6: Download Results

In [None]:
# Send the final CSV (FINAL_OUTPUT_FILE, written by run_scraper) to the
# browser via Colab's download helper.
from google.colab import files

print("üì• Downloading...")
files.download(FINAL_OUTPUT_FILE)
print("‚úÖ Done!")

## 📊 Step 7: Quick Analysis (Optional)

In [None]:
# Optional sanity check: reload the CSV written by run_scraper and print a few
# frequency tables.
import pandas as pd

df = pd.read_csv(FINAL_OUTPUT_FILE)

print("üìä Quick Stats:")
# Number of rows in the output file.
print(f"\nTotal: {len(df)}")
# How many homepages could / could not be fetched.
print(f"\nAccessible:")
print(df['website_accessible'].value_counts())
# 'success' is True for rows whose status is 'ipo' or 'acquired'.
print(f"\nSuccess:")
print(df['success'].value_counts())
# The five most frequent business-model labels.
print(f"\nTop Business Models:")
print(df['business_model_clarity'].value_counts().head(5))