<a href="https://colab.research.google.com/github/hogan11/hello-world/blob/master/LinkedInScrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-Ready LinkedIn Scraper for IT Jobs with Gemini Enrichment (Google Sheets & Secrets)

import requests
import logging
import pandas as pd
from google.colab import auth, userdata, drive
import gspread
from google.auth import default
import google.generativeai as genai
import ipywidgets as widgets
from IPython.display import display, clear_output
from bs4 import BeautifulSoup
import os
!pip install docx2txt
import docx2txt

# ---- AUTH ----
# Authenticate the Colab user and mount Google Drive at /content/drive
# (the resume path used further down lives under that mount point).
auth.authenticate_user()
drive.mount('/content/drive')
# google.auth.default() returns a pair; only the credentials are used here.
creds, _ = default()
# gspread client shared by the sheet helpers below.
gc = gspread.authorize(creds)

# ---- CONFIG ----
# The Gemini key is read from the Colab secret named 'GOOGLE_API_KEY'.
gemini_api_key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=gemini_api_key)
# Browser-like User-Agent sent with the LinkedIn request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
# Public job-search URL: keywords=IT, location=Seattle, WA, first page.
linkedin_url = 'https://www.linkedin.com/jobs/search?keywords=IT&location=Seattle%2C%20WA&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
# Initial value shown in the "Sheet Name" text box.
sheet_name_default = 'LinkedIn Job Scrapes'

# ---- RESUME INGEST ----
# Use the text of the .docx resume on Drive when the file is present;
# otherwise fall back to a short built-in summary.
doc_path = '/content/drive/MyDrive/LinkedIn_Resume.docx'  # Update to match your Google Drive path
resume_summary = (
    docx2txt.process(doc_path).strip()
    if os.path.exists(doc_path)
    else 'Experienced IT leader skilled in cloud, security, and strategy.'
)

# ---- UI WIDGETS ----
# Text box holding the spreadsheet name, pre-filled with the default.
sheet_input = widgets.Text(
    value=sheet_name_default,
    description='Sheet Name:',
)
# Worksheet picker; starts disabled with a placeholder option until
# refresh_worksheet_dropdown() fills it in.
worksheet_dropdown = widgets.Dropdown(
    description='Worksheet:',
    options=['Loading...'],
    disabled=True,
)
# Button that triggers the scrape, and the area its messages are printed to.
run_button = widgets.Button(
    description='Run Scraper',
    button_style='success',
)
output = widgets.Output()

# ---- SCRAPER ----
def get_linkedin_jobs(timeout=15):
    """Fetch the LinkedIn search page and parse its job cards.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the
            button callback indefinitely.

    Returns:
        A list of dicts with the keys 'title', 'company', 'location',
        'summary' and 'url'. An empty list is returned when the request
        fails, the response status is not 200, or no cards are found.
    """
    headers = {'User-Agent': user_agent}
    try:
        response = requests.get(linkedin_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a
        # non-200 response so the caller only has to handle "no jobs".
        print(f"❌ Unable to fetch LinkedIn page: {exc}")
        return []
    if response.status_code != 200:
        print("❌ Unable to fetch LinkedIn page. Likely blocked by Cloudflare.")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): these selectors assume a specific job-card markup;
    # confirm they still match what the live page serves.
    job_cards = soup.find_all('li', class_='result-card job-result-card')

    jobs = []
    for card in job_cards:
        # Look each element up once and reuse the result.
        title_tag = card.find('h3')
        company_tag = card.find('h4')
        location_tag = card.find('span', class_='job-result-card__location')
        link_tag = card.find('a')
        jobs.append({
            'title': title_tag.text.strip() if title_tag is not None else '',
            'company': company_tag.text.strip() if company_tag is not None else '',
            'location': location_tag.text.strip() if location_tag is not None else '',
            'summary': card.text.strip(),
            # .get() avoids a KeyError for an <a> tag without an href.
            'url': link_tag.get('href', '') if link_tag is not None else '',
        })
    return jobs

# ---- GEMINI ENRICHMENT ----
def enrich_job(job):
    """Add a Gemini-written summary and fit score to a job dict.

    The dict is modified in place and also returned. Two keys are set:
    'ai_10_word_summary' and 'fit_score_1_to_10'. If anything raises
    while building the prompts or calling the model, the error is
    logged and both keys are set to empty strings.
    """
    gemini = genai.GenerativeModel('gemini-pro')

    def ask(prompt):
        # One round trip to the model, returning the trimmed reply text.
        return gemini.generate_content(prompt).text.strip()

    try:
        # Both prompts are built before the first request is sent.
        prompt_for_summary = f"Provide a 10-word professional summary of the following job description:\n{job['summary']}"
        prompt_for_fit = f"Based on this resume: '{resume_summary}', rate from 1-10 how well it fits this job: {job['summary']}"

        short_summary = ask(prompt_for_summary)
        fit_rating = ask(prompt_for_fit)

        job['ai_10_word_summary'] = short_summary
        job['fit_score_1_to_10'] = fit_rating
    except Exception as err:
        logging.error(f"Gemini enrichment error: {err}")
        # Blank out both fields so every job carries the same keys.
        job['ai_10_word_summary'] = job['fit_score_1_to_10'] = ''
    return job

def enrich_jobs(jobs):
    """Run enrich_job on each job in order and return the results as a new list."""
    enriched = []
    for entry in jobs:
        enriched.append(enrich_job(entry))
    return enriched

# ---- SHEET HELPERS ----
def load_worksheets(sheet_name):
    """Return the worksheet titles of the spreadsheet called ``sheet_name``.

    This is a best-effort lookup used to populate the worksheet dropdown:
    if the spreadsheet cannot be opened or listed for any reason, an
    empty list is returned instead of raising.

    Args:
        sheet_name: Title of the spreadsheet to open.

    Returns:
        A list of worksheet title strings, or ``[]`` on failure.
    """
    try:
        spreadsheet = gc.open(sheet_name)
        return [ws.title for ws in spreadsheet.worksheets()]
    except Exception as exc:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed. Logged at
        # debug level because this runs on every edit of the sheet name.
        logging.debug("Could not load worksheets for %r: %s", sheet_name, exc)
        return []

def export_to_gsheet(jobs, sheet_name, worksheet_name):
    """Write the jobs to a worksheet, replacing whatever it held before.

    The spreadsheet is created when it cannot be found. An existing
    worksheet is cleared; a missing one is added with a few spare rows
    and columns. The first row written is the column header.
    """
    frame = pd.DataFrame(jobs)

    # Open the spreadsheet, creating it on first use.
    try:
        spreadsheet = gc.open(sheet_name)
    except gspread.SpreadsheetNotFound:
        spreadsheet = gc.create(sheet_name)

    # Reuse (and wipe) the worksheet if present, otherwise add it.
    try:
        target = spreadsheet.worksheet(worksheet_name)
        target.clear()
    except gspread.exceptions.WorksheetNotFound:
        row_count = str(len(frame) + 10)
        col_count = str(len(frame.columns) + 5)
        target = spreadsheet.add_worksheet(
            title=worksheet_name,
            rows=row_count,
            cols=col_count,
        )

    header_row = frame.columns.values.tolist()
    data_rows = frame.values.tolist()
    target.update([header_row] + data_rows)

def refresh_worksheet_dropdown(*args):
    """Reload the dropdown options from the sheet named in the text box.

    Accepts and ignores any positional arguments so it can be used both
    as a widget observer and as a plain call. Falls back to a single
    'Sheet1' option when no worksheet titles are available.
    """
    titles = load_worksheets(sheet_input.value.strip())
    if not titles:
        titles = ['Sheet1']
    worksheet_dropdown.options = titles
    worksheet_dropdown.disabled = False

# ---- MAIN ----
def run_scraper(b):
    """Button callback: scrape, enrich, and export jobs.

    All progress and error messages are printed into the ``output``
    widget, which is cleared at the start of every run.

    Args:
        b: The object passed in by the button's click handler; unused.
    """
    with output:
        clear_output()

        if not gemini_api_key:
            # The key is read from the secret named GOOGLE_API_KEY, so the
            # message must point the user at that name.
            print("❌ Gemini API Key not set in Colab Secrets. Please go to ⚙️ > Secrets and set GOOGLE_API_KEY.")
            return

        sheet_name = sheet_input.value.strip()
        # The dropdown value can be None when nothing is selected.
        worksheet_name = (worksheet_dropdown.value or '').strip()
        if not sheet_name or not worksheet_name:
            print("❌ Please enter a sheet name and select a worksheet.")
            return

        print("🔍 Scraping LinkedIn (note: may be blocked)...")
        jobs = get_linkedin_jobs()
        if not jobs:
            print("No jobs found or unable to access page.")
            return

        print(f"✨ Enriching {len(jobs)} jobs with Gemini...")
        jobs = enrich_jobs(jobs)
        export_to_gsheet(jobs, sheet_name, worksheet_name)
        print(f"✅ Exported {len(jobs)} jobs to Google Sheet → {sheet_name} → {worksheet_name}")

# ---- BIND EVENTS ----
# Reload the worksheet list whenever the sheet-name text changes.
sheet_input.observe(refresh_worksheet_dropdown, names='value')
run_button.on_click(run_scraper)
# Populate the dropdown once up front for the default sheet name.
refresh_worksheet_dropdown()

# ---- DISPLAY INTERFACE ----
# Stack the controls vertically, with the output area at the bottom.
display(widgets.VBox([sheet_input, worksheet_dropdown, run_button, output]))


Upload to a Google Sheet