<a href="https://colab.research.google.com/github/jorgejrolo/master-jorge-j-rolo/blob/main/SF%E2%80%91HCU%E2%80%91Audit%E2%80%91GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Author:** Jorge J. Rolo  
**Purpose:** Prioritize URLs by Helpful Content Update (HCU) risk using Screaming Frog exports.

### Input
- Screaming Frog export (`Internal_All.csv` or a custom export with columns: `Address`, `Title 1`, `Meta Description 1`, `Word Count`, `H1-1`, `H2-1`, `Status Code`, `Canonical Link Element 1`, `Inlinks`, `Outlinks`, `Indexability` …)

### Output
- A scored table with **HCU_Risk (0–100)** and explanations.
- Shortlist of **High‑Impact Fixes**.
- A prompt template to push the top 50 URLs to GPT for qualitative review.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every figure drawn below.
plt.rcParams.update({'figure.figsize': (10, 5)})
print('Ready. Upload your Screaming Frog CSV in the next cell.')

In [None]:
# Load the Screaming Frog export into `df`.
# In Colab an upload dialog is shown; anywhere else (or if nothing is uploaded)
# a four-row sample crawl is written next to the notebook and used instead.

# Relative path: resolves against the working directory, which always exists.
SAMPLE_PATH = 'sample_sf.csv'

fn = None
try:
    from google.colab import files
    uploaded = files.upload()
    # Cancelling the dialog returns an empty dict; treat that as "no upload".
    fn = next(iter(uploaded), None)
except Exception:
    # Best-effort: not running in Colab, or the upload failed – use the sample.
    fn = None

if fn is None:
    # Fallback: create a tiny sample
    sample = pd.DataFrame({
        'Address':['/guide/rtx-4070','/blog/how-to-choose-psu','/category/gpu','/product/gpu-rtx-4070'],
        'Title 1':['RTX 4070 Guide','Choose PSU','GPUs','RTX 4070 GPU'],
        'Meta Description 1':['Full guide','Tips','Listing','Product'],
        'Word Count':[420,1800,120,350],
        'H1-1':['Guide','How to choose a PSU','GPUs','RTX 4070 GPU'],
        'H2-1':['','Wattage & Efficiency','','Specs'],
        'Status Code':[200,200,200,200],
        'Canonical Link Element 1':['/guide/rtx-4070','','/category/gpu','/product/gpu-rtx-4070'],
        'Inlinks':[8,42,120,33],
        'Outlinks':[12,24,0,4],
        'Indexability':['Indexable','Indexable','Indexable','Indexable']
    })
    sample.to_csv(SAMPLE_PATH, index=False)
    print(f'No file uploaded – using sample at {SAMPLE_PATH}')
    fn = SAMPLE_PATH

df = pd.read_csv(fn)
df.head()

In [None]:
# Derive per-URL risk flags from the crawl columns and combine them into HCU_Risk.
# Every source column is optional: a missing column behaves like an all-blank one.

def _sf_column(frame, name, default):
    """Return `frame[name]`, or a Series of `default` on the same index if the column is absent."""
    if name in frame.columns:
        return frame[name]
    return pd.Series(default, index=frame.index)


def _sf_count(frame, name):
    """Return column `name` as integers; missing, blank or non-numeric values become 0."""
    return pd.to_numeric(_sf_column(frame, name, 0), errors='coerce').fillna(0).astype(int)


def _sf_text(frame, name):
    """Return column `name` as whitespace-stripped strings; missing values become ''."""
    # fillna must run before astype(str), otherwise NaN turns into the text 'nan'.
    return _sf_column(frame, name, '').fillna('').astype(str).str.strip()


df['word_count'] = _sf_count(df, 'Word Count')
df['has_h1'] = _sf_text(df, 'H1-1').ne('')
df['has_h2'] = _sf_text(df, 'H2-1').ne('')
# Exact match: a substring test would also accept 'Non-Indexable'.
df['is_indexable'] = _sf_text(df, 'Indexability').str.lower().eq('indexable')
df['inlinks'] = _sf_count(df, 'Inlinks')
df['outlinks'] = _sf_count(df, 'Outlinks')
df['thin'] = df['word_count'] < 300
df['orphanish'] = df['inlinks'] < 3
df['no_structure'] = ~df['has_h1'] | ~df['has_h2']
df['weak_meta'] = _sf_text(df, 'Meta Description 1').str.len().lt(80)
# Only non-empty canonicals can collide; blank ones are not duplicates of each other.
canonical = _sf_text(df, 'Canonical Link Element 1')
df['dup_canonical'] = canonical.ne('') & canonical.duplicated(keep=False)

# HCU risk scoring (heuristic – tweak weights per site). Weights sum to 100.
HCU_WEIGHTS = {
    'thin': 25,
    'orphanish': 15,
    'no_structure': 15,
    'weak_meta': 10,
    'dup_canonical': 20,
    'not_indexable': 15,
}
risk_flags = {
    'thin': df['thin'],
    'orphanish': df['orphanish'],
    'no_structure': df['no_structure'],
    'weak_meta': df['weak_meta'],
    'dup_canonical': df['dup_canonical'],
    'not_indexable': ~df['is_indexable'],
}
score = sum(flag.astype(int) * HCU_WEIGHTS[name] for name, flag in risk_flags.items())
df['HCU_Risk'] = score.clip(0,100)
df[['Address','word_count','inlinks','has_h1','has_h2','HCU_Risk']].head()

In [None]:
# Histogram of the risk scores across all crawled URLs.
risk_scores = df['HCU_Risk']
ax = risk_scores.plot(kind='hist', bins=10)
ax.set_title('HCU Risk Distribution')
ax.set_xlabel('Risk (0-100)')
plt.show()

# Shortlist: the 25 highest-scoring URLs, shown with their key signals.
shortlist_cols = ['Address','HCU_Risk','word_count','inlinks','has_h1','has_h2']
top = df.sort_values('HCU_Risk', ascending=False).head(25)
top[shortlist_cols]

In [None]:
# Export every URL, highest risk first.
# The file is written to the working directory: a relative path always resolves
# to an existing folder, whereas a fixed absolute directory may not exist.
out = df.sort_values('HCU_Risk', ascending=False)
out_path = 'hcu_scored_urls.csv'
out.to_csv(out_path, index=False)
print('Exported to', out_path)

In [None]:
def build_prompt(rows):
    """Build the text prompt for a qualitative GPT review.

    Args:
        rows: DataFrame with the columns `Address`, `word_count`, `inlinks`,
            `has_h1`, `has_h2` and `HCU_Risk`.

    Returns:
        A single string: the instruction line followed by one summary line
        per row of `rows`, joined with newlines.
    """
    instruction = 'You are auditing URLs for Helpful Content risk. For each, comment on usefulness, originality, structure, and what to fix first. Be concise.'
    url_lines = [
        f"URL: {rec['Address']} | words={rec['word_count']} | inlinks={rec['inlinks']} | H1={bool(rec['has_h1'])} | H2={bool(rec['has_h2'])} | risk={rec['HCU_Risk']}"
        for _idx, rec in rows.iterrows()
    ]
    return '\n'.join([instruction, *url_lines])

# The notebook's stated output is a prompt for the top 50 URLs, so select them
# here directly rather than reusing the 25-row shortlist from the plotting cell.
PROMPT_TOP_N = 50
prompt_rows = df.sort_values('HCU_Risk', ascending=False).head(PROMPT_TOP_N)
prompt_text = build_prompt(prompt_rows)
# Preview only: print the first 1000 characters; `prompt_text` holds the full prompt.
print(prompt_text[:1000] + ('...' if len(prompt_text)>1000 else ''))