In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: List of URLs (5 product-centric, 5 non-product-centric)
urls_with_labels = [
    # Product-centric (label ~0.9 to 1.0)
    {"url": "https://www.cnet.com/reviews/samsung-galaxy-s24-review/", "label": 0.95},
    {"url": "https://www.techradar.com/reviews/iphone-14-pro-review", "label": 0.97},
    {"url": "https://www.tomsguide.com/reviews/google-pixel-7", "label": 0.9},
    {"url": "https://www.rtings.com/tv/reviews/samsung/s90c-oled", "label": 0.92},
    {"url": "https://www.digitaltrends.com/home/apple-homepod-2-review/", "label": 0.91},

    # Non-product-centric (label ~0.0 to 0.2)
    {"url": "https://www.bbc.com/news/world", "label": 0.05},
    {"url": "https://www.nytimes.com/2023/08/01/world/europe/russia-ukraine-drone.html", "label": 0.02},
    {"url": "https://www.theverge.com/2024/03/20/ai-regulation-europe-update", "label": 0.15},
    {"url": "https://www.onlinebigbrother.com/big-brother-18-week-5-summary-and-live-eviction-results/", "label": 0.04},
    {"url": "https://www.medium.com/personal-growth/how-to-stay-motivated-23f8b1", "label": 0.10}
]

# Step 2: Fetch and clean content
def get_page_text(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator=" ", strip=True)
            return text
        else:
            print(f"Failed to fetch: {url}")
            return ""
    except Exception as e:
        print(f"Error: {e} | URL: {url}")
        return ""

# Step 3: Build dataset
data = []

for item in urls_with_labels:
    print(f"Processing: {item['url']}")
    text = get_page_text(item["url"])
    data.append({
        "url": item["url"],
        "text": text,
        "label": item["label"]
    })

# Step 4: Save to CSV
df = pd.DataFrame(data)
df.to_csv("product_centricity_dataset.csv", index=False)
print("✅ Saved to product_centricity_dataset.csv")


Processing: https://www.cnet.com/reviews/samsung-galaxy-s24-review/
Failed to fetch: https://www.cnet.com/reviews/samsung-galaxy-s24-review/
Processing: https://www.techradar.com/reviews/iphone-14-pro-review
Error: HTTPSConnectionPool(host='www.techradar.com', port=443): Read timed out. (read timeout=10) | URL: https://www.techradar.com/reviews/iphone-14-pro-review
Processing: https://www.tomsguide.com/reviews/google-pixel-7
Processing: https://www.rtings.com/tv/reviews/samsung/s90c-oled
Processing: https://www.digitaltrends.com/home/apple-homepod-2-review/
Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) | URL: https://www.digitaltrends.com/home/apple-homepod-2-review/
Processing: https://www.bbc.com/news/world
Processing: https://www.nytimes.com/2023/08/01/world/europe/russia-ukraine-drone.html
Failed to fetch: https://www.nytimes.com/2023/08/01/world/europe/russia-ukraine-drone.html
Processing: https://www.theverge.com/2024/03/20/ai

In [2]:
import pandas as pd

df = pd.read_csv("product_centricity_dataset.csv")
df.dropna(inplace=True)  # remove rows with missing text
print(df.shape)
df.head()


(4, 3)


Unnamed: 0,url,text,label
2,https://www.tomsguide.com/reviews/google-pixel-7,Google Pixel 7 review | Tom's Guide Skip to ma...,0.9
3,https://www.rtings.com/tv/reviews/samsung/s90c...,"Samsung S90C OLED Review (QN55S90CAFXZA, QN65S...",0.92
5,https://www.bbc.com/news/world,World | Latest News & Updates | BBC News Skip ...,0.05
9,https://www.medium.com/personal-growth/how-to-...,Medium Sitemap Open in app Sign up Sign in Med...,0.1


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit vocabulary size to reduce complexity
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the text data
X = vectorizer.fit_transform(df['text'])

# Labels (target)
y = df['label']


In [5]:
print(f"Feature matrix shape: {X.shape}")  # should be (10, 5000) or similar
print(f"Target shape: {y.shape}")


Feature matrix shape: (4, 2732)
Target shape: (4,)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
y_pred = model.predict(X_test)

In [11]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"✅ Mean Squared Error: {mse:.4f}")
print(f"✅ R² Score: {r2:.4f}")

✅ Mean Squared Error: 0.4064
✅ R² Score: nan




In [12]:
df_results = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": y_pred
})
print(df_results.head(10))

   Actual  Predicted
0    0.92     0.2825


In [13]:
import requests
from bs4 import BeautifulSoup

def get_page_text(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator=" ", strip=True)
            return text
        else:
            print(f"❌ Failed to fetch: {url}")
            return ""
    except Exception as e:
        print(f"⚠️ Error: {e} | URL: {url}")
        return ""

In [14]:
def predict_product_centricity(url, vectorizer, model):
    text = get_page_text(url)
    if not text:
        return None
    X_new = vectorizer.transform([text])  # transform only, not fit!
    score = model.predict(X_new)[0]
    return round(float(score), 3)

In [15]:
test_url = "https://www.techradar.com/reviews/samsung-galaxy-s24-ultra-review"
predicted_score = predict_product_centricity(test_url, vectorizer, model)

if predicted_score is not None:
    print(f"🔍 Product-Centricity Score for:\n{test_url}\n➡️ Score: {predicted_score}")
else:
    print("Failed to fetch or analyze the URL.")

❌ Failed to fetch: https://www.techradar.com/reviews/samsung-galaxy-s24-ultra-review
Failed to fetch or analyze the URL.


In [16]:
import joblib

# Save model
joblib.dump(model, 'product_centricity_model.pkl')

# Save vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("✅ Model and vectorizer saved successfully.")

✅ Model and vectorizer saved successfully.


In [17]:
# Load model and vectorizer
model = joblib.load('product_centricity_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

print("✅ Model and vectorizer loaded.")

✅ Model and vectorizer loaded.


In [18]:
url = "https://www.cnet.com/reviews/samsung-galaxy-s24-review/"
score = predict_product_centricity(url, vectorizer, model)
print(f"Predicted Product Centricity Score: {score}")

❌ Failed to fetch: https://www.cnet.com/reviews/samsung-galaxy-s24-review/
Predicted Product Centricity Score: None
