# Load your data

In [1]:
import pandas as pd
import numpy as np
# Step 1: Load Solar Articles CSV
file_path = 'Solar Articles.csv'  # <-- your CSV filename
df = pd.read_csv(file_path)

## Extracting solar relavant files only

In [2]:
# Step 1.1: Fix publishedAt immediately
df['publishedAt'] = pd.to_datetime(df['publishedAt'], errors='coerce')

# Step 2: Create combined text field for filtering
df['full_text'] = (df['title'].fillna('') + ' ' +
                   df['description'].fillna('') + ' ' +
                   df['content'].fillna('')).str.lower()

# Step 3: Updated broader keywords
solar_keywords = [
    'solar panel', 'solar panels', 'solar installation',
    'solar energy', 'solar power', 'solar farm', 'solar project'
]

# Step 4: Filter relevant articles
mask = df['full_text'].apply(lambda x: any(keyword in x for keyword in solar_keywords))
solar_df = df[mask].copy()

# Step 5: Create simulated price data
date_range = pd.date_range(start=solar_df['publishedAt'].min(), end=solar_df['publishedAt'].max())
np.random.seed(42)
prices = 270 - np.linspace(0, 20, len(date_range)) + np.random.normal(0, 2, len(date_range))

price_df = pd.DataFrame({'publishedAt': date_range, 'solar_panel_price': prices})

# Step 6: Merge solar articles with prices
solar_df = pd.merge_asof(
    solar_df.sort_values('publishedAt'), 
    price_df.sort_values('publishedAt'), 
    on='publishedAt',
    direction='backward'
)

# Step 7: Final Clean Dataset
solar_df = solar_df[['publishedAt', 'title', 'description', 'content', 'solar_panel_price']]

# Step 8: Preview
print(solar_df.head())

                publishedAt  \
0 2023-01-04 13:00:00+00:00   
1 2023-01-04 13:00:00+00:00   
2 2023-01-05 05:59:00+00:00   
3 2023-01-07 16:11:31+00:00   
4 2023-01-09 09:56:00+00:00   

                                               title  \
0  Green jobs are booming, but too few employees ...   
1  Green jobs are booming, but too few employees ...   
2  Massive solar farm plans for East Devon might ...   
3  Self-healing semiconductor withstands light eq...   
4  Solar Startup Enpal Nears $2.4 Billion Valuati...   

                                         description  \
0  Green jobs go beyond solar panel installation ...   
1  Green jobs go beyond solar panel installation ...   
2  “There’s not a solar panel in sight in Cranbro...   
3  A new type of solar panel has achieved 9 perce...   
4  The German-based solar-panel company is on tra...   

                                             content  solar_panel_price  
0  To meet today’s global sustainability challeng...         270.

In [3]:
solar_df.shape

(2393, 5)

In [4]:
solar_df.columns

Index(['publishedAt', 'title', 'description', 'content', 'solar_panel_price'], dtype='object')

In [5]:
print(f"Minimum publish date: {solar_df['publishedAt'].min()}")
print(f"Maximum publish date: {solar_df['publishedAt'].max()}")


Minimum publish date: 2023-01-04 13:00:00+00:00
Maximum publish date: 2025-04-20 00:55:00+00:00


In [6]:
# Install transformers if needed
# !pip install sentence-transformers
# !pip install transformers torch

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Step 1: Load MiniLM Model (small, fast, PyTorch-only)
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Step 2: Create combined text field
solar_df['combined_text'] = (
    solar_df['title'].fillna('') + ' ' +
    solar_df['description'].fillna('') + ' ' +
    solar_df['content'].fillna('')
)

# Step 3: BERT Embedder Function
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def encode_texts(text_list, batch_size=32):
    embeddings = []
    model.eval()
    
    with torch.no_grad():
        for start_idx in range(0, len(text_list), batch_size):
            batch_texts = text_list[start_idx:start_idx+batch_size]
            encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
            model_output = model(**encoded_input)
            batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
            embeddings.append(batch_embeddings.cpu().numpy())
            
    return np.vstack(embeddings)

# Step 4: Generate BERT embeddings
X_text_features = encode_texts(solar_df['combined_text'].tolist())

print(f"Shape of BERT feature matrix: {X_text_features.shape}")


Shape of BERT feature matrix: (2393, 384)


# Generating sentiment labels using bert

In [8]:
from transformers import pipeline

# Step 11: Load Sentiment Pipeline from DistilBERT
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english',
    framework='pt')

# Step 12: Define a function to apply it
def get_bert_sentiment(text):
    try:
        result = sentiment_pipeline(text[:512])[0]  # Limit to 512 tokens max
        label = result['label']
        if label == 'POSITIVE':
            return 'Positive'
        elif label == 'NEGATIVE':
            return 'Negative'
        else:
            return 'Neutral'
    except Exception as e:
        print(f"Error: {e}")
        return 'Neutral'

# Step 13: Apply on your combined_text field
solar_df['sentiment_label'] = solar_df['combined_text'].apply(get_bert_sentiment)

# Step 14: Create target variables
y_sentiment = solar_df['sentiment_label']
y_price = solar_df['solar_panel_price']

# Step 15: Preview
print("Sentiment label distribution:")
print(y_sentiment.value_counts())
print("\nSample prices:")
print(y_price.head())

Device set to use mps:0


Sentiment label distribution:
sentiment_label
Positive    1204
Negative    1189
Name: count, dtype: int64

Sample prices:
0    270.993428
1    270.993428
2    270.993428
3    272.974289
4    269.435999
Name: solar_panel_price, dtype: float64


# Sentiment Classifier model

In [9]:
# Step 1: Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Step 2: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_text_features, y_sentiment, 
    test_size=0.2, random_state=42, stratify=y_sentiment
)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Step 3: Train Logistic Regression Classifier
classifier = LogisticRegression(max_iter=1000, solver='lbfgs')  # max_iter needed to ensure convergence
classifier.fit(X_train, y_train)

# Step 4: Predict on Test Set
y_pred = classifier.predict(X_test)

# Step 5: Evaluate the Classifier
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Training samples: 1914, Test samples: 479

Classification Report:
              precision    recall  f1-score   support

    Negative       0.72      0.68      0.70       238
    Positive       0.70      0.74      0.72       241

    accuracy                           0.71       479
   macro avg       0.71      0.71      0.71       479
weighted avg       0.71      0.71      0.71       479


Confusion Matrix:
[[163  75]
 [ 63 178]]


# Checking 5 random sentiment predictions 

In [10]:
# Pick a few random samples
sample_indices = np.random.choice(solar_df.index, size=5, replace=False)

for idx in sample_indices:
    article_text = solar_df.loc[idx, 'combined_text']
    true_sentiment = solar_df.loc[idx, 'sentiment_label']
    
    # Encode the text using the BERT model you built
    encoded_input = tokenizer(article_text, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = model(**encoded_input)
        pooled_output = mean_pooling(model_output, encoded_input['attention_mask'])
        article_embedding = pooled_output.cpu().numpy()

    # Predict using your trained Logistic Regression model
    predicted_sentiment = classifier.predict(article_embedding)[0]

    print(f"\nSample Article Text:\n{article_text[:300]}...")  # show only first 300 chars
    print(f"True Sentiment: {true_sentiment}")
    print(f"Predicted Sentiment: {predicted_sentiment}")



Sample Article Text:
More rooftop solar in cities would help solve NZ’s energy crisis - and build disaster resilience Just 14 of Auckland’s largest building rooftops add up to the same area as the biggest solar farm – but they could generate electricity where it’s most used to keep the lights on during disasters. New Ze...
True Sentiment: Negative
Predicted Sentiment: Positive

Sample Article Text:
'Virtual power plant' model could convince more Albertans to switch to solar CALGARY — An electricity retailer in Alberta is betting it can entice more homeowners to make the switch to solar panels by launching what it calls Canada's first retail, 100 per cent green energy-based "virtual power plant...
True Sentiment: Positive
Predicted Sentiment: Positive

Sample Article Text:
Adani Indicted in Major Bribery and Fraud Case Gautam Adani, chair of Adani Group, is indicted in New York over his alleged involvement in a multibillion-dollar bribery and fraud scheme. The charges involve corruptio

# Price Predictor Model

In [11]:
# Step 1: Import Libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Step 2: Train/Test Split
X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(
    X_text_features, y_price, 
    test_size=0.2, random_state=42
)

print(f"Training samples: {X_train_price.shape[0]}, Test samples: {X_test_price.shape[0]}")

# Step 3: Train Random Forest Regressor
price_model = RandomForestRegressor(n_estimators=100, random_state=42)
price_model.fit(X_train_price, y_train_price)

# Step 4: Predict on Test Set
y_pred_price = price_model.predict(X_test_price)

# Step 5: Evaluate
mae = mean_absolute_error(y_test_price, y_pred_price)
rmse = np.sqrt(mean_squared_error(y_test_price, y_pred_price))

print(f"\nPrice Prediction MAE: {mae:.2f}")
print(f"Price Prediction RMSE: {rmse:.2f}")


Training samples: 1914, Test samples: 479

Price Prediction MAE: 4.15
Price Prediction RMSE: 5.09


# Checking 5 random price predictions

In [12]:
# Pick a few random samples
sample_indices = np.random.choice(solar_df.index, size=5, replace=False)

for idx in sample_indices:
    article_text = solar_df.loc[idx, 'combined_text']
    true_price = solar_df.loc[idx, 'solar_panel_price']
    
    # Encode the article text into BERT embedding
    encoded_input = tokenizer(article_text, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = model(**encoded_input)
        pooled_output = mean_pooling(model_output, encoded_input['attention_mask'])
        article_embedding = pooled_output.cpu().numpy()

    # Predict price using your trained Random Forest Regressor
    predicted_price = price_model.predict(article_embedding)[0]

    print(f"\nSample Article Text:\n{article_text[:300]}...")  # show only first 300 chars
    print(f"True Solar Panel Price: ${true_price:.2f}")
    print(f"Predicted Solar Panel Price: ${predicted_price:.2f}")



Sample Article Text:
Solar power helps promote integrated village development in Maharashtra’s Vanvasi The project was by the Rotary Club of Bombay (RCB), in collaboration with Chirag Rural Development Foundation. The Rotary Club of Bombay (RCB), in collaboration with Chirag Rural Development Foundation, completed its i...
True Solar Panel Price: $266.89
Predicted Solar Panel Price: $259.98

Sample Article Text:
Clean energy’s dirty secret: the trail of waste left by India’s solar power boom As vast solar plants multiply, so does the scrap, set to reach 19m tonnes by 2050. But disposing of the waste often falls to informal traders who risk injury when dismantling broken panels Under the scorching sun, a sea...
True Solar Panel Price: $257.63
Predicted Solar Panel Price: $259.55

Sample Article Text:
India's energy demand to grow 2-2.5 times by 2047, coal to dominate, says Economic Survey 2023-24 Survey says there needs to be ‘orderly’ transition to mix of energy sources, which include

# Final Testing Prediction Function

In [13]:
from datetime import timedelta

def predict_sentiment_and_price(article_text, article_date=None):
    # Step 1: Encode article text to BERT embedding
    encoded_input = tokenizer(article_text, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = model(**encoded_input)
        pooled_output = mean_pooling(model_output, encoded_input['attention_mask'])
        article_embedding = pooled_output.cpu().numpy()

    # Step 2: Predict Sentiment
    predicted_sentiment = classifier.predict(article_embedding)[0]

    # Step 3: Predict Price
    predicted_price = price_model.predict(article_embedding)[0]

    # Step 4: Historical Price Stats
    min_price = solar_df['solar_panel_price'].min()
    max_price = solar_df['solar_panel_price'].max()
    avg_price = solar_df['solar_panel_price'].mean()

    # Step 5: Previous Day and Week Price
    if article_date:
        article_date = pd.to_datetime(article_date).date()
        
        previous_day = article_date - timedelta(days=1)
        previous_week = article_date - timedelta(days=7)
        
        # Find closest price for previous day and week
        prev_day_price = solar_df.loc[solar_df['publishedAt'].dt.date == previous_day, 'solar_panel_price']
        prev_week_price = solar_df.loc[solar_df['publishedAt'].dt.date == previous_week, 'solar_panel_price']
        
        prev_day_price = prev_day_price.iloc[0] if not prev_day_price.empty else None
        prev_week_price = prev_week_price.iloc[0] if not prev_week_price.empty else None
    else:
        prev_day_price = None
        prev_week_price = None

    # Step 6: Display Output
    print("\n========== Prediction Result ==========")
    print(f"Input Article (first 300 chars):\n{article_text[:300]}...")
    print(f"Article Date: {article_date}")
    print("----------------------------------------")
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print(f"Predicted Solar Panel Price: ${predicted_price:.2f}")
    print("----------------------------------------")
    print(f"Solar Panel Price Range in Dataset:")
    print(f"- Minimum Price: ${min_price:.2f}")
    print(f"- Maximum Price: ${max_price:.2f}")
    print(f"- Average Price: ${avg_price:.2f}")
    if prev_day_price:
        print(f"- Price 1 day before ({previous_day}): ${prev_day_price:.2f}")
    if prev_week_price:
        print(f"- Price 7 days before ({previous_week}): ${prev_week_price:.2f}")
    print("========================================")


# Testing on Positive Sentiment Data Sample

In [14]:
# Example usage with an article and a fake date:
predict_sentiment_and_price(
    "Solar panels see increased adoption as governments subsidize renewable energy projects.", 
    article_date='2024-03-15'  # <-- You pass article's publish date here
)


Input Article (first 300 chars):
Solar panels see increased adoption as governments subsidize renewable energy projects....
Article Date: 2024-03-15
----------------------------------------
Predicted Sentiment: Positive
Predicted Solar Panel Price: $259.10
----------------------------------------
Solar Panel Price Range in Dataset:
- Minimum Price: $247.45
- Maximum Price: $273.01
- Average Price: $260.54
- Price 1 day before (2024-03-14): $259.74
- Price 7 days before (2024-03-08): $261.88


# Testing on Negative Sentiment New Data Sample

In [15]:
predict_sentiment_and_price(
    "Solar panel production is facing major shortages due to supply chain disruptions, causing delays and price hikes across global markets.", 
    article_date='2024-03-15'   # You can adjust date if you want
)


Input Article (first 300 chars):
Solar panel production is facing major shortages due to supply chain disruptions, causing delays and price hikes across global markets....
Article Date: 2024-03-15
----------------------------------------
Predicted Sentiment: Negative
Predicted Solar Panel Price: $259.15
----------------------------------------
Solar Panel Price Range in Dataset:
- Minimum Price: $247.45
- Maximum Price: $273.01
- Average Price: $260.54
- Price 1 day before (2024-03-14): $259.74
- Price 7 days before (2024-03-08): $261.88
