In [10]:
import requests
import time
import json
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta
# BeautifulSoup is used for web scraping
from bs4 import BeautifulSoup  

load_dotenv()

True

In [13]:
def search_google(api_key, cse_id, query, start_page, timeout=10):
    """Query the Google Custom Search JSON API for results from the past day.

    Args:
        api_key: Google API key, sent as the ``key`` query parameter.
        cse_id: Custom Search Engine ID, sent as the ``cx`` query parameter.
        query: Search terms, sent as ``q``.
        start_page: 1-based index of the first result to return (``start``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    params = {
        'key': api_key,
        'cx': cse_id,
        'q': query,
        'dateRestrict': 'd1',  # "d1" = restrict results to the past 1 day
        'start': start_page,
    }
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=timeout,  # without a timeout a stalled connection blocks forever
    )
    # Fail loudly on quota/auth errors rather than handing back an error payload.
    response.raise_for_status()
    return response.json()

def extract_content_from_url(url, timeout=10):
    """Download a page and return the text of its ``<p>`` elements.

    This is a best-effort scraper: any failure (connection error, timeout,
    non-2xx status, parsing problem) is printed and an empty string is
    returned, so one bad link does not abort a whole collection run.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The paragraph texts joined by single spaces, or ``""`` on failure.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx as failures so error pages are not stored as articles.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Simplified extraction: only <p> tags are considered; individual
        # sites may need a more specific selector.
        return ' '.join(p.text for p in soup.find_all('p'))
    except Exception as e:
        print(f"Error extracting content from {url}: {e}")
        return ""
    
def extract_information(result):
    """Turn one search response into a list of article records.

    Args:
        result: Decoded JSON response from the search API (a dict).

    Returns:
        A list of dicts with keys ``'title'``, ``'link'`` and ``'content'``,
        one per entry in ``result['items']``. The list is empty when the
        response carries no ``'items'`` key.
    """
    data = []
    # A response without results has no 'items' key; treat it as "nothing found"
    # instead of raising KeyError.
    for item in result.get('items', []):
        link = item['link']
        data.append({
            'title': item['title'],
            'link': link,
            # Fetch the full page text behind the search hit.
            'content': extract_content_from_url(link),
        })
    return data

def collect_data(api_key, cse_id, query, pages=1):
    """Collect article records from up to ``pages`` pages of search results.

    Args:
        api_key: Google API key passed through to ``search_google``.
        cse_id: Custom Search Engine ID passed through to ``search_google``.
        query: Search terms.
        pages: Maximum number of result pages to request (10 results each).

    Returns:
        A flat list of the records produced by ``extract_information`` for
        every page that returned results.
    """
    all_data = []
    for page in range(1, pages + 1):
        # Pages hold 10 results, so page 1 starts at 1, page 2 at 11, ...
        start_page = (page - 1) * 10 + 1
        results = search_google(api_key, cse_id, query, start_page)
        if not results.get('items'):
            # No results on this page: later pages would be empty too, so stop
            # instead of spending more API calls and sleeps.
            break
        all_data.extend(extract_information(results))
        if page < pages:
            time.sleep(2)  # Delay between requests to avoid hitting rate limits
    return all_data

In [15]:
# Retrieve API key and CSE ID from environment variables
api_key = os.getenv('GOOGLE_API_KEY')
cse_id = os.getenv('GOOGLE_CSE_ID')
if not api_key or not cse_id:
    # os.getenv returns None for unset variables; stop here with a clear
    # message instead of sending a request without credentials.
    raise RuntimeError(
        "GOOGLE_API_KEY and GOOGLE_CSE_ID must be set in the environment (e.g. via .env)."
    )

query = "Copper"

# Collecting data from the top 5 pages
collected_data = collect_data(api_key, cse_id, query, 5)

# Save the collected data to a JSON file
with open('copper.json', 'w', encoding='utf-8') as f:
    json.dump(collected_data, f, indent=4)

# No relevance filtering has happened at this point, so report plain counts.
print(f"Collected {len(collected_data)} articles.")

In [None]:
import json
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load the collected articles. The collection step in this notebook writes its
# output to 'copper.json', so read that same file (the previous name
# 'copper_news.json' is never produced here).
with open('copper.json', 'r', encoding='utf-8') as file:
    articles = json.load(file)

def clean_text(html_content):
    """Normalize article text for keyword filtering and later NLP steps.

    Steps: strip HTML markup, remove URLs and every character that is not an
    ASCII letter or whitespace, lowercase, tokenize, drop English stop words
    and lemmatize each remaining token.

    Args:
        html_content: Raw article text, possibly containing HTML.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Remove HTML tags; the separator keeps text from adjacent tags apart.
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator=' ')

    # Remove URLs, special characters, and numbers. The dot after "www" is
    # escaped on purpose: unescaped it matches any character, so "www" followed
    # by a space would also delete the next word.
    text = re.sub(r'http\S+|www\.\S+|[^A-Za-z\s]', '', text)

    # Tokenize, remove stop words, and lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())  # lowercase so stop-word lookup matches
    return ' '.join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)

# Preprocess and filter articles
preprocessed_articles = []
keywords = {'nickel', 'copper'}  # Set of relevant keywords (lowercase, like the cleaned text)

for article in articles:
    clean_content = clean_text(article['content'])
    # Keep the article if any keyword occurs as a substring of the cleaned content
    if any(keyword in clean_content for keyword in keywords):
        # Store a copy with the cleaned content instead of overwriting the
        # loaded record: `articles` keeps the raw text and re-running this
        # cell always starts from the same input.
        preprocessed_articles.append({**article, 'content': clean_content})

# Save the preprocessed and filtered articles to a JSON file
with open('preprocessed_articles.json', 'w', encoding='utf-8') as file:
    json.dump(preprocessed_articles, file, indent=4)



In [None]:
# spaCy is not imported anywhere else in this notebook, so bring it in here;
# without it this cell fails with NameError on a fresh kernel.
import spacy

# Load a pre-trained NLP model
nlp = spacy.load("en_core_web_sm")

# Function to extract entities
def extract_entities(text):
    """Run the loaded ``nlp`` pipeline on ``text`` and list its entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        processed document exposes them through ``.ents``.
    """
    found = []
    for ent in nlp(text).ents:
        found.append((ent.text, ent.label_))
    return found

# Example of extracting entities from the first article's content.
# The keyword filter can leave the list empty, so guard the [0] lookup.
if preprocessed_articles:
    entities = extract_entities(preprocessed_articles[0]['content'])
    print(entities)
else:
    print("No preprocessed articles available for entity extraction.")

In [None]:
# networkx is not imported anywhere else in this notebook, so bring it in
# here; without it this cell fails with NameError on a fresh kernel.
import networkx as nx

# Initialize an empty undirected graph that will hold entities as nodes
G = nx.Graph()

# Function to add entities to the graph
def add_entities_to_graph(entities, graph):
    """Add one node per entity to ``graph``.

    Args:
        entities: Iterable of ``(entity_text, entity_label)`` pairs.
        graph: Graph object exposing ``add_node``; it is modified in place.

    Each node is keyed by the entity text and carries the label in its
    ``type`` attribute.
    """
    for pair in entities:
        name, label = pair
        graph.add_node(name, type=label)

# Function to add a simplistic relationship between entities in the same article
def add_relationships(graph, entities):
    """Connect every pair of distinct entities that share an article.

    Args:
        graph: Graph object exposing ``add_edge``; it is modified in place.
        entities: Sequence of ``(entity_text, entity_label)`` pairs found in
            one article. Labels are not used here.

    This is a simplistic co-occurrence model: one edge per pair of entities
    in the same article. An entity mentioned several times is not linked to
    itself, because such self-loops would inflate its degree and distort any
    degree-based pruning done afterwards.
    """
    for i, (entity1, _) in enumerate(entities):
        for entity2, _ in entities[i + 1:]:
            if entity1 == entity2:
                continue  # repeated mention of the same entity: no self-loop
            graph.add_edge(entity1, entity2)

# Feed every preprocessed article into G: its entities become nodes, and
# entities that appear in the same article get linked.
for article in preprocessed_articles:
    content = article['content']
    entities = extract_entities(content)
    add_entities_to_graph(entities=entities, graph=G)
    add_relationships(graph=G, entities=entities)

In [None]:
# matplotlib is not imported anywhere else in this notebook, so bring it in
# here; without it plt.figure fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Prune nodes with a low degree
degree_threshold = 2  # Define your own threshold
low_degree_nodes = [node for node, degree in G.degree() if degree < degree_threshold]
# NOTE: this modifies G in place. Removing nodes lowers the degree of their
# neighbours, so re-running this cell prunes further; rebuild G first if you
# need the same result again.
G.remove_nodes_from(low_degree_nodes)

# Recalculate layout with increased spacing. The seed fixes the random initial
# positions so the figure is identical on every run.
pos = nx.spring_layout(G, k=1.0, iterations=100, seed=42)  # You may need to tweak 'k' based on your graph size

# Draw the pruned graph
plt.figure(figsize=(20, 20))
nx.draw(G, pos, with_labels=False, node_size=100, node_color='blue', alpha=0.7)
plt.show()