In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules
# before each cell executes, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [8]:
import pandas as pd
import os
import requests
from pathlib import Path
import json

In [29]:
# Download the parquet inputs into ./files, skipping any that are already there.
BASE_URL = "https://github.com/kelu124/Futures/raw/refs/heads/main/data/"
files = [
    BASE_URL + "mrg_behav.parquet.gzip",
    BASE_URL + "mrg_concern.parquet.gzip",
    BASE_URL + "mrg_issue.parquet.gzip",
    BASE_URL + "articles.parquet.gzip",
    BASE_URL + "mrg_tech.parquet.gzip",
    BASE_URL + "seeds.parquet.gzip",
]

# (connect, read) timeouts in seconds; without them a stalled connection
# would block the cell forever.
DOWNLOAD_TIMEOUT = (10, 60)

output_dir = Path("files")
output_dir.mkdir(exist_ok=True)

failed = []  # file names whose download raised a requests error

for url in files:
    # The last URL segment is used as the local file name.
    filename = url.split("/")[-1]
    filepath = output_dir / filename

    # Existing files are trusted and never re-downloaded.
    if filepath.exists():
        print(f"✓ {filename} already exists, skipping...")
        continue

    # Stream into a temporary ".part" file and rename it only once the whole
    # body has been written. Otherwise an interrupted download would leave a
    # truncated file that the exists() check above accepts on the next run.
    partial_path = filepath.with_name(filename + ".part")

    print(f"⬇ Downloading {filename}...", end=" ")
    try:
        # The context manager releases the streamed connection in all cases.
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        partial_path.replace(filepath)

        file_size = filepath.stat().st_size / (1024 * 1024)  # Size in MB
        print(f"✓ Done ({file_size:.2f} MB)")

    except requests.exceptions.RequestException as e:
        print(f"✗ Failed: {e}")
        failed.append(filename)

    finally:
        # After a successful rename the temp file no longer exists, so this
        # only removes leftovers from a failed or interrupted transfer.
        if partial_path.exists():
            partial_path.unlink()

if failed:
    print(f"\nDownload finished with {len(failed)} failure(s): {', '.join(failed)}")
else:
    print("\nDownload complete!")



✓ mrg_behav.parquet.gzip already exists, skipping...
✓ mrg_concern.parquet.gzip already exists, skipping...
✓ mrg_issue.parquet.gzip already exists, skipping...
✓ articles.parquet.gzip already exists, skipping...
✓ mrg_tech.parquet.gzip already exists, skipping...
⬇ Downloading seeds.parquet.gzip... ✓ Done (2.17 MB)

Download complete!


In [33]:
# Read the parquet files, keep the relevant 2025 rows, and export them as JSON.
print("Reading parquet files...")

MIN_RELEVANCY = 4             # rows with relevancy below this are discarded
JSON_DIR = Path("docs/data")  # folder the combined JSON is written to


def _load_relevant(stem, drop_incomplete=True):
    """Load ``<stem>.parquet.gzip`` from ``output_dir`` and filter by relevancy.

    Args:
        stem: File name without the ``.parquet.gzip`` suffix.
        drop_incomplete: When True, rows containing any missing value are
            removed before the relevancy filter is applied.

    Returns:
        DataFrame with only the rows whose ``relevancy`` is at least
        ``MIN_RELEVANCY``.
    """
    frame = pd.read_parquet(output_dir / f"{stem}.parquet.gzip")
    if drop_incomplete:
        frame = frame.dropna()
    return frame[frame.relevancy >= MIN_RELEVANCY]


# Missing values become "" so the string test on `origin` never sees NaN.
df = pd.read_parquet(output_dir / "articles.parquet.gzip")
df = df.fillna("")
# File names of the articles whose `origin` starts with "2025".
lst = df[df.origin.str.startswith("2025")]["file_name"].to_list()

# NOTE(review): unlike the four merged frames, seeds rows are not passed
# through dropna(); any NaN left in them is written by json.dump as the
# non-standard token `NaN` — confirm this is intended.
df5 = _load_relevant("seeds", drop_incomplete=False)
df5 = df5[df5.src.isin(lst)]

df1 = _load_relevant("mrg_behav")
df2 = _load_relevant("mrg_concern")
df3 = _load_relevant("mrg_issue")
df4 = _load_relevant("mrg_tech")

# Convert dataframes to JSON format
print("\nConverting to JSON format...")
merged_frames = {'behav': df1, 'concern': df2, 'issue': df3, 'tech': df4}
# Only rows whose `src` is one of the selected article file names are exported;
# df1..df4 themselves are left unrestricted by source.
data = {
    key: frame[frame.src.isin(lst)].to_dict(orient='records')
    for key, frame in merged_frames.items()
}
data['seeds'] = df5.to_dict(orient='records')

# Save to JSON file
output_file = "combined_data.json"
print(f"\nSaving to {output_file}...")
# Create the destination folder first so a fresh checkout does not fail here.
JSON_DIR.mkdir(parents=True, exist_ok=True)
with open(JSON_DIR / output_file, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)

Reading parquet files...

Converting to JSON format...

Saving to combined_data.json...


In [24]:
# Show the column names of df2 (the frame loaded from mrg_concern.parquet.gzip).
df2.columns

Index(['name', 'description', 'relevancy', 'src'], dtype='object')