In [1]:
import json
from pathlib import Path

# Start from the articles already saved on disk, or from an empty list when
# bangla_articles.json has not been created yet.
bangla_path = Path("bangla_articles.json")
bangla_articles = []
if bangla_path.exists():
    with open(bangla_path, "r", encoding="utf-8") as existing_file:
        bangla_articles = json.load(existing_file)

# data.json holds the incoming material: either one article object or a list.
with open("data.json", "r", encoding="utf-8") as incoming_file:
    data = json.load(incoming_file)

# Normalise the single-object case so the rest of the cell can always iterate.
data = [data] if isinstance(data, dict) else data

def convert_article(item):
    """Convert one scraped article record into the merged-corpus schema.

    Args:
        item: Mapping loaded from data.json. The keys "title", "content",
            "url" and "published_date" are read; missing keys fall back to "".

    Returns:
        A dict with the keys title, body, url, date, language, source,
        word_count and tokens. ``language`` is always "bn" and ``source`` is
        always "Prothom Alo".
    """
    # dict.get's default only applies to a *missing* key. A JSON null arrives
    # as None and would crash on .strip(), so fall back to "" explicitly.
    body = (item.get("content") or "").strip()
    words = body.split()

    return {
        "title": item.get("title", ""),
        "body": body,
        "url": item.get("url", ""),
        "date": item.get("published_date", ""),
        "language": "bn",
        "source": "Prothom Alo",
        "word_count": len(words),
        # Same whitespace-split count as word_count; no separate tokenizer is used.
        "tokens": len(words)
    }

# Convert every incoming record and add it to the in-memory corpus.
# NOTE: when bangla_articles.json already exists, bangla_articles was loaded
# from it, so each run of this cell appends the contents of data.json again.
bangla_articles.extend(convert_article(article) for article in data)

# Write the combined list back; ensure_ascii=False keeps non-ASCII text unescaped.
with open("bangla_articles.json", "w", encoding="utf-8") as out_file:
    json.dump(bangla_articles, out_file, ensure_ascii=False, indent=2)

print(f" Done! Total articles: {len(bangla_articles)}")


✅ Done! Total articles: 3094


In [2]:
# Colab-only step: pass the merged Bangla file written by the previous cell
# to google.colab's files.download helper.
from google.colab import files
files.download("bangla_articles.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
import json
import pandas as pd
from pathlib import Path

# Start from the English articles already saved on disk, or from an empty
# list when english_articles.json has not been created yet.
english_path = Path("english_articles.json")
english_articles = []
if english_path.exists():
    with open(english_path, "r", encoding="utf-8") as existing_file:
        english_articles = json.load(existing_file)

#  Function to convert CSV row to JSON article ---
def _field_text(row, key, default=""):
    """Return ``row[key]`` as a stripped string, or ``default`` if the cell is missing."""
    value = row.get(key, default)
    # Empty CSV cells come back from pandas as NaN; str(NaN) would otherwise
    # leak the literal text "nan" into the output (and "None" for None).
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return str(value).strip()


def csv_row_to_article(row):
    """Convert one CSV row into the merged-corpus article schema.

    Args:
        row: A mapping with a ``.get(key, default)`` method (a pandas Series
            from ``DataFrame.iterrows`` or a plain dict). The columns "title",
            "text", "urls", "publish_date" and "publisher" are read.

    Returns:
        A dict with the keys title, body, url, date, language, source,
        word_count and tokens. ``language`` is always "en". Missing or NaN
        cells become "" except ``source``, which becomes "Unknown".
    """
    body = _field_text(row, "text")
    words = body.split()
    return {
        "title": _field_text(row, "title"),
        "body": body,
        "url": _field_text(row, "urls"),
        "date": _field_text(row, "publish_date"),
        "language": "en",
        "source": _field_text(row, "publisher", "Unknown"),
        "word_count": len(words),
        # Same whitespace-split count as word_count; no separate tokenizer is used.
        "tokens": len(words)
    }

# Read the English news CSV into a DataFrame.
csv_file = "bd_eng_news_daily.csv"
df = pd.read_csv(csv_file)

# Convert each row and append it after the articles loaded from disk.
english_articles.extend(csv_row_to_article(row) for _, row in df.iterrows())

# Write the combined list back; ensure_ascii=False keeps non-ASCII text unescaped.
with open("english_articles.json", "w", encoding="utf-8") as out_file:
    json.dump(english_articles, out_file, ensure_ascii=False, indent=2)

print(f" Done! Total English articles: {len(english_articles)}")


 Done! Total English articles: 14455


In [4]:
# Trim the English corpus to its first 3094 entries.
# NOTE(review): 3094 equals the Bangla article total printed by the first cell;
# presumably chosen so both corpora have the same size. Confirm.
english_articles = english_articles[:3094]

# Overwrite english_articles.json with the trimmed list.
with open("english_articles.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(english_articles, ensure_ascii=False, indent=2))

print(f" Done! Total English articles: {len(english_articles)}")


✅ Done! Total English articles: 3094


In [5]:
# Relies on `files` from the `from google.colab import files` import in an
# earlier cell; that cell must have been run first in this kernel.
files.download("english_articles.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>