In [None]:
import pandas as pd
import json
import hashlib
import re

# Read the raw course catalogue from disk.
df = pd.read_csv("D:\\Thesis\\testwmodel\\Online_Courses.csv")

# Step 1: Normalize Column Names
# Lower-case every header first, then swap spaces for underscores so the
# columns can be addressed with snake_case keys further down.
lowercased_columns = df.columns.str.lower()
df.columns = lowercased_columns.str.replace(" ", "_")

# Step 2: Clean & Normalize Text Fields

# 🛑 Robust Rating Extraction
def extract_rating(rating):
    """Pull the first number out of a rating string such as '4.7stars'.

    Returns the number as a float. Missing values, non-string values and
    strings that contain no digits all yield None.
    """
    is_usable_text = not pd.isna(rating) and isinstance(rating, str)
    if is_usable_text:
        found = re.search(r"\d+(\.\d+)?", rating)
        if found is not None:
            return float(found.group())
    return None

# Overwrite the raw rating column with the values parsed by extract_rating
# (rows where nothing could be parsed end up as missing values).
df["rating"] = df["rating"].apply(extract_rating)

# 🛑 Handle number_of_viewers extraction
def extract_number_of_viewers(viewers):
    """Extract the first integer count from a viewers string.

    Thousands separators are accepted, so '10,438 reviews' yields 10438.

    Args:
        viewers: Raw cell value, normally a string.

    Returns:
        The parsed count as an int, or 0 when the value is missing, is not a
        string, or contains no digits.
    """
    if pd.isna(viewers) or not isinstance(viewers, str):
        return 0  # Missing or non-string values carry no parsable count

    # The match must START with a digit. A bare character class such as
    # [\d,]+ would also match a lone comma (e.g. in "n/a, none"), which
    # becomes "" once the commas are removed and makes int() raise.
    match = re.search(r"\d[\d,]*", viewers)
    if match:
        # Drop thousands separators before converting
        return int(match.group().replace(",", ""))
    return 0  # No numeric part found

# Overwrite the raw viewer-count column with the integers returned by
# extract_number_of_viewers (0 where no count could be parsed).
df["number_of_viewers"] = df["number_of_viewers"].apply(extract_number_of_viewers)

# Extract duration in months
def extract_duration(duration):
    """Return the first number found in a duration string as a float.

    Decimals are supported. The unit that follows the number is not
    inspected. Missing values, non-string values and strings without any
    digits yield None.
    """
    if pd.isna(duration):
        return None
    if not isinstance(duration, str):
        return None

    number = re.search(r"\d+(\.\d+)?", duration)
    if number is None:
        return None
    return float(number.group())

# Derive the numeric duration column from the raw duration text.
df["duration_months"] = df["duration"].apply(extract_duration)


# 🛑 Handle missing lists properly
def _blank_to_none(parts):
    """Map the [""] produced by splitting an empty string to None."""
    return parts if parts != [""] else None


# Subtitles: drop the literal "Subtitles: " label, then split on ", ".
subtitle_text = df["subtitle_languages"].fillna("")
subtitle_text = subtitle_text.str.replace("Subtitles: ", "", regex=False)
df["subtitle_languages"] = subtitle_text.str.split(", ").apply(_blank_to_none)

# Skills and instructors share one recipe: trim stray commas/spaces from both
# ends, then split on ", ".
for list_column in ("skills", "instructors"):
    trimmed_text = df[list_column].fillna("").str.strip(", ")
    df[list_column] = trimmed_text.str.split(", ").apply(_blank_to_none)

# Step 3: Function to Generate a Unique Course ID
def generate_course_id(title):
    """Build a course ID from the title: 'C' plus 8 hex chars of its MD5.

    The ID is deterministic, so the same title always maps to the same ID.
    """
    digest = hashlib.md5(title.encode()).hexdigest()
    return f"C{digest[:8]}"

# Step 4: Create Processed Data Format
def _text_or_none(value):
    """Return *value* as a string, or None when it is missing or empty.

    A plain ``str(value) or None`` is not enough here: ``str(float("nan"))``
    is the truthy string "nan", so a missing cell would be exported as the
    text "nan" instead of null.
    """
    if isinstance(value, str):
        return value or None
    if pd.isna(value):  # covers both None and NaN
        return None
    return str(value)


processed_data = []

for _, row in df.iterrows():
    # Numeric fields: missing values become None so they serialize as null.
    rating = row["rating"] if not pd.isna(row["rating"]) else None
    duration = row["duration_months"] if not pd.isna(row["duration_months"]) else None
    total_reviews = int(row["number_of_viewers"]) if row["number_of_viewers"] else 0

    # Rating rescaled from the 0-5 range to a 0-100 percentage.
    positive_percentage = (
        round((rating / 5) * 100, 2) if rating is not None else None
    )

    # A present-but-empty cell is NaN, which would otherwise leak into the
    # JSON output as a bare NaN token; fall back to the same default that is
    # used when the column is absent.
    sub_category = row.get("sub-category", "Unknown")
    if pd.isna(sub_category):
        sub_category = "Unknown"

    # List-valued fields: anything that is not a list is exported as null.
    subtitle_languages = row.get("subtitle_languages", None)
    if not isinstance(subtitle_languages, list):
        subtitle_languages = None
    skills = row["skills"] if isinstance(row["skills"], list) else None
    instructors = row["instructors"] if isinstance(row["instructors"], list) else None

    course_entry = {
        "course_id": generate_course_id(str(row["title"])),  # Derived from the title
        # NOTE: a missing title is stringified (NaN -> "nan"), which is also
        # the text hashed for course_id above.
        "title": str(row.get("title", "")),

        "url": _text_or_none(row.get("url")),
        "description": _text_or_none(row.get("short_intro")),

        "category": _text_or_none(row.get("category")),
        "sub_category": sub_category,

        "course_info": {
            "provider": _text_or_none(row.get("provider")),
            "type": _text_or_none(row.get("type")),
            "language": _text_or_none(row.get("language")),
            "subtitle_languages": subtitle_languages
        },

        "rating": rating,  # None if no rating
        "positive_percentage": positive_percentage,  # None if no rating

        "duration_months": duration,  # None if no duration
        "reviews": {
            "total_reviews": total_reviews,  # Integer or 0
            "positive_percentage": positive_percentage  # None if no rating
        },

        "knowledge_requirements": {
            "teaches": skills,
            # Heuristic: the first three listed skills double as prerequisites.
            "prerequisites": skills[:3] if skills else None
        },
        "learning_path": {
            "suitable_for": ["Entry-Level AI"] if "beginner" in str(row.get("course_type", "")).lower() else ["Advanced Learners"],
            "career_paths": ["Data Scientist"] if "data science" in str(row.get("category", "")).lower() else ["AI Engineer"]
        },

        "instructors": instructors
    }

    processed_data.append(course_entry)

# Step 5: Save Processed Data to JSON File
output_path = "D:\\Thesis\\Courses-Searching\\src\\db\\processed_courses_detail.json"
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(processed_data, json_file, indent=4)

# Step 6: Print Sample Output
print(f"✅ Data processing complete. Saved to: {output_path}")
# Show only the first two entries as a preview
preview_text = json.dumps(processed_data[:2], indent=4)
print(preview_text)


✅ Data processing complete. Saved to: D:\Thesis\Courses-Searching\src\db\processed_courses_detail.json
[
    {
        "course_id": "Ccd34bc1e",
        "title": "Machine Learning Specialization",
        "url": "https://www.coursera.org/specializations/machine-learning-introduction",
        "description": "#BreakIntoAI with Machine Learning Specialization. Master fundamental AI concepts and develop practical machine learning skills in the beginner-friendly, 3-course program by AI visionary Andrew Ng",
        "category": "Data Science",
        "sub_category": "Unknown",
        "course_info": {
            "provider": null,
            "type": null,
            "language": "English",
            "subtitle_languages": [
                "English"
            ]
        },
        "rating": 4.9,
        "positive_percentage": 98.0,
        "duration_months": 3.0,
        "reviews": {
            "total_reviews": 10438,
            "positive_percentage": 98.0
        },
        "knowledg