# In [None]:
# ===========================
# DATA HANDLING & ANALYSIS SCRIPT
# Covers: Import, Cleaning, Validation, API/JSON/HTML Parsing
# ===========================

import pandas as pd
import numpy as np
import sqlite3
import requests
import json
from bs4 import BeautifulSoup

# -------------------------------------------------------
# --- 1.1 Perform Standard Data Import, Joining and Aggregation Tasks ---
# -------------------------------------------------------

# 📥 Import data from flat files
df_csv = pd.read_csv("data/sample.csv")  # Replace with your actual path
df_excel = pd.read_excel("data/sample.xlsx")  # Requires openpyxl/xlrd

# 🛢 Import data from SQLite database
# sqlite3 connections are not closed automatically, so release the handle
# explicitly once the query result is in memory — even if the query fails.
conn = sqlite3.connect("data/sample.db")
try:
    df_sql = pd.read_sql("SELECT * FROM employees", conn)
finally:
    conn.close()

# 📊 Aggregate: mean salary per department (one row per department)
df_grouped = df_csv.groupby("department")["salary"].mean().reset_index()

# 🧩 Combine tables
# Combine by rows (append); each frame keeps its own index labels
df_combined_rows = pd.concat([df_csv, df_excel], axis=0)

# Combine by columns using key; the inner join keeps only employee_ids
# that appear in both tables
df_merged = pd.merge(df_csv, df_sql, on="employee_id", how="inner")

# 🔍 Filter data based on multiple conditions: age above 30 AND department "IT"
filtered_df = df_csv[(df_csv["age"] > 30) & (df_csv["department"] == "IT")]

# -------------------------------------------------------
# --- 1.2 Perform Cleaning Tasks ---
# -------------------------------------------------------

# 🔤 Match strings using regex: Names starting with 'A'
# na=False makes rows with a missing name count as "no match"; without it the
# mask contains NaN and pandas refuses to use it for boolean indexing.
df_filtered_names = df_csv[df_csv["name"].str.contains(r"^A", regex=True, na=False)]

# 🔁 Convert between types
# Unparseable dates become NaT instead of raising.
df_csv["date"] = pd.to_datetime(df_csv["date"], errors="coerce")
# Coerce ages to numbers: values that cannot be parsed become NaN rather than
# silently leaving the whole column unconverted (which is what
# astype(..., errors="ignore") does on failure).
df_csv["age"] = pd.to_numeric(df_csv["age"], errors="coerce")
# Only cast to int when nothing is missing — NaN cannot be stored in an int column.
if df_csv["age"].notna().all():
    df_csv["age"] = df_csv["age"].astype(int)

# 🧼 Clean text: Trim and lowercase
df_csv["department"] = df_csv["department"].str.strip().str.lower()

# 🗓 Clean date/time: Extract year and month
# Rows whose date is NaT get NaN for both derived columns.
df_csv["year"] = df_csv["date"].dt.year
df_csv["month"] = df_csv["date"].dt.month

# -------------------------------------------------------
# --- 1.3 Assess Data Quality and Perform Validation Tasks ---
# -------------------------------------------------------

# ❓ Missing values: impute null salaries with the column median
df_csv["salary"] = df_csv["salary"].fillna(value=df_csv["salary"].median())

# ✅ Validation checks
# Rows whose employee_id repeats an earlier row (the first occurrence is not flagged)
duplicates = df_csv.loc[df_csv.duplicated(subset="employee_id")]

# Rows whose age is below 18 or above 65
invalid_ages = df_csv.loc[df_csv["age"].lt(18) | df_csv["age"].gt(65)]

# 🆔 Show the dtype of every column
print("Data types:\n", df_csv.dtypes)

# Per-row flag: True where the salary value can be parsed as a number
is_salary_numeric = pd.to_numeric(df_csv["salary"], errors="coerce").notna()

# -------------------------------------------------------
# --- 1.4 Collect Data from Non-Standard Formats ---
# -------------------------------------------------------

# 🔧 Import JSON from API
api_url = "https://jsonplaceholder.typicode.com/users"
# requests applies no timeout by default; set one so a stalled server cannot
# hang the script indefinitely.
response = requests.get(api_url, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error body as JSON.
response.raise_for_status()
data = response.json()
# Build a DataFrame from the decoded JSON payload.
df_api = pd.DataFrame(data)

# 📂 Parse an HTML snippet with BeautifulSoup
html = "<html><body><h1>Title</h1><p class='content'>Hello World</p></body></html>"
soup = BeautifulSoup(html, features="html.parser")
# Locate the <p> tag carrying class "content" and take its text
paragraph = soup.find("p", attrs={"class": "content"})
content = paragraph.get_text()  # Result: 'Hello World'

# 📂 Decode a raw JSON string into a Python object
json_str = '{"name": "Alice", "age": 25}'
parsed_json = json.JSONDecoder().decode(json_str)

# --- END OF SCRIPT ---
# Final status line so a successful run is visible in the output.
print("✅ Script ran successfully. Review DataFrames for results.")
