### Student Performance

In [9]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the dataset is identical on every run.
np.random.seed(42)

# Size of the cohort (one row per student).
num_students = 100

# Consecutive student IDs: 1001, 1002, ..., 1000 + num_students.
student_ids = 1001 + np.arange(num_students)

# Whole-number attendance percentages from 55 to 100 inclusive
# (randint's upper bound is exclusive, hence 101).
attendance = np.random.randint(low=55, high=101, size=num_students)

# Function to generate subject marks based on attendance
def generate_marks(attendance, difficulty_factor):
    """
    Simulate subject marks that rise with attendance.

    Each mark is ``attendance * difficulty_factor`` plus Gaussian noise
    (mean 0, standard deviation 5) drawn from NumPy's global random state,
    then clipped to the range 25-100 and rounded to a whole number.

    attendance: percentage attendance, one value per student; a NumPy array
        or a plain list/tuple of numbers
    difficulty_factor: subject difficulty multiplier (a smaller factor
        produces lower marks)

    Returns the marks as floats holding whole-number values, one per
    attendance entry.
    """
    # np.multiply / np.shape accept plain sequences as well as arrays;
    # `list * float` and friends would otherwise raise TypeError.
    base_score = np.multiply(attendance, difficulty_factor)
    noise = np.random.normal(0, 5, np.shape(attendance))  # natural variation
    marks = base_score + noise
    return np.clip(marks, 25, 100).round(0)

# One difficulty multiplier per subject, evaluated strictly in this order so
# the random noise is consumed exactly as before (Maths & Science are the
# harder subjects, hence the lowest factors).
english, hindi, maths, science, social = (
    generate_marks(attendance, factor)
    for factor in (0.85, 0.88, 0.80, 0.78, 0.82)
)

# Column name -> marks array, in the order the columns appear in the CSV.
subject_marks = {
    "English": english,
    "Hindi": hindi,
    "Mathematics": maths,
    "Science": science,
    "Social_Studies": social,
}

# Identifiers and attendance first, then every subject as integer marks.
df = pd.DataFrame({
    "Student_ID": student_ids,
    "Attendance_Percentage": attendance,
    **{subject: marks.astype(int) for subject, marks in subject_marks.items()},
})

# Write the table without the pandas index column.
df.to_csv("student_performance_data.csv", index=False)

print("CSV file 'student_performance_data.csv' generated successfully!")


CSV file 'student_performance_data.csv' generated successfully!


### Sales

In [11]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the sales dataset is reproducible.
np.random.seed(7)

# How many orders to generate.
num_records = 100

# Consecutive order IDs starting at 5001.
order_ids = 5001 + np.arange(num_records)

# Each order gets a random day between 1 Jan 2025 and 30 Jun 2025.
date_pool = pd.date_range(start="2025-01-01", end="2025-06-30")
order_dates = pd.to_datetime(np.random.choice(date_pool, num_records))

# Region -> share of orders (North & South sell more).
region_weights = {"North": 0.35, "South": 0.30, "East": 0.20, "West": 0.15}
regions = np.random.choice(
    list(region_weights),
    size=num_records,
    p=list(region_weights.values()),
)

# Category -> share of orders.
category_weights = {"Electronics": 0.45, "Furniture": 0.30, "Stationery": 0.25}
categories = np.random.choice(
    list(category_weights),
    size=num_records,
    p=list(category_weights.values()),
)

# Single catalogue: category -> {product: base price}.
catalog = {
    "Electronics": {"Laptop": 55000, "Mobile": 22000, "Headphones": 2500},
    "Furniture": {"Chair": 3200, "Table": 7500, "Cupboard": 18000},
    "Stationery": {"Notebook": 120, "Pen": 20, "Backpack": 1500},
}

# Category -> list of product names.
product_map = {cat: list(items) for cat, items in catalog.items()}

# Product name -> base price.
price_map = {
    name: price
    for items in catalog.values()
    for name, price in items.items()
}

# Pick one product per order from the order's category (one draw per order).
products = []
for cat in categories:
    products.append(np.random.choice(product_map[cat]))

# Units per order: 1 to 7 inclusive.
quantity = np.random.randint(1, 8, num_records)

# Base price scaled by a random factor in [0.9, 1.1), truncated to an integer.
unit_price = []
for p in products:
    variation = np.random.uniform(0.9, 1.1)
    unit_price.append(int(price_map[p] * variation))

# Order total = quantity x unit price.
sales_amount = quantity * np.asarray(unit_price)

# Build the table and order it chronologically (realistic trend view).
df = pd.DataFrame({
    "Order_ID": order_ids,
    "Order_Date": order_dates,
    "Region": regions,
    "Category": categories,
    "Product": products,
    "Quantity": quantity,
    "Unit_Price": unit_price,
    "Sales_Amount": sales_amount
}).sort_values(by="Order_Date")

# Write the table without the pandas index column.
df.to_csv("sales_performance_data.csv", index=False)

print("CSV file 'sales_performance_data.csv' generated successfully!")


CSV file 'sales_performance_data.csv' generated successfully!


### Employee

In [12]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the attendance log is reproducible.
np.random.seed(21)

# Size of the workforce and number of business days to log.
num_employees = 20
num_days = 5  # working days
dates = pd.date_range(start="2025-03-01", periods=num_days, freq="B")

# Consecutive employee IDs starting at 2001.
employee_ids = 2001 + np.arange(num_employees)

# Department -> probability that an employee belongs to it.
department_weights = {
    "HR": 0.15,
    "Finance": 0.20,
    "IT": 0.25,
    "Sales": 0.20,
    "Operations": 0.20,
}
departments = np.random.choice(
    list(department_weights),
    size=num_employees,
    p=list(department_weights.values()),
)

# Possible statuses; each probability list below follows this order.
status_labels = ["Present", "Absent", "Leave"]

# Department -> [P(Present), P(Absent), P(Leave)].
attendance_prob = {
    "HR": [0.88, 0.07, 0.05],
    "Finance": [0.85, 0.10, 0.05],
    "IT": [0.80, 0.12, 0.08],
    "Sales": [0.75, 0.15, 0.10],
    "Operations": [0.72, 0.18, 0.10]
}

# One row per (employee, day); employees vary slowest, days fastest, and a
# status is drawn for each row in that order.
records = [
    [emp_id, dept, date, np.random.choice(status_labels, p=attendance_prob[dept])]
    for emp_id, dept in zip(employee_ids, departments)
    for date in dates
]

# Tabulate the log.
df = pd.DataFrame(
    records,
    columns=["Employee_ID", "Department", "Date", "Attendance_Status"]
)

# Write the table without the pandas index column.
df.to_csv("employee_attendance_data.csv", index=False)

print("CSV file 'employee_attendance_data.csv' generated successfully!")


CSV file 'employee_attendance_data.csv' generated successfully!


### Expense

In [13]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the expense dataset is reproducible.
np.random.seed(14)

# How many expense entries to generate.
num_records = 100

# Consecutive expense IDs starting at 9001.
expense_ids = 9001 + np.arange(num_records)

# Each expense gets a random day between 1 Jan 2025 and 30 Jun 2025.
expense_dates = np.random.choice(
    pd.date_range(start="2025-01-01", end="2025-06-30"),
    size=num_records,
)

# Expense category -> probability of being picked.
category_weights = {
    "Rent": 0.20,
    "Food": 0.30,
    "Transport": 0.15,
    "Utilities": 0.15,
    "Entertainment": 0.12,
    "Medical": 0.08,
}
categories = np.random.choice(
    list(category_weights),
    size=num_records,
    p=list(category_weights.values()),
)

# Payment mode -> probability of being picked.
payment_mode_weights = {
    "UPI": 0.45,
    "Cash": 0.25,
    "Debit Card": 0.20,
    "Credit Card": 0.10,
}
payment_modes = np.random.choice(
    list(payment_mode_weights),
    size=num_records,
    p=list(payment_mode_weights.values()),
)

# Category-wise realistic amount generation
# Expense category -> (low, high) bounds handed to np.random.randint, so the
# low bound is inclusive and the high bound is exclusive.
_AMOUNT_RANGES = {
    "Rent": (8000, 15000),
    "Food": (150, 1200),
    "Transport": (50, 600),
    "Utilities": (500, 2500),
    "Entertainment": (200, 3000),
    "Medical": (300, 4000),
}


def generate_amount(category):
    """
    Return a random whole-number amount for one expense of ``category``.

    category: expense category name; must be one of "Rent", "Food",
        "Transport", "Utilities", "Entertainment" or "Medical"

    Returns an integer drawn from NumPy's global random state within the
    range configured for the category (upper bound exclusive).

    Raises ValueError for any other category. The earlier if/elif chain fell
    through and returned None, which silently put missing amounts into the
    generated dataset.
    """
    try:
        low, high = _AMOUNT_RANGES[category]
    except KeyError:
        raise ValueError(f"Unknown expense category: {category!r}") from None
    return np.random.randint(low, high)

# Draw one amount per record, in record order (every call consumes random
# state, so the order matters for reproducibility).
amounts = []
for cat in categories:
    amounts.append(generate_amount(cat))

# Category -> short human-readable description.
description_map = {
    "Rent": "House Rent",
    "Food": "Grocery / Meal",
    "Transport": "Travel Expense",
    "Utilities": "Electricity / Water Bill",
    "Entertainment": "Leisure Expense",
    "Medical": "Medical Expense"
}

# Look up the description that matches each record's category.
descriptions = []
for cat in categories:
    descriptions.append(description_map[cat])

# Build the table and order it chronologically for trend analysis.
df = pd.DataFrame({
    "Expense_ID": expense_ids,
    "Expense_Date": pd.to_datetime(expense_dates),
    "Category": categories,
    "Description": descriptions,
    "Payment_Mode": payment_modes,
    "Amount": amounts
}).sort_values(by="Expense_Date")

# Write the table without the pandas index column.
df.to_csv("monthly_expense_data.csv", index=False)

print("CSV file 'monthly_expense_data.csv' generated successfully!")


CSV file 'monthly_expense_data.csv' generated successfully!


In [14]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the inventory dataset is reproducible.
np.random.seed(30)

# How many inventory lines to generate.
num_products = 30

# Consecutive product IDs starting at 3001.
product_ids = 3001 + np.arange(num_products)

# Category -> probability of being picked.
category_weights = {"Electronics": 0.40, "Stationery": 0.35, "Furniture": 0.25}
categories = np.random.choice(
    list(category_weights),
    size=num_products,
    p=list(category_weights.values()),
)

# Category -> product names available in it.
product_map = {
    "Electronics": ["Laptop", "Mobile", "Keyboard", "Mouse", "Printer"],
    "Stationery": ["Notebook", "Pen", "Marker", "Stapler", "File"],
    "Furniture": ["Chair", "Table", "Cupboard", "Desk"]
}

# Supplier pool.
suppliers = ["ABC Traders", "Global Supply Co", "Prime Distributors"]

# One product name per line, drawn from that line's category.
products = [np.random.choice(product_map[cat]) for cat in categories]

# Units received per line: 50 to 299.
stock_in = np.random.randint(50, 300, num_products)

# Units sold/used per line: at least 40% of what was received (rounded
# down) and strictly less than the received amount.
stock_out = np.array([np.random.randint(int(s * 0.4), s) for s in stock_in])

# Units still on hand.
available_stock = stock_in - stock_out

# Reorder thresholds: (low, high) randint bounds per category; categories not
# listed here (i.e. Stationery) use the default range.
reorder_ranges = {"Electronics": (15, 30), "Furniture": (10, 20)}
default_reorder_range = (20, 40)
reorder_level = [
    np.random.randint(*reorder_ranges.get(cat, default_reorder_range))
    for cat in categories
]

# Random supplier for each line.
supplier_list = np.random.choice(suppliers, size=num_products)

# Tabulate the inventory.
df = pd.DataFrame({
    "Product_ID": product_ids,
    "Product_Name": products,
    "Category": categories,
    "Supplier": supplier_list,
    "Stock_In": stock_in,
    "Stock_Out": stock_out,
    "Available_Stock": available_stock,
    "Reorder_Level": reorder_level
})

# Write the table without the pandas index column.
df.to_csv("inventory_management_data.csv", index=False)

print("CSV file 'inventory_management_data.csv' generated successfully!")


CSV file 'inventory_management_data.csv' generated successfully!


In [15]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the admission dataset is reproducible.
np.random.seed(50)

# How many admissions to generate.
num_records = 120

# Consecutive admission IDs starting at 7001.
admission_ids = 7001 + np.arange(num_records)

# Admission year -> share of admissions (later years weigh more: growth).
year_weights = {2021: 0.15, 2022: 0.17, 2023: 0.20, 2024: 0.23, 2025: 0.25}
years = np.random.choice(
    list(year_weights),
    size=num_records,
    p=list(year_weights.values()),
)

# Course -> popularity weight.
course_weights = {
    "ITI Electrician": 0.25,
    "ITI Fitter": 0.20,
    "COPA": 0.30,
    "Welder": 0.15,
    "Plumber": 0.10,
}
courses = np.random.choice(
    list(course_weights),
    size=num_records,
    p=list(course_weights.values()),
)

# Gender -> share (slightly male dominant).
gender_weights = {"Male": 0.60, "Female": 0.40}
genders = np.random.choice(
    list(gender_weights),
    size=num_records,
    p=list(gender_weights.values()),
)

# Reservation category -> share.
category_weights = {"General": 0.42, "OBC": 0.32, "SC": 0.18, "ST": 0.08}
categories = np.random.choice(
    list(category_weights),
    size=num_records,
    p=list(category_weights.values()),
)

# Tabulate the admissions.
df = pd.DataFrame({
    "Admission_ID": admission_ids,
    "Admission_Year": years,
    "Course": courses,
    "Gender": genders,
    "Category": categories
})

# Write the table without the pandas index column.
df.to_csv("institute_admission_data.csv", index=False)

print("CSV file 'institute_admission_data.csv' generated successfully!")


CSV file 'institute_admission_data.csv' generated successfully!


In [16]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the feedback dataset is reproducible.
np.random.seed(77)

# How many feedback entries to generate.
num_records = 100

# Consecutive feedback IDs starting at 8001.
feedback_ids = 8001 + np.arange(num_records)

# Each entry gets a random day between 1 Jan 2025 and 30 Jun 2025.
feedback_dates = np.random.choice(
    pd.date_range(start="2025-01-01", end="2025-06-30"),
    size=num_records,
)

# Product / service -> probability of being reviewed.
product_weights = {
    "Mobile App": 0.25,
    "Website": 0.20,
    "Customer Support": 0.20,
    "Delivery Service": 0.20,
    "Payment System": 0.15,
}
products = np.random.choice(
    list(product_weights),
    size=num_records,
    p=list(product_weights.values()),
)

# Star rating -> probability (skewed toward higher ratings).
rating_weights = {1: 0.05, 2: 0.10, 3: 0.20, 4: 0.35, 5: 0.30}
ratings = np.random.choice(
    list(rating_weights),
    size=num_records,
    p=list(rating_weights.values()),
)

# Sentiment follows the rating: 4-5 positive, 3 neutral, 1-2 negative.
feedback_type = [
    "Positive" if r >= 4 else ("Neutral" if r == 3 else "Negative")
    for r in ratings
]

# Channel -> probability that feedback arrived through it.
channel_weights = {"App": 0.40, "Website": 0.30, "Email": 0.20, "Call Center": 0.10}
channels = np.random.choice(
    list(channel_weights),
    size=num_records,
    p=list(channel_weights.values()),
)

# Build the table and order it chronologically.
df = pd.DataFrame({
    "Feedback_ID": feedback_ids,
    "Feedback_Date": pd.to_datetime(feedback_dates),
    "Product": products,
    "Rating": ratings,
    "Feedback_Type": feedback_type,
    "Channel": channels
}).sort_values(by="Feedback_Date")

# Write the table without the pandas index column.
df.to_csv("customer_feedback_data.csv", index=False)

print("CSV file 'customer_feedback_data.csv' generated successfully!")


CSV file 'customer_feedback_data.csv' generated successfully!


In [17]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the workout log is reproducible.
np.random.seed(101)

# How many workout sessions to generate.
num_records = 100

# Consecutive workout IDs starting at 6001.
workout_ids = 6001 + np.arange(num_records)

# One session per business day: num_records consecutive business days
# counted from 2025-01-01 (100 sessions span roughly 20 weeks).
workout_dates = pd.date_range(start="2025-01-01", periods=num_records, freq="B")

# Exercise pools by type.
strength_exercises = ["Bench Press", "Squats", "Deadlift", "Shoulder Press", "Bicep Curl"]
cardio_exercises = ["Treadmill", "Cycling", "Skipping"]

exercise_types = []
exercises = []

# Per session: a uniform draw below 0.7 means a strength day, otherwise a
# cardio day; then one exercise is picked from the matching pool.
for _ in range(num_records):
    if np.random.rand() < 0.7:
        pool, kind = strength_exercises, "Strength"
    else:
        pool, kind = cardio_exercises, "Cardio"
    exercises.append(np.random.choice(pool))
    exercise_types.append(kind)

# Starting weight (kg) for each strength exercise.
base_weights = {
    "Bench Press": 40,
    "Squats": 50,
    "Deadlift": 60,
    "Shoulder Press": 30,
    "Bicep Curl": 15
}

weights = []
reps = []
duration = []
calories = []

# Multiplier that climbs linearly from 1.0 (first session) to 1.2 (last).
progress_factor = np.linspace(1.0, 1.2, num_records)

for ex, factor in zip(exercises, progress_factor):
    if ex not in base_weights:
        # Cardio session: no weight or reps, only duration and calories.
        weights.append(0)
        reps.append(0)
        duration.append(np.random.randint(20, 40))
        calories.append(np.random.randint(200, 450))
        continue

    # Strength session: progressed base weight with a -3..+3 kg wobble,
    # truncated to whole kilograms and never below 10 kg.
    raw_weight = base_weights[ex] * factor + np.random.randint(-3, 4)
    weights.append(max(int(raw_weight), 10))
    reps.append(np.random.choice([8, 10, 12]))
    duration.append(np.random.randint(25, 45))
    calories.append(np.random.randint(180, 350))

# Tabulate the workout log.
df = pd.DataFrame({
    "Workout_ID": workout_ids,
    "Workout_Date": workout_dates,
    "Exercise": exercises,
    "Exercise_Type": exercise_types,
    "Weight_kg": weights,
    "Reps": reps,
    "Duration_Min": duration,
    "Calories_Burned": calories
})

# Write the table without the pandas index column.
df.to_csv("fitness_progress_data.csv", index=False)

print("CSV file 'fitness_progress_data.csv' generated successfully!")


CSV file 'fitness_progress_data.csv' generated successfully!


In [18]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the travel log is reproducible.
np.random.seed(88)

# How many trips to generate.
num_records = 100

# Consecutive travel IDs starting at 10001.
travel_ids = 10001 + np.arange(num_records)

# Each trip gets a random day between 1 Jan 2025 and 30 Jun 2025.
travel_dates = np.random.choice(
    pd.date_range(start="2025-01-01", end="2025-06-30"),
    size=num_records,
)

# Transport mode -> probability of being used.
mode_weights = {"Bus": 0.30, "Train": 0.20, "Bike": 0.25, "Auto": 0.15, "Cab": 0.10}
modes = np.random.choice(
    list(mode_weights),
    size=num_records,
    p=list(mode_weights.values()),
)

# Trip purpose -> probability.
purpose_weights = {"Office": 0.45, "Personal": 0.30, "Shopping": 0.15, "Medical": 0.10}
purposes = np.random.choice(
    list(purpose_weights),
    size=num_records,
    p=list(purpose_weights.values()),
)

# Mode -> ((min km, max km exclusive), (min rate per km, max rate per km)).
fare_rules = {
    "Bus": ((5, 30), (1.5, 2.5)),
    "Train": ((10, 80), (1.2, 2.0)),
    "Bike": ((3, 40), (2.0, 3.0)),
    "Auto": ((2, 25), (5.0, 7.0)),
}
# Any mode without an entry above (i.e. Cab) uses this rule.
cab_rule = ((5, 60), (10.0, 15.0))

distance = []
cost = []

# Per trip: draw the distance first, then the per-km rate; the cost is
# their product truncated to a whole number.
for mode in modes:
    (km_low, km_high), (rate_low, rate_high) = fare_rules.get(mode, cab_rule)
    d = np.random.randint(km_low, km_high)
    c = d * np.random.uniform(rate_low, rate_high)
    distance.append(d)
    cost.append(int(c))

# Build the table and order it chronologically.
df = pd.DataFrame({
    "Travel_ID": travel_ids,
    "Travel_Date": pd.to_datetime(travel_dates),
    "Mode": modes,
    "Distance_km": distance,
    "Cost": cost,
    "Purpose": purposes
}).sort_values(by="Travel_Date")

# Write the table without the pandas index column.
df.to_csv("transport_expense_data.csv", index=False)

print("CSV file 'transport_expense_data.csv' generated successfully!")


CSV file 'transport_expense_data.csv' generated successfully!


In [21]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the consumption dataset is reproducible.
np.random.seed(123)

# Calendar covered by the dataset.
years = [2021, 2022, 2023, 2024, 2025]
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# Month -> season, built by pairing `months` with one season label each:
# Jan-Feb Winter, Mar-May Summer, Jun-Sep Monsoon, Oct-Dec Winter.
season_map = dict(zip(
    months,
    ["Winter"] * 2 + ["Summer"] * 3 + ["Monsoon"] * 4 + ["Winter"] * 3,
))

# Season -> (low, high) bounds for the monthly base units (high exclusive).
base_units = {
    "Winter": (160, 260),
    "Summer": (280, 420),
    "Monsoon": (220, 330)
}

records = []

for year in years:
    # Consumption grows by 5% of the 2021 level for every year after 2021.
    growth_factor = 1 + 0.05 * (year - 2021)
    for month in months:
        season = season_map[month]
        unit_bounds = base_units[season]

        # Draw the units first, then the per-unit tariff; both results are
        # truncated to whole numbers.
        units = int(growth_factor * np.random.randint(*unit_bounds))
        tariff = np.random.uniform(5.5, 6.5)
        cost = int(units * tariff)

        records.append([year, month, units, cost, season])

# Tabulate one row per (year, month).
df = pd.DataFrame(
    records,
    columns=["Year", "Month", "Units_Consumed", "Cost", "Season"]
)

# Write the table without the pandas index column.
df.to_csv("electricity_consumption_2021_2025.csv", index=False)

print("CSV file 'electricity_consumption_2021_2025.csv' generated successfully!")

CSV file 'electricity_consumption_2021_2025.csv' generated successfully!
