In [None]:
import pandas as pd
import random
import datetime

# --- The list of column headers ---
# Column names for the synthetic employee-activity dataset; passed to
# Generate_dummy_data as `column_headers`. Use len(ALL_HEADERS) for the exact
# count rather than relying on a number written in a comment.
ALL_HEADERS = [
    'employee_id', 'punch_in_time', 'punch_out_time', 'total_logged_hours_day', 'late_arrival_flag', 
    'early_departure_flag', 'break_duration_minutes', 'biometric_match_score', 'punch_location', 
    'attendance_status', 'overtime_hours', 'late_start_count_per_week', 'early_exit_count_per_week', 
    'total_hours_per_week', 'variance_in_work_hours', 'absenteeism_rate', 'avg_daily_hours', 
    'punctuality_score', 'meeting_hours_per_week', 'meeting_counts_per_week', 'avg_meeting_duration_minutes', 
    'back_to_back_meetings_count', 'after_hours_meetings_count', 'meeting_accepted_ratio', 
    'meeting_organizer_count', 'meeting_attendee_count_avg', 'focus_time_hours_per_week', 
    'largest_meeting_attendees', 'recurring_meeting_percentage', 'messages_sent_per_day', 
    'messages_received_per_day', 'messages_sent_per_week', 'messages_received_per_week', 
    'avg_response_latency_min', 'communication_burstiness', 'after_hours_message_ratio', 
    'communication_balance', 'conversation_length_avg', 'channels_active_count', 'dm_vs_channel_ratio', 
    'emoji_reaction_count', 'mentions_received_count', 'message_edit_ratio', 'urgent_message_count', 
    'status_available_percentage', 'status_busy_percentage', 'status_away_percentage', 
    'status_dnd_percentage', 'avg_tasks_assigned_per_week', 'avg_tasks_completed_per_week', 
    'task_completion_rate', 'avg_task_age_days', 'overdue_task_ratio', 'overdue_task_count', 
    'in_progress_task_count', 'story_points_committed_per_week', 'story_points_completed_per_week', 
    'unique_projects_count', 'context_switching_score', 'logged_hours_per_week', 'logged_hours_per_day', 
    'worklog_entry_count_per_week', 'avg_worklog_session_duration', 'worklog_variance', 
    'billable_hours_percentage', 'github_commits_per_day', 'github_commits_per_week', 
    'github_commit_days_active', 'github_lines_added_per_week', 'github_lines_deleted_per_week', 
    'github_lines_changed_total', 'github_avg_commit_size', 'github_commit_message_length_avg', 
    'github_weekend_commit_ratio', 'github_prs_per_week', 'github_prs_merged_count', 
    'github_prs_open_count', 'github_pr_merge_rate', 'github_avg_pr_merge_time_hours', 
    'github_pr_comment_count_avg', 'github_pr_review_cycles_avg', 'github_pr_size_lines_avg', 
    'github_reviews_per_week', 'github_review_comments_given', 'github_review_approval_rate', 
    'github_review_response_time_hours', 'github_issues_created_per_week', 'github_issues_closed_per_week', 
    'github_issue_close_rate', 'github_repo_context_switching', 'github_activity_consistency', 
    'github_primary_repo_percentage', 'github_language_diversity', 'emails_sent_per_day', 
    'emails_received_per_day', 'email_response_rate', 'email_avg_response_time_hours', 
    'unread_email_count', 'emails_with_attachments_ratio', 'email_importance_high_ratio', 
    'email_thread_depth_avg', 'after_hours_email_ratio', 'calendar_fragmentation_score', 
    'deep_work_hours_per_week', 'collaboration_network_size', 'network_centrality_score', 
    'internal_external_comm_ratio', 'avg_Jira_ticket_resolution_time_days', 'jira_tickets_created_per_week', 
    'jira_tickets_closed_per_week', 'jira_ticket_reopen_rate', 'jira_bug_count_per_week', 
    'productivity_index', 'collaboration_score', 'focus_efficiency', 'multitasking_load', 
    'burnout_risk_score', 'work_life_balance_score', 'stress_indicator', 'workload_intensity', 
    'consistency_score', 'morning_activity_ratio', 'afternoon_activity_ratio', 'evening_activity_ratio', 
    'night_activity_ratio', 'weekend_work_ratio',
    # --- New headers added ---
    # NOTE(review): unlike the snake_case names above, these two use spaces and
    # capital letters, and 'Effciency' is a misspelling of 'Efficiency'. They are
    # left untouched here because the exact strings become CSV column names that
    # downstream consumers may already depend on — confirm before renaming.
    'Wellbeing Score', 'Overall Effciency score'
]


# --- The Single Function to Generate Dummy Data ---

def Generate_dummy_data(column_headers, num_rows=100, seed=None):
    """
    Generate a DataFrame of dummy data, choosing each column's value range
    from keywords found in its header.

    The header is normalised (lower-cased, spaces and hyphens turned into
    underscores) and classified ONCE per column; the resulting value factory
    is then called `num_rows` times. Rules are checked from most specific to
    most generic, so the kind of quantity a column measures (ratio,
    percentage, count) wins over incidental words in its name such as
    "after_hours" or "focus_time".

    After generation, two consistency fixes are applied when the relevant
    columns are present (matched by exact name):
      * `punch_out_time` is rebuilt as `punch_in_time` plus 7.5-9.5 hours and
        0-59 minutes, so it always falls after the punch-in.
      * `total_logged_hours_day` is recomputed as (punch_out - punch_in) in
        hours minus `break_duration_minutes` / 60, floored at 0 and rounded
        to 2 decimals.

    Uses only standard Python libraries (random, datetime) and pandas.

    Args:
        column_headers (list): A list of string column names. (e.g., ALL_HEADERS)
        num_rows (int): The number of rows (employees) to generate. Zero or a
            negative number yields an empty DataFrame that still has all columns.
        seed (int | None): When given, values are drawn from a private
            `random.Random(seed)` so the same arguments always produce the
            same DataFrame. When None (default) the global `random` module is
            used, so a prior `random.seed(...)` call still takes effect.

    Returns:
        pd.DataFrame: A DataFrame populated with dummy data, with one column
        per distinct header, in the order given.
    """
    print(f"Starting to generate {num_rows} rows for {len(column_headers)} columns...")

    # Both the `random` module and a `random.Random` instance expose the
    # randint/uniform/choice/shuffle functions used below.
    rng = random if seed is None else random.Random(seed)
    base_date = datetime.datetime(2023, 1, 1)

    # Create unique employee IDs, handed out in shuffled order.
    employee_ids = [f"E{1001 + i}" for i in range(num_rows)]
    rng.shuffle(employee_ids)

    # --- Small factories: each returns a zero-argument callable producing one value ---
    def timestamp(first_hour, last_hour):
        return lambda: base_date + datetime.timedelta(
            days=rng.randint(0, 364),
            hours=rng.randint(first_hour, last_hour),
            minutes=rng.randint(0, 59),
        )

    def uniform(low, high, digits=2):
        return lambda: round(rng.uniform(low, high), digits)

    def integer(low, high):
        return lambda: rng.randint(low, high)

    def choice(options):
        return lambda: rng.choice(options)

    def value_factory(col):
        """Return the value-producing callable for one column header."""
        # Normalise so headers like 'Wellbeing Score' match the '_score' rule.
        key = col.lower().replace(' ', '_').replace('-', '_')

        # --- IDs ---
        if key == 'employee_id':
            return iter(employee_ids).__next__

        # --- Timestamps and dates ---
        if 'punch_in_time' in key:
            return timestamp(8, 10)
        if 'punch_out_time' in key:
            return timestamp(17, 19)
        # Only a trailing '_time' marks a timestamp; names such as
        # 'focus_time_hours_per_week' are durations and are handled below.
        if key.endswith(('_time', '_timestamp')) or '_date' in key:
            return timestamp(0, 23)

        # --- Flags (boolean) ---
        if '_flag' in key:
            return choice([True, False])

        # --- Columns with their own specific ranges ---
        if 'break_duration_minutes' in key:
            return uniform(15.0, 60.0)
        if 'avg_meeting_duration_minutes' in key:
            return uniform(15.0, 90.0)
        if 'avg_response_latency_min' in key:
            return uniform(5.0, 120.0)
        # Checked before the generic '_hours' rule so 'variance_in_work_hours'
        # actually reaches its intended range.
        if 'worklog_variance' in key or 'variance_in_work_hours' in key:
            return uniform(-5.0, 5.0)
        if 'communication_burstiness' in key:
            return uniform(0.0, 5.0)
        if 'largest_meeting_attendees' in key:
            return integer(5, 50)

        # --- Statuses and locations ---
        if 'punch_location' in key:
            return choice(['Office-A', 'Office-B', 'Remote', 'Client-Site'])
        if 'attendance_status' in key:
            return choice(['Present', 'Absent', 'On-Leave'])

        # --- Scores, ratios, rates (floats 0-1) and percentages (floats 0-100) ---
        # These come before the '_hours' rule so that e.g.
        # 'after_hours_message_ratio' is a ratio and
        # 'billable_hours_percentage' is a percentage.
        if '_score' in key or '_rate' in key or '_ratio' in key or 'consistency' in key:
            return uniform(0.0, 1.0, 4)
        if '_percentage' in key:
            return uniform(0.0, 100.0)

        # --- Counts (integers) ---
        if '_count' in key or 'commits' in key or 'lines' in key or 'messages' in key:
            return integer(0, 100)

        # --- Generic durations and hours ---
        if '_hours' in key or 'duration' in key:
            return uniform(0.5, 8.0)

        # --- Averages ---
        if '_avg' in key or 'avg_' in key:
            return uniform(0.0, 100.0)

        # --- Default fallback ---
        return integer(0, 10)

    data = {}
    for col in column_headers:
        make_value = value_factory(col)
        data[col] = [make_value() for _ in range(num_rows)]

    # --- Post-processing for logical consistency ---
    # Done on the plain lists (before building the DataFrame) so it also works
    # when there are zero rows and pandas cannot infer a datetime dtype.
    if 'punch_in_time' in data and 'punch_out_time' in data:
        # Ensure punch_out_time is after punch_in_time
        data['punch_out_time'] = [
            punch_in + datetime.timedelta(
                hours=rng.uniform(7.5, 9.5), minutes=rng.randint(0, 59)
            )
            for punch_in in data['punch_in_time']
        ]

    required_for_hours = (
        'total_logged_hours_day',
        'punch_in_time',
        'punch_out_time',
        'break_duration_minutes',
    )
    if all(name in data for name in required_for_hours):
        # Calculate total_logged_hours_day from punch times minus break
        data['total_logged_hours_day'] = [
            round(
                max(
                    0.0,
                    (punch_out - punch_in).total_seconds() / 3600.0
                    - break_minutes / 60.0,
                ),
                2,
            )
            for punch_in, punch_out, break_minutes in zip(
                data['punch_in_time'],
                data['punch_out_time'],
                data['break_duration_minutes'],
            )
        ]

    df = pd.DataFrame(data)

    print("...Data generation complete.")
    return df

In [None]:
# Generate the synthetic employee dataset.
# Requires Generate_dummy_data and ALL_HEADERS from the cell above to be
# defined in the current kernel (run that cell first).

# Seed the global `random` generator, which Generate_dummy_data draws from,
# so that Restart Kernel -> Run All reproduces the same dataset every time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

number_of_employees = 5000
employee_dataframe = Generate_dummy_data(ALL_HEADERS, num_rows=number_of_employees)

print(employee_dataframe.head())

Starting to generate 5000 rows for 126 columns...
...Data generation complete.
  employee_id       punch_in_time             punch_out_time  \
0       E4286 2023-04-05 10:06:00 2023-04-05 17:59:07.404291   
1       E4188 2023-04-12 10:46:00 2023-04-12 19:29:39.212266   
2       E3228 2023-09-05 09:01:00 2023-09-05 18:50:17.691227   
3       E4882 2023-07-08 10:22:00 2023-07-08 18:42:31.315543   
4       E1931 2023-01-10 09:19:00 2023-01-10 18:01:48.571804   

   total_logged_hours_day  late_arrival_flag  early_departure_flag  \
0                    7.63               True                 False   
1                    7.87              False                  True   
2                    9.43              False                  True   
3                    7.56              False                  True   
4                    8.39              False                  True   

   break_duration_minutes  biometric_match_score punch_location  \
0                   15.15                 0.1501

In [None]:
# Persist the generated employee data as CSV in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
output_csv_path = "Generated_employee_data_5000_3.csv"
employee_dataframe.to_csv(output_csv_path, index=False)