<a href="https://colab.research.google.com/github/kjan318/Data-INSIGHTS-Lab/blob/main/People_Analytics_dashboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import gspread
import pandas as pd
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials
from faker import Faker
from datetime import date, timedelta
import random

# --- Data Generation Function (from the original project) ---
# Module-level Faker instance; generate_employee_data() draws its random dates from it.
fake = Faker()
# The lists below are value pools sampled with random.choice() for each synthetic record.
DEPARTMENTS = ['Engineering', 'Sales', 'HR', 'Marketing', 'Finance', 'Operations']
LOCATIONS = ['New York', 'London', 'Tokyo', 'Sydney', 'Berlin']
JOB_LEVELS = ['Entry-Level', 'Mid-Level', 'Senior', 'Manager', 'Director']
HIRE_SOURCES = ['Referral', 'Careers Page', 'LinkedIn', 'Agency', 'University']
QUALIFICATION_STATUS = ['Qualified', 'Not Qualified']
OFFER_STATUS = ['Accepted', 'Rejected']
# Only sampled for records that also receive a termination date.
TERMINATION_REASONS = ['Voluntary', 'Involuntary', 'Retirement']

def generate_employee_data(start_date, end_date):
    """
    Build a DataFrame of synthetic employee records hired between two dates.

    The row count is a random per-month volume (20-50) multiplied by the number
    of calendar months spanned by the range, so longer histories yield more rows.

    Args:
        start_date: Earliest possible hire date (a date-like object exposing
            ``.year`` and ``.month``, e.g. ``datetime.date``).
        end_date: Latest possible hire date (same kind of object).

    Returns:
        pandas.DataFrame with one row per employee. Date columns hold
        'YYYY-MM-DD' strings; Termination_Date, Termination_Reason and
        First_Year_Performance_Rating are None when they do not apply.
    """
    data = []
    today = date.today()

    # Generate a larger number of employees for historical data
    num_employees_per_month = random.randint(20, 50)
    total_months = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month
    if start_date <= end_date:
        # A range that starts and ends in the same calendar month has a month
        # difference of 0; count it as one month so the result is not silently empty.
        total_months = max(1, total_months)
    num_employees = num_employees_per_month * total_months

    for i in range(num_employees):
        hire_date = fake.date_between(start_date=start_date, end_date=end_date)

        termination_date = None
        # About 15% of records get a termination attempt (a flat per-record chance,
        # not an annualised rate). Hires dated today or later are skipped: they
        # cannot have left yet, and sampling between hire_date and today would be
        # asked for an empty or inverted range.
        if hire_date < today and random.random() < 0.15:
            term_date = fake.date_between(start_date=hire_date, end_date=today)
            # A draw that lands on the hire date itself is treated as "still employed".
            termination_date = term_date if term_date > hire_date else None

        # Recruiting milestones are back-dated from the hire date.
        requisition_approval_date = hire_date - timedelta(days=random.randint(30, 90))
        offer_acceptance_date = hire_date - timedelta(days=random.randint(1, 14))

        record = {
            'Employee_ID': 10000 + i,
            'Department': random.choice(DEPARTMENTS),
            'Location': random.choice(LOCATIONS),
            'Job_Level': random.choice(JOB_LEVELS),
            'Hire_Date': hire_date.strftime('%Y-%m-%d'),
            'Termination_Date': termination_date.strftime('%Y-%m-%d') if termination_date else None,
            'Termination_Reason': random.choice(TERMINATION_REASONS) if termination_date else None,
            'Application_Source_Channel': random.choice(HIRE_SOURCES),
            'Requisition_Approval_Date': requisition_approval_date.strftime('%Y-%m-%d'),
            'Offer_Acceptance_Date': offer_acceptance_date.strftime('%Y-%m-%d'),
            'Qualification_Status': random.choice(QUALIFICATION_STATUS),
            'Offer_Status': random.choice(OFFER_STATUS),
            # A first-year rating only exists once more than 365 days have passed since hire.
            'First_Year_Performance_Rating': round(random.uniform(2.5, 5.0), 1) if (today - hire_date).days > 365 else None,
            'Hiring_Manager_Satisfaction_Score': random.randint(1, 5),
            'Salary': random.randint(50000, 150000)
        }
        data.append(record)
    return pd.DataFrame(data)

# --- Main Script to Authenticate and Upload ---
def main():
    """
    Seed the 'PeopleAnalyticsData' worksheet with freshly generated history.

    Steps, in order: authorize a gspread client from ``credentials.json``,
    prompt for the spreadsheet key, clear the target worksheet (or create it
    when it does not exist), generate records for the trailing 720 days and
    upload them.

    Returns None. Prints an explanation and returns early when the credentials
    file is missing or the spreadsheet cannot be found.
    """
    print("--- Initial Data Generation for People Analytics Dashboard ---")

    # Scopes requested for the service-account credentials
    api_scopes = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive.file",
        "https://www.googleapis.com/auth/drive",
    ]
    try:
        credentials = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", api_scopes)
        gs_client = gspread.authorize(credentials)
    except FileNotFoundError:
        print("\nERROR: `credentials.json` not found in this directory.")
        print("Please follow the README to get your credentials file and place it here.")
        return

    # Ask the user which spreadsheet to populate
    workbook_key = input("Please enter your Google Sheet key: ")
    tab_name = "PeopleAnalyticsData"

    try:
        workbook = gs_client.open_by_key(workbook_key)
        try:
            worksheet = workbook.worksheet(tab_name)
            print(f"Found existing worksheet '{tab_name}'. It will be cleared and overwritten.")
            worksheet.clear()
        except gspread.WorksheetNotFound:
            # Start from a minimal 1x1 tab when the worksheet is missing
            worksheet = workbook.add_worksheet(title=tab_name, rows="1", cols="1")
            print(f"Created new worksheet '{tab_name}'.")
    except gspread.exceptions.SpreadsheetNotFound:
        print("\nERROR: Spreadsheet not found. Check your key and sharing settings.")
        print("Did you share the sheet with the client_email from your credentials file?")
        return

    # Build the history window: 24 blocks of 30 days ending today
    print("\nGenerating historical data for the past 24 months...")
    window_end = date.today()
    window_start = window_end - timedelta(days=24 * 30)
    history = generate_employee_data(start_date=window_start, end_date=window_end)
    print(f"Generated {len(history)} employee records.")

    # Push the generated frame into the worksheet
    print(f"Uploading data to '{tab_name}'...")
    set_with_dataframe(worksheet, history)
    print("\n✅ Success! Your Google Sheet is now populated with initial data.")

# Entry point: call main() when this code runs as the top-level module.
# Executing this notebook cell also triggers it, as the captured output below shows.
if __name__ == "__main__":
    main()

--- Initial Data Generation for People Analytics Dashboard ---
Please enter your Google Sheet key: 1f67Xxd_eX8s8GoSf1KmHdUQ676seKv0G3Tc_rtEvbN4
Found existing worksheet 'PeopleAnalyticsData'. It will be cleared and overwritten.

Generating historical data for the past 24 months...
Generated 483 employee records.
Uploading data to 'PeopleAnalyticsData'...

✅ Success! Your Google Sheet is now populated with initial data.


In [15]:
%%writefile google_sheets_handler.py
import gspread
import pandas as pd
import os  # <-- Import the 'os' library
from oauth2client.service_account import ServiceAccountCredentials

# --- Google Sheets Authentication for Colab ---
# Scopes requested for the service-account credentials.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/drive",
]

# Authenticate using the uploaded credentials.json
# NOTE: these two statements run at import time, so `credentials.json` is read
# from the current working directory as soon as this module is imported.
creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
# Shared, module-level gspread client used by get_sheet().
client = gspread.authorize(creds)

def get_sheet(sheet_name="PeopleAnalyticsData"):
    """
    Return the named worksheet of the configured spreadsheet, creating it if absent.

    The spreadsheet is identified by the GOOGLE_SHEET_KEY environment variable.

    Args:
        sheet_name: Title of the worksheet (tab) to open.

    Returns:
        The gspread worksheet object for ``sheet_name``.

    Raises:
        ValueError: If GOOGLE_SHEET_KEY is unset or empty.
    """
    key = os.environ.get('GOOGLE_SHEET_KEY')
    if not key:
        raise ValueError("GOOGLE_SHEET_KEY environment variable not set!")

    workbook = client.open_by_key(key)
    try:
        return workbook.worksheet(sheet_name)
    except gspread.WorksheetNotFound:
        # Missing tab: create a minimal 1x1 worksheet under the requested title
        return workbook.add_worksheet(title=sheet_name, rows="1", cols="1")

def read_data_from_sheet(sheet):
    """
    Load every record of a worksheet into a DataFrame.

    Columns whose name contains "date" (case-insensitive) are converted with
    ``pd.to_datetime``; values that cannot be parsed become NaT.

    Args:
        sheet: Worksheet-like object exposing ``get_all_records()``.

    Returns:
        pandas.DataFrame built from the worksheet records.
    """
    frame = pd.DataFrame(sheet.get_all_records())
    date_columns = [name for name in frame.columns if 'date' in name.lower()]
    for name in date_columns:
        frame[name] = pd.to_datetime(frame[name], errors='coerce')
    return frame

def append_data_to_sheet(sheet, df_to_append):
    """
    Append the rows of a DataFrame to a worksheet, aligned to its header row.

    Columns are matched by name against the first row of the sheet: header
    columns missing from the frame are written as empty cells, and frame columns
    missing from the header are not written. When the sheet has no header yet
    (blank worksheet), the frame's own column names are written as the header
    row first, so the appended data stays readable by column name.

    The caller's DataFrame is left unmodified.

    Args:
        sheet: Worksheet-like object exposing ``row_values(1)`` and
            ``append_rows(rows, value_input_option=...)``.
        df_to_append: pandas.DataFrame holding the rows to add.

    Returns:
        None.
    """
    header = sheet.row_values(1)
    new_rows = []

    if not header:
        # Blank worksheet: without this the column selection below would be
        # empty and neither a header nor any data would be written.
        header = list(df_to_append.columns)
        if not header:
            return
        new_rows.append(header)

    # reindex returns a new frame (no in-place column insertion on the caller's
    # object) and fills header columns absent from the frame with missing values.
    aligned = df_to_append.reindex(columns=header)

    # Missing values (None/NaN/NaT) are sent as None so the payload is valid
    # JSON and the corresponding cells stay empty.
    new_rows.extend(
        [None if pd.isna(value) else value for value in row]
        for row in aligned.astype(object).values.tolist()
    )

    if new_rows:
        sheet.append_rows(new_rows, value_input_option='USER_ENTERED')

Overwriting google_sheets_handler.py


In [3]:
%%writefile data_generator.py
import pandas as pd
from faker import Faker
import random
from datetime import date, timedelta

# Module-level Faker instance; generate_employee_data() draws its random dates from it.
fake = Faker()

# --- Configuration for Data Generation ---
# The lists below are value pools sampled with random.choice() for each synthetic record.
DEPARTMENTS = ['Engineering', 'Sales', 'HR', 'Marketing', 'Finance', 'Operations']
LOCATIONS = ['New York', 'London', 'Tokyo', 'Sydney', 'Berlin']
JOB_LEVELS = ['Entry-Level', 'Mid-Level', 'Senior', 'Manager', 'Director']
HIRE_SOURCES = ['Referral', 'Careers Page', 'LinkedIn', 'Agency', 'University']
QUALIFICATION_STATUS = ['Qualified', 'Not Qualified']
OFFER_STATUS = ['Accepted', 'Rejected']
# Only sampled for records that also receive a termination date.
TERMINATION_REASONS = ['Voluntary', 'Involuntary', 'Retirement']

# --- Main Data Generation Function ---
def generate_employee_data(start_date, end_date, existing_ids=None):
    """
    Generate a DataFrame of synthetic employee records for a given date range.

    Between 50 and 150 records are produced per call, each with a hire date
    drawn from ``[start_date, end_date]``.

    Args:
        start_date: Earliest possible hire date.
        end_date: Latest possible hire date.
        existing_ids: Optional iterable of Employee_ID values already in use
            (for example the IDs currently stored in the sheet). Generated IDs
            never repeat any of these. Defaults to None (no reserved IDs).

    Returns:
        pandas.DataFrame with one row per employee. Date columns hold
        'YYYY-MM-DD' strings; Termination_Date, Termination_Reason and
        First_Year_Performance_Rating are None when they do not apply.
        Employee_ID values are unique within the returned frame.
    """
    data = []
    num_employees = random.randint(50, 150) # Generate a variable number of employees per period
    today = date.today()

    # IDs are drawn at random, so two records could otherwise receive the same
    # value; track what has been handed out and redraw on a clash.
    taken_ids = set(existing_ids) if existing_ids is not None else set()

    for i in range(num_employees):
        hire_date = fake.date_between(start_date=start_date, end_date=end_date)
        department = random.choice(DEPARTMENTS)
        location = random.choice(LOCATIONS)
        job_level = random.choice(JOB_LEVELS)

        termination_date = None
        if random.random() < 0.1: # 10% chance of turnover
            # Candidate leave date within the first year; kept only if it is not in the future.
            term_date = fake.date_between(start_date=hire_date, end_date=hire_date + timedelta(days=365))
            termination_date = term_date if term_date <= today else None

        # Recruiting milestones are back-dated from the hire date.
        requisition_approval_date = hire_date - timedelta(days=random.randint(30, 90))
        offer_acceptance_date = hire_date - timedelta(days=random.randint(1, 14))

        employee_id = 1000 + i + random.randint(1, 99999)
        while employee_id in taken_ids:
            # The ID space (~100k values) is far larger than a batch, so a
            # redraw finds a free value almost immediately.
            employee_id = 1000 + i + random.randint(1, 99999)
        taken_ids.add(employee_id)

        record = {
            'Employee_ID': employee_id,
            'Department': department,
            'Location': location,
            'Job_Level': job_level,
            'Hire_Date': hire_date.strftime('%Y-%m-%d'),
            'Termination_Date': termination_date.strftime('%Y-%m-%d') if termination_date else None,
            'Termination_Reason': random.choice(TERMINATION_REASONS) if termination_date else None,
            'Application_Source_Channel': random.choice(HIRE_SOURCES),
            'Requisition_Approval_Date': requisition_approval_date.strftime('%Y-%m-%d'),
            'Offer_Acceptance_Date': offer_acceptance_date.strftime('%Y-%m-%d'),
            'Qualification_Status': random.choice(QUALIFICATION_STATUS),
            'Offer_Status': random.choice(OFFER_STATUS),
            # A first-year rating only exists once more than 365 days have passed since hire.
            'First_Year_Performance_Rating': round(random.uniform(2.5, 5.0), 1) if (today - hire_date).days > 365 else None,
            'Hiring_Manager_Satisfaction_Score': random.randint(1, 5),
            'Salary': random.randint(50000, 150000)
        }
        data.append(record)

    return pd.DataFrame(data)

Writing data_generator.py


In [4]:
%%writefile app.py
# This is the same app.py file from the original response.
# No changes are needed here.
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import date, timedelta
from google_sheets_handler import get_sheet, read_data_from_sheet, append_data_to_sheet
from data_generator import generate_employee_data

# --- Page Configuration ---
# Sets the page title and icon and selects the "wide" layout for the dashboard.
st.set_page_config(
    page_title="People Analytics Dashboard",
    page_icon="📊",
    layout="wide",
)

# --- Data Loading and Caching ---
# @st.cache_data decorator removed for Colab compatibility with gspread objects
def load_data():
    """
    Read the dashboard data from the Google Sheet, seeding it when empty.

    When the worksheet yields no rows, a warning is shown, records covering the
    last 365 days are generated and appended, and the sheet is read again.

    Returns:
        pandas.DataFrame with the worksheet contents.
    """
    sheet = get_sheet()
    current = read_data_from_sheet(sheet)
    if not current.empty:
        return current

    st.warning("Data sheet is empty. Generating initial data for the last 12 months.")
    today = date.today()
    one_year_ago = today - timedelta(days=365)
    seed_rows = generate_employee_data(start_date=one_year_ago, end_date=today)
    append_data_to_sheet(sheet, seed_rows)
    # Read back what was just written so the caller sees the sheet's view of it
    return read_data_from_sheet(sheet)

df = load_data()

# --- Sidebar Filters ---
st.sidebar.header("📊 People Analytics Dashboard")
st.sidebar.markdown("Filter your data to get specific insights.")

# Hire_Date must be datetime before its bounds can seed the date picker
df["Hire_Date"] = pd.to_datetime(df["Hire_Date"])

hire_dates = df["Hire_Date"]
min_date = hire_dates.min().date()
max_date = hire_dates.max().date()

# Date picker defaults to the full span of the data and cannot leave it
date_range = st.sidebar.date_input(
    "Select Hire Date Range",
    value=(min_date, max_date),
    min_value=min_date,
    max_value=max_date,
)

# Every department is offered and pre-selected
department_options = sorted(df["Department"].unique())
selected_departments = st.sidebar.multiselect(
    "Select Departments",
    options=department_options,
    default=department_options,
)

# Every location is offered and pre-selected
location_options = sorted(df["Location"].unique())
selected_locations = st.sidebar.multiselect(
    "Select Locations",
    options=location_options,
    default=location_options,
)

# --- Data Generation Sidebar ---
st.sidebar.markdown("---")
st.sidebar.header("Generate New Data")
generate_clicked = st.sidebar.button("Generate New Month's Data")
if generate_clicked:
    with st.spinner("Generating and appending new data..."):
        # New hires are dated in the 31 days following the latest hire on record
        latest_hire = df["Hire_Date"].max().date()
        window_start = latest_hire + timedelta(days=1)
        window_end = latest_hire + timedelta(days=31)
        fresh_rows = generate_employee_data(start_date=window_start, end_date=window_end)
        append_data_to_sheet(get_sheet(), fresh_rows)
        st.success("New data generated! Please refresh the page to see updates.")


# --- Filter Data based on selection ---
# The range picker can hand back fewer than two dates while the user is still
# choosing (e.g. only the start has been clicked). Unpacking it directly would
# raise, so any missing endpoint falls back to the bounds of the data.
selected_dates = tuple(date_range) if isinstance(date_range, (tuple, list)) else (date_range,)
start_date = selected_dates[0] if len(selected_dates) > 0 else min_date
end_date = selected_dates[1] if len(selected_dates) > 1 else max_date

# Keep rows hired inside the chosen window that match the department/location picks
filtered_df = df[
    (df["Hire_Date"].dt.date >= start_date) &
    (df["Hire_Date"].dt.date <= end_date) &
    (df["Department"].isin(selected_departments)) &
    (df["Location"].isin(selected_locations))
]

# --- Main Dashboard Display ---
st.title("📈 HR KPI Dashboard")
st.markdown("This dashboard provides an overview of key human resources metrics.")

if filtered_df.empty:
    st.warning("No data available for the selected filters.")
else:
    # Work on an independent copy: the column assignments below would otherwise
    # be made on a filtered selection of `df`, which pandas flags with
    # SettingWithCopyWarning.
    filtered_df = filtered_df.copy()

    # --- Key Metrics ---
    total_hires = len(filtered_df)

    # Ensure Termination_Date is also datetime
    filtered_df['Termination_Date'] = pd.to_datetime(filtered_df['Termination_Date'])
    # Count hires whose termination came within 90 days of their hire date
    turnover_90_day = filtered_df[
        (filtered_df['Termination_Date'].notna()) &
        ((filtered_df['Termination_Date'] - filtered_df['Hire_Date']).dt.days <= 90)
    ].shape[0]
    turnover_rate = (turnover_90_day / total_hires * 100) if total_hires > 0 else 0
    # Share of rows whose Offer_Status is 'Accepted' (frame is non-empty in this branch)
    offer_acceptance_rate = (filtered_df[filtered_df['Offer_Status'] == 'Accepted'].shape[0] / filtered_df.shape[0] * 100)

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric(label="Total Hires", value=f"{total_hires}")
    with col2:
        st.metric(label="90-Day Turnover Rate", value=f"{turnover_rate:.2f}%")
    with col3:
        st.metric(label="Offer Acceptance Rate", value=f"{offer_acceptance_rate:.2f}%")

    st.markdown("---")

    # --- Visualizations ---
    with st.container():
        st.header("KPI Deep Dive")

        with st.expander("Attract & Hire"):
            col1, col2 = st.columns(2)
            with col1:
                # Time to fill = days from requisition approval to offer acceptance,
                # averaged per month of hire
                filtered_df['Requisition_Approval_Date'] = pd.to_datetime(filtered_df['Requisition_Approval_Date'])
                filtered_df['Offer_Acceptance_Date'] = pd.to_datetime(filtered_df['Offer_Acceptance_Date'])
                filtered_df['Time_to_Fill'] = (filtered_df['Offer_Acceptance_Date'] - filtered_df['Requisition_Approval_Date']).dt.days
                # NOTE(review): recent pandas releases deprecate the 'M' alias in favour
                # of 'ME' — confirm against the pandas version this app is deployed with.
                avg_time_to_fill = filtered_df.groupby(pd.Grouper(key='Hire_Date', freq='M'))['Time_to_Fill'].mean().reset_index()

                fig = px.line(avg_time_to_fill, x='Hire_Date', y='Time_to_Fill', title='Average Time to Fill (Days)', markers=True)
                st.plotly_chart(fig, use_container_width=True)
                st.markdown("_A line chart shows the trend in hiring efficiency over time._")

            with col2:
                # Number of hires per recruitment channel
                source_counts = filtered_df['Application_Source_Channel'].value_counts().reset_index()
                source_counts.columns = ['Source', 'Count']

                fig = px.pie(source_counts, names='Source', values='Count', title='Source of Hire Effectiveness')
                st.plotly_chart(fig, use_container_width=True)
                st.markdown("_A pie chart is used to show the proportion of hires from each recruitment channel._")

        with st.expander("Talent & Leadership"):
            # Only rows with a termination date count as turnover
            turnover_df = filtered_df[filtered_df['Termination_Date'].notna()]
            if not turnover_df.empty:
                turnover_by_dept = turnover_df.groupby('Department').size().reset_index(name='Turnover Count')
                fig = px.bar(turnover_by_dept, x='Department', y='Turnover Count', title='Turnover by Department')
                st.plotly_chart(fig, use_container_width=True)
                st.markdown("_A bar chart helps compare turnover volumes across different departments._")
            else:
                st.info("No turnover data to display for the selected period.")

Writing app.py


In [5]:
!pip install streamlit pandas plotly gspread gspread-dataframe oauth2client Faker pyngrok -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25h

streamlit
pandas
plotly
gspread
gspread-dataframe
oauth2client
Faker

In [13]:
from pyngrok import ngrok

# Terminate any existing tunnels
ngrok.kill()

# Set up the ngrok authentication token
NGROK_AUTH_TOKEN = "31fq6Ze7AbyVhGp5cFN3WM5HSPY_5dwBEa7toz8xUPZ9S5aPe"  # <--- PASTE YOUR TOKEN HERE
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open a tunnel to the streamlit port 8501
public_url = ngrok.connect(8501)
print(f"Your Streamlit app is live at: {public_url}")

# Run the streamlit app in the background
!streamlit run app.py --server.port 8501 --server.headless true

Your Streamlit app is live at: NgrokTunnel: "https://2fa284f0dbeb.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.185.72.133:8501[0m
[0m
[31m──[0m[31m────────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────────[0m[31m──[0m
[31m [0m [2;33m/usr/local/lib/python3.12/dist-packages/streamlit/runtime/scriptrunner/[0m[1;33mexec_code.py[0m: [31m [0m
[31m [0m [94m128[0m in [92mexec_func_with_error_handling[0m                                                 [31m [0m
[31m [0m                                                                                      [31m [0m
[31m [0m [2;33m/usr/l

In [14]:
# Mount the user's Google Drive at /content/drive (Colab only).
# NOTE(review): nothing else in the visible notebook reads from this path — confirm the cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
