# In [None]:
class ConsultancyEmployee:
 def __init__(self,faker,reference_date=datetime(2024,1,1)):
  self.fake=faker
  self.reference_date=reference_date
  self._init_department_config()
  self._init_location_config()
  self._init_specialization_config()
  self._init_certification_config()
  self.assigned_unique_positions=set()
 def _init_department_config(self):
  self.department_weights={'Product Architecture':0.2,'Design & UX':0.2,'Engineering Delivery':0.3,'Product Strategy':0.15,'Client Services':0.15}
  self.departments={'Product Architecture':{'entry':['Junior Solution Architect','System Analyst I','Technical Documentation Specialist'],'mid':['Solution Architect','Enterprise Architect','Integration Specialist'],'senior':['Principal Architect','Lead Solution Architect','Senior Enterprise Architect']},'Design & UX':{'entry':['UI Designer I','UX Researcher I','Design System Specialist'],'mid':['Senior UX Designer','Design Lead','Design Strategist'],'senior':['Principal Designer','Lead Design Strategist','Senior Design Lead']},'Engineering Delivery':{'entry':['Associate Developer','Junior DevOps','QA Engineer I'],'mid':['Tech Lead','Senior Developer','Cloud Architect'],'senior':['Principal Engineer','Engineering Lead','Development Director']},'Product Strategy':{'entry':['Product Analyst','Business Analyst I','Market Researcher'],'mid':['Product Strategist','Senior Product Manager','Domain Expert'],'senior':['Principal Product Manager','Lead Product Strategist','Senior Strategy Lead']},'Client Services':{'entry':['Project Coordinator','Client Success Associate','Engagement Analyst'],'mid':['Engagement Manager','Client Partner','Program Manager'],'senior':['Senior Program Director','Principal Client Partner','Senior Delivery Lead']}}
  self.executive_positions={'Product Architecture':['Chief Architect','Head of Architecture'],'Design & UX':['Head of Design','Chief Experience Officer'],'Engineering Delivery':['CTO','VP of Engineering'],'Product Strategy':['Head of Product','Chief Product Officer'],'Client Services':['Head of Client Services','Chief Customer Officer']}
  self.level_weights={'Product Architecture':{'entry':0.3,'mid':0.5,'senior':0.2},'Design & UX':{'entry':0.4,'mid':0.4,'senior':0.2},'Engineering Delivery':{'entry':0.5,'mid':0.3,'senior':0.2},'Product Strategy':{'entry':0.3,'mid':0.4,'senior':0.3},'Client Services':{'entry':0.2,'mid':0.5,'senior':0.3}}
 def _select_position(self,department,level):
  if level=='senior' and len(self.assigned_unique_positions)<len([pos for positions in self.executive_positions.values() for pos in positions]):
   available_exec_positions=set(self.executive_positions[department])-self.assigned_unique_positions
   if available_exec_positions and random.random()<0.2:
    position=random.choice(list(available_exec_positions))
    self.assigned_unique_positions.add(position)
    return position
  return random.choice(self.departments[department][level])
 def _init_location_config(self):
  self.location_data={'London':{'country':'UK','region':'EMEA','cost_multiplier':1.2,'timezone':'GMT','market_demand':0.9},'New York':{'country':'USA','region':'Americas','cost_multiplier':1.3,'timezone':'EST','market_demand':1.0},'San Francisco':{'country':'USA','region':'Americas','cost_multiplier':1.35,'timezone':'PST','market_demand':1.1},'Singapore':{'country':'Singapore','region':'APAC','cost_multiplier':1.15,'timezone':'SGT','market_demand':0.95},'Berlin':{'country':'Germany','region':'EMEA','cost_multiplier':1.1,'timezone':'CET','market_demand':0.85}}
  self.location_weights={'Product Architecture':{'London':0.3,'New York':0.2,'San Francisco':0.2,'Singapore':0.15,'Berlin':0.15},'Design & UX':{'London':0.25,'New York':0.3,'San Francisco':0.2,'Singapore':0.15,'Berlin':0.1},'Engineering Delivery':{'London':0.2,'New York':0.25,'San Francisco':0.25,'Singapore':0.15,'Berlin':0.15},'Product Strategy':{'London':0.3,'New York':0.25,'San Francisco':0.2,'Singapore':0.15,'Berlin':0.1},'Client Services':{'London':0.3,'New York':0.2,'San Francisco':0.2,'Singapore':0.2,'Berlin':0.1}}
 def _init_specialization_config(self):
  self.specializations={'Product Architecture':['Cloud Infrastructure','Microservices','System Integration','API Design','Data Architecture','Security Architecture'],'Design & UX':['Mobile Design','Web Applications','Design Systems','User Research','Service Design','Design Thinking'],'Engineering Delivery':['Full Stack','Frontend','Backend','DevOps','Cloud Native','Mobile Development'],'Product Strategy':['Digital Transformation','Innovation Strategy','Go-to-Market','Product Discovery','Market Analysis'],'Client Services':['Agile Delivery','Digital Strategy','Change Management','Portfolio Management','Strategic Planning']}
 def _init_certification_config(self):
  self.certification_types={'Product Architecture':['AWS Solutions Architect','Azure Architect','TOGAF','Google Cloud Architect'],'Design & UX':['Google UX Design','Interaction Design Foundation','Human Factors International'],'Engineering Delivery':['AWS Developer','Kubernetes','Azure Developer','Scrum Master'],'Product Strategy':['Product Management (PMP)','Agile Product Management','PSPO'],'Client Services':['Prince2','PMP','Agile Project Management','Scrum Master']}
 def _select_department(self):
  return random.choices(list(self.department_weights.keys()),weights=list(self.department_weights.values()))[0]
 def _select_level(self,department):
  weights=self.level_weights[department]
  return random.choices(list(weights.keys()),weights=list(weights.values()))[0]
 def _select_location(self,department):
  weights=self.location_weights[department]
  return random.choices(list(weights.keys()),weights=list(weights.values()))[0]
 def _initialize_core_attributes(self):
  is_outlier=random.random()<0.05
  self.current_department=self._select_department()
  self.current_level=self._select_level(is_outlier)
  self.hire_date=self._generate_hire_date(self.current_level,is_outlier)
  self.current_location=self._select_location(self.current_department)
 def _generate_hire_date(self,level,is_outlier):
  experience_ranges={'entry':(0,2),'mid':(2,6),'senior':(5,12)}
  if is_outlier:
   experience_ranges={'entry':(2,4),'mid':(5,8),'senior':(10,15)}
  min_years,max_years=experience_ranges[level]
  days_ago=random.randint(min_years*365,max_years*365)
  return self.reference_date-timedelta(days=days_ago)
 def _get_position(self):
  return random.choice(self.departments[self.current_department][self.current_level])
 def _generate_location_data(self):
  location_info=self.location_data[self.current_location]
  return{'city':self.current_location,'country':location_info['country'],'region':location_info['region'],'remote_work_ratio':self._calculate_remote_ratio(),'travel_percentage':self._calculate_travel_percentage()}
 def _generate_financial_metrics(self):
  base_salary=self._calculate_base_salary()
  return{'base_salary':base_salary,'billing_rate':self._calculate_billing_rate(base_salary),'utilization_target':self._calculate_utilization_target(),'actual_utilization':self._calculate_actual_utilization()}
 def _calculate_base_salary(self):
  base_ranges={'entry':(65000,85000),'mid':(95000,145000),'senior':(150000,250000)}
  dept_multipliers={'Product Architecture':1.25,'Design & UX':1.15,'Engineering Delivery':1.20,'Product Strategy':1.18,'Client Services':1.10}
  min_salary,max_salary=base_ranges[self.current_level]
  base=random.uniform(min_salary,max_salary)
  location_multiplier=self.location_data[self.current_location]['cost_multiplier']
  dept_multiplier=dept_multipliers[self.current_department]
  experience_multiplier=self._calculate_experience_multiplier()
  final_salary=base*location_multiplier*dept_multiplier*experience_multiplier
  return round(final_salary,-3)
 def _calculate_experience_multiplier(self):
  years_experience=(self.reference_date-self.hire_date).days/365
  return 1+(min(years_experience,15)*0.025)
 def _calculate_billing_rate(self,base_salary):
  annual_cost=base_salary*2.5
  utilization_target=self._calculate_utilization_target()/100
  annual_billable_hours=2080*utilization_target
  base_rate=annual_cost/annual_billable_hours
  market_multipliers={'Product Architecture':1.4,'Design & UX':1.3,'Engineering Delivery':1.35,'Product Strategy':1.45,'Client Services':1.25}
  location_demand=self.location_data[self.current_location]['market_demand']
  final_rate=base_rate*market_multipliers[self.current_department]*location_demand
  return round(final_rate/50)*50
 def _calculate_utilization_target(self):
  base_targets={'Product Architecture':70,'Design & UX':75,'Engineering Delivery':85,'Product Strategy':65,'Client Services':80}
  level_adjustments={'entry':10,'mid':0,'senior':-15}
  target=base_targets[self.current_department]+level_adjustments[self.current_level]
  target+=random.uniform(-3,3)
  return round(min(max(target,50),90))
 def _calculate_actual_utilization(self):
  target=self._calculate_utilization_target()
  variation=random.uniform(-15,5)
  return round(min(max(target+variation,50),95),1)
 def _generate_expertise_data(self):
  return{'primary_specialization':self._select_specialization(),'secondary_specialization':self._select_specialization(),'industry_expertise':self._select_industry_expertise(),'certifications':self._assign_certifications()}
 def _select_specialization(self):
  specializations={'Product Architecture':['Cloud Infrastructure','Microservices','System Integration','API Design','Data Architecture','Security Architecture'],'Design & UX':['Mobile Design','Web Applications','Design Systems','User Research','Service Design','Design Thinking'],'Engineering Delivery':['Full Stack','Frontend','Backend','DevOps','Cloud Native','Mobile Development'],'Product Strategy':['Digital Transformation','Innovation Strategy','Go-to-Market','Product Discovery','Market Analysis'],'Client Services':['Agile Delivery','Digital Strategy','Change Management','Portfolio Management','Strategic Planning']}
  return random.choice(specializations[self.current_department])
 def _select_industry_expertise(self):
  industries=['FinTech','HealthTech','E-commerce','Enterprise Software','Social Impact','Consumer Tech','EdTech','Industrial Tech']
  num_industries=random.randint(2,4)
  return random.sample(industries,k=num_industries)
 def _assign_certifications(self):
  available_certs=self.certification_types.get(self.current_department,[])
  cert_counts={'entry':(0,2),'mid':(1,3),'senior':(2,4)}
  min_certs,max_certs=cert_counts[self.current_level]
  num_certs=random.randint(min_certs,min(max_certs,len(available_certs)))
  return random.sample(available_certs,k=num_certs)
 def _calculate_project_complexity(self,level,is_outlier):
  complexity_weights={'entry':[0.5,0.3,0.15,0.05,0],'mid':[0.1,0.3,0.4,0.15,0.05],'senior':[0,0.1,0.3,0.4,0.2]}
  if is_outlier:
   complexity_weights={'entry':[0.2,0.3,0.3,0.15,0.05],'mid':[0,0.15,0.25,0.4,0.2],'senior':[0,0,0.2,0.4,0.4]}
  return np.random.choice(range(1,6),p=complexity_weights[level])
 def _calculate_flight_risk(self,is_outlier):
  base_risk=20
  years_in_role=(self.reference_date-self.hire_date).days/365
  engagement_score=self._generate_engagement_score(is_outlier)
  performance_score=self._generate_performance_score(self.current_level,is_outlier)
  utilization=self._calculate_actual_utilization()
  if years_in_role>4 and self.current_level!='senior':
   base_risk+=np.random.normal(15,3)
  if engagement_score<6:
   base_risk+=np.random.normal(20,4)
  elif engagement_score<7:
   base_risk+=np.random.normal(10,2)
  if performance_score>4.5:
   base_risk+=np.random.normal(12,3)
  elif performance_score<3.0:
   base_risk+=np.random.normal(15,3)
  market_demand=self.location_data[self.current_location]['market_demand']
  if market_demand>0.9:
   base_risk+=np.random.normal(8,2)
  high_demand_positions={'Solution Architect':12,'Cloud Architect':15,'Tech Lead':10,'Senior Developer':8,'Senior UX Designer':7,'Product Strategist':6}
  position_risk=high_demand_positions.get(self._get_position(),0)
  base_risk+=np.random.normal(position_risk,2) if position_risk>0 else 0
  base_risk+=np.random.normal(0,3)
  return round(min(max(base_risk,15),95))
 def _calculate_promotion_readiness(self,level,hire_date,is_outlier):
  if level=='senior':
   return round(random.uniform(0,30),1)
  years_in_role=(self.reference_date-hire_date).days/365
  performance_score=self._generate_performance_score(level,is_outlier)
  base_score=20
  experience_score=min(years_in_role*8,24)
  performance_factor=25 if performance_score>=4.5 else 15 if performance_score>=4.0 else 8 if performance_score>=3.5 else 0
  cert_count=len(self._assign_certifications())
  skill_factor=min(cert_count*3,12)
  complexity_factor=self._calculate_project_complexity(level,is_outlier)*3
  total_score=base_score+experience_score+performance_factor+skill_factor+complexity_factor+np.random.normal(0,2)
  return round(min(max(min(total_score,92),15),92),1)
 def _calculate_innovation_score(self,level,department):
    base_innovation={'Product Architecture':7.5,'Design & UX':8.0,'Engineering Delivery':7.0,'Product Strategy':8.5,'Client Services':6.5}
    level_factors={'entry':0.7,'mid':1.0,'senior':1.3}
    performance_score=self._generate_performance_score(level,False)
    performance_factor=(performance_score-3)*0.2
    project_complexity=self._calculate_project_complexity(level,False)
    complexity_factor=(project_complexity-3)*0.15
    cert_count=len(self._assign_certifications())
    certification_factor=cert_count*0.1
    innovation=(base_innovation[department]*level_factors[level]*(1+performance_factor+complexity_factor+certification_factor))
    return round(min(max(innovation+random.uniform(-0.5,0.5),1),10),1)
 def _assign_office(self,department):
    weights={'Product Architecture':[0.3,0.2,0.2,0.15,0.15],'Design & UX':[0.25,0.3,0.2,0.15,0.1],'Engineering Delivery':[0.2,0.25,0.25,0.15,0.15],'Product Strategy':[0.3,0.25,0.2,0.15,0.1],'Client Services':[0.3,0.2,0.2,0.2,0.1]}
    office_key=np.random.choice(list(self.office_locations.keys()),p=weights[department])
    return self.office_locations[office_key]
 def _calculate_remote_ratio(self):
    base_remote={'entry':20,'mid':30,'senior':40}[self.current_level]
    return round(min(max(base_remote+random.uniform(-10,10),0),100),1)
 def _calculate_travel_percentage(self):
    base_travel={'entry':10,'mid':20,'senior':30}[self.current_level]
    dept_modifier={'Product Architecture':0,'Design & UX':-5,'Engineering Delivery':-10,'Product Strategy':15,'Client Services':20}[self.current_department]
    return round(min(max(base_travel+dept_modifier+random.uniform(-5,5),0),100),1)
 def _assign_industry_expertise(self):
    num_industries=random.randint(2,min(4,len(self.industry_expertise)))
    return random.sample(self.industry_expertise,k=num_industries)
 def _generate_performance_score(self,level,is_outlier):
    base_scores={'entry':np.random.normal(3.5,0.4),'mid':np.random.normal(3.8,0.3),'senior':np.random.normal(4.1,0.2)}
    score=base_scores[level]
    if is_outlier:score=random.choice([2.0,4.8])
    return round(min(max(score,1),5),1)
 def _generate_project_metrics(self):
    is_outlier=random.random()<0.05
    base_projects={'entry':(1,2),'mid':(2,3),'senior':(2,4)}
    dept_modifiers={'Product Architecture':0,'Design & UX':-1,'Engineering Delivery':1,'Product Strategy':0,'Client Services':1}
    min_projects,max_projects=base_projects[self.current_level]
    modifier=dept_modifiers[self.current_department]
    min_projects=max(1,min_projects+modifier)
    max_projects=max(min_projects,max_projects+modifier)
    active_projects=random.randint(min_projects,max_projects)
    projects=[]
    for _ in range(active_projects):
        project={'complexity':self._calculate_project_complexity(self.current_level,is_outlier),'status':random.choice(['Planning','In Progress','Final Stage']),'duration_months':random.randint(3,12),'team_size':random.randint(3,10)}
        projects.append(project)
    avg_complexity=round(sum(p['complexity']for p in projects)/len(projects),1)
    avg_duration=round(sum(p['duration_months']for p in projects)/len(projects),1)
    avg_team_size=round(sum(p['team_size']for p in projects)/len(projects),1)
    return{'active_projects':active_projects,'avg_project_complexity':avg_complexity,'avg_project_duration':avg_duration,'avg_team_size':avg_team_size,'projects_on_time':self._calculate_projects_on_time(active_projects,is_outlier),'project_satisfaction':self._generate_client_satisfaction(self.current_level,self.current_department,projects,self._generate_performance_score(self.current_level,is_outlier))}
 def _calculate_projects_on_time(self,active_projects,is_outlier):
    if is_outlier:return random.choice([100,60])
    base_on_time={'entry':80,'mid':85,'senior':90}
    dept_modifiers={'Product Architecture':-5,'Design & UX':0,'Engineering Delivery':-3,'Product Strategy':2,'Client Services':5}
    on_time_pct=base_on_time[self.current_level]+dept_modifiers[self.current_department]
    on_time_pct+=random.uniform(-5,5)
    return round(min(max(on_time_pct,60),100),1)
 def _generate_development_metrics(self):
    is_outlier=random.random()<0.05
    return{'training_hours':self._calculate_training_hours(self.current_level,is_outlier),'mentorship_hours':self._calculate_mentorship_hours(self.current_level),'knowledge_sharing_score':self._calculate_knowledge_sharing(self.current_level,is_outlier),'promotion_readiness':self._calculate_promotion_readiness(self.current_level,self.hire_date,is_outlier)}
 def _generate_risk_metrics(self):
    is_outlier=random.random()<0.05
    engagement_score=self._generate_engagement_score(is_outlier)
    flight_risk=self._calculate_flight_risk(is_outlier)
    return{'engagement_score':engagement_score,'flight_risk':flight_risk,'retention_risk':'High'if flight_risk>70 else'Medium'if flight_risk>40 else'Low'}
 def _generate_engagement_score(self,is_outlier):
    if is_outlier:return random.choice([4.0,9.5])
    base_engagement={'entry':np.random.normal(7.5,0.8),'mid':np.random.normal(7.8,0.7),'senior':np.random.normal(8.2,0.6)}
    score=base_engagement[self.current_level]
    dept_adjustments={'Product Architecture':0.2,'Design & UX':0.3,'Engineering Delivery':0.1,'Product Strategy':0.4,'Client Services':0.2}
    score+=dept_adjustments[self.current_department]
    return round(min(max(score,1),10),1)
 def _generate_performance_metrics(self,is_outlier):
    if is_outlier:
        performance_score=round(random.uniform(4.5,5.0),2)
        innovation_score=round(random.uniform(90,100),1)
        delivery_quality=round(random.uniform(95,100),1)
    else:
        performance_score=round(random.normalvariate(3.7,0.4),2)
        performance_score=max(min(performance_score,4.5),2.5)
        innovation_score=round(random.normalvariate(75,8),1)
        innovation_score=max(min(innovation_score,90),50)
        delivery_quality=round(random.normalvariate(82,7),1)
        delivery_quality=max(min(delivery_quality,95),60)
    return{'performance_score':performance_score,'innovation_score':innovation_score,'delivery_quality':delivery_quality}
 def _generate_project_data(self):
    return{'complexity':random.randint(1,5),'status':random.choice(['Planning','In Progress','Final Stage'])}
 def _generate_client_satisfaction(self,level,department,projects,performance_score):
    if not projects:return None
    dept_base_satisfaction={'Product Architecture':4.2,'Design & UX':4.0,'Engineering Delivery':4.1,'Product Strategy':4.0,'Client Services':4.3}
    total_weight=0;weighted_satisfaction=0
    for project in projects:
        project_weight=1.0;base=dept_base_satisfaction[department]
        complexity=project['complexity']
        complexity_factor={1:0.1,2:0.05,3:0,4:-0.05,5:-0.1}
        status_factor={'Planning':0.05,'In Progress':0,'Final Stage':-0.05}
        project_satisfaction=base
        project_satisfaction+=complexity_factor[complexity]
        project_satisfaction+=status_factor[project['status']]
        performance_impact=(performance_score-3.5)*0.2
        project_satisfaction+=performance_impact
        level_factor={'entry':-0.1,'mid':0,'senior':0.1}
        project_satisfaction+=level_factor[level]
        project_satisfaction+=random.uniform(-0.1,0.1)
        weighted_satisfaction+=project_satisfaction*project_weight
        total_weight+=project_weight
    final_satisfaction=round(weighted_satisfaction/total_weight,1)
    return min(max(final_satisfaction,1.0),5.0)
 def _generate_internal_contribution(self,level):
    base_contribution={'entry':(1,3),'mid':(3,5),'senior':(4,7)}
    min_contrib,max_contrib=base_contribution[level]
    return random.randint(min_contrib,max_contrib)
 def _generate_delivery_quality(self,level,is_outlier):
    base_quality={'entry':np.random.normal(85,5),'mid':np.random.normal(90,4),'senior':np.random.normal(93,3)}
    quality=base_quality[level]
    if is_outlier:quality=random.choice([75,98])
    return round(min(max(quality,70),100),1)
 def _calculate_training_hours(self,level,is_outlier):
    base_hours={'entry':(40,60),'mid':(30,50),'senior':(20,40)}
    if is_outlier:return random.choice([10,100])
    min_hours,max_hours=base_hours[level]
    return random.randint(min_hours,max_hours)
 def _calculate_mentorship_hours(self,level):
    mentorship_ranges={'entry':(-30,-10),'mid':(-10,20),'senior':(20,40)}
    min_hours,max_hours=mentorship_ranges[level]
    return random.randint(min_hours,max_hours)
 def _calculate_knowledge_sharing(self,level,is_outlier):
    base_sharing={'entry':(3,6),'mid':(5,8),'senior':(7,9)}
    if is_outlier:return round(random.uniform(8.5,9.5),1)if random.random()<0.8 else round(random.uniform(2.5,3.5),1)
    min_share,max_share=base_sharing[level]
    if level=='entry':score=np.random.beta(4,3)*(max_share-min_share)+min_share
    elif level=='mid':score=np.random.beta(5,3)*(max_share-min_share)+min_share
    else:score=np.random.beta(6,2)*(max_share-min_share)+min_share
    return round(score,1)
 def generate_employee(self,department=None,level=None,is_outlier=False):
    self.current_department=department or self._select_department()
    self.current_level=level or self._select_level(self.current_department)
    self.hire_date=self._generate_hire_date(self.current_level,is_outlier)
    self.current_location=self._select_location(self.current_department)
    return{'employee_id':self.fake.unique.random_number(digits=6),'full_name':self.fake.name(),'department':self.current_department,'position':self._select_position(self.current_department,self.current_level),'level':self.current_level,'hire_date':self.hire_date,**self._generate_location_data(),**self._generate_financial_metrics(),**self._generate_expertise_data(),**self._generate_performance_metrics(is_outlier),**self._generate_project_metrics(),**self._generate_development_metrics(),**self._generate_risk_metrics(),'manager_id':None,'total_comp':None,'team_lead_projects':0,'direct_reports':0}
class ConsultancyDataGenerator:
    def __init__(self,num_records,seed=42):
        """Seed the random sources and derive management limits from the dataset size.

        Args:
            num_records: Number of employee records to generate.
            seed: Seed applied to Faker, numpy and the ``random`` module.
        """
        self.fake = Faker()
        Faker.seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        self.num_records = num_records
        self.employee_generator = ConsultancyEmployee(self.fake)
        self.departments = list(self.employee_generator.departments.keys())
        self.levels = ['entry', 'mid', 'senior']
        self.department_weights = self.employee_generator.department_weights
        self.level_weights = self.employee_generator.level_weights

        # Size tiers: large (>= 1000 records), medium (>= 100), small (otherwise).
        if num_records >= 1000:
            self.senior_ratio = 0.03
            self.mid_ratio = 0.10
            senior_span, mid_span = 10, 8
        elif num_records >= 100:
            self.senior_ratio = 0.05
            self.mid_ratio = 0.12
            senior_span, mid_span = 8, 6
        else:
            self.senior_ratio = 0.05
            self.mid_ratio = 0.08
            senior_span, mid_span = 6, 4

        self.max_senior_managers = max(1, int(num_records * self.senior_ratio))
        self.max_mid_managers = max(2, int(num_records * self.mid_ratio))
        self.max_direct_reports_senior = senior_span
        self.max_direct_reports_mid = mid_span
    def _select_department(self):
        weights=list(self.department_weights.values())
        if not all(w>0 for w in weights) or abs(sum(weights)-1)>1e-9:
            total=sum(weights)
            weights=[w/total if w>0 else 0.01 for w in weights]
            weights=[w/sum(weights) for w in weights]
        return random.choices(list(self.department_weights.keys()),weights=weights)[0]
    def _select_level(self,department):
        weights=list(self.level_weights[department].values())
        if not all(w>0 for w in weights) or abs(sum(weights)-1)>1e-9:
            total=sum(weights)
            weights=[w/total if w>0 else 0.01 for w in weights]
            weights=[w/sum(weights) for w in weights]
        return random.choices(list(self.level_weights[department].keys()),weights=weights)[0]
    def _assign_managers(self,df):
        """Pick managers in each department and link other employees to them.

        Adds ``is_manager``, ``management_level`` and ``span_of_control``
        columns, resets ``direct_reports`` to 0 and then counts assignments
        into it, and writes ``manager_id`` for employees that get a manager.
        ``df`` is modified in place and also returned.

        Per department:
        - Senior manager candidates are senior employees whose position
          contains Director/Head/Chief/Lead. Mid manager candidates are mid
          employees whose position contains Lead/Senior/Manager. Candidates
          are ranked by ``performance_score``, highest first.
        - The department is skipped entirely when either manager tier would
          be empty.
        - Mid-level non-managers report to a random senior manager with spare
          capacity. Entry-level employees report to a random mid manager with
          spare capacity, else to a senior manager with spare capacity.
        - Mid managers themselves are not given a ``manager_id`` here.
        """
        if df.empty:return df
        df['is_manager']=False
        df['management_level']=None
        # Reset before counting reports assigned below.
        df['direct_reports']=0
        for dept in self.departments:
            dept_mask=df['department']==dept
            if not any(dept_mask):continue
            dept_indices=df[dept_mask].index
            # Manager candidates are identified by title keywords (case-insensitive).
            senior_mask=(df['level']=='senior')&dept_mask&df['position'].str.contains('Director|Head|Chief|Lead',case=False,na=False)
            mid_mask=(df['level']=='mid')&dept_mask&df['position'].str.contains('Lead|Senior|Manager',case=False,na=False)
            dept_size=len(dept_indices)
            if dept_size==0:continue
            # Target manager counts scale with department size (at least 1 senior, 2 mid).
            dept_senior_managers=max(1,int(dept_size*self.senior_ratio))
            dept_mid_managers=max(2,int(dept_size*self.mid_ratio))
            senior_candidates=df[senior_mask].sort_values('performance_score',ascending=False)
            mid_candidates=df[mid_mask].sort_values('performance_score',ascending=False)
            # Cap by the target, the candidates available, and an equal share of the global limit.
            max_senior=min(dept_senior_managers,len(senior_candidates),int(self.max_senior_managers/len(self.departments)))
            max_mid=min(dept_mid_managers,len(mid_candidates),int(self.max_mid_managers/len(self.departments)))
            # NOTE(review): the equal-share term is 0 when the global limit is
            # below the department count, which skips every department - confirm
            # that this is intended for small datasets.
            if max_senior==0 or max_mid==0:continue
            senior_manager_indices=senior_candidates.head(max_senior).index
            mid_manager_indices=mid_candidates.head(max_mid).index
            df.loc[senior_manager_indices,'is_manager']=True
            df.loc[senior_manager_indices,'management_level']='senior'
            df.loc[mid_manager_indices,'is_manager']=True
            df.loc[mid_manager_indices,'management_level']='mid'
            # Mid-level employees who are not managers report to senior managers.
            mid_level_mask=(df['level']=='mid')&dept_mask&~df.index.isin(mid_manager_indices)
            mid_level_indices=df[mid_level_mask].index
            for idx in mid_level_indices:
                # Re-read the managers each iteration so updated report counts are seen.
                valid_managers=df.loc[senior_manager_indices]
                if not valid_managers.empty:
                    valid_managers=valid_managers[valid_managers['direct_reports']<self.max_direct_reports_senior]
                    if not valid_managers.empty:
                        manager=valid_managers.sample(n=1)
                        df.loc[idx,'manager_id']=manager.iloc[0]['employee_id']
                        df.loc[manager.index[0],'direct_reports']+=1
            # Entry-level employees prefer a mid manager and fall back to a senior one.
            entry_mask=(df['level']=='entry')&dept_mask
            entry_indices=df[entry_mask].index
            for idx in entry_indices:
                valid_mid_managers=df.loc[mid_manager_indices]
                valid_mid_managers=valid_mid_managers[valid_mid_managers['direct_reports']<self.max_direct_reports_mid]
                if not valid_mid_managers.empty:
                    manager=valid_mid_managers.sample(n=1)
                    df.loc[idx,'manager_id']=manager.iloc[0]['employee_id']
                    df.loc[manager.index[0],'direct_reports']+=1
                else:
                    valid_senior_managers=df.loc[senior_manager_indices]
                    valid_senior_managers=valid_senior_managers[valid_senior_managers['direct_reports']<self.max_direct_reports_senior]
                    if not valid_senior_managers.empty:
                        manager=valid_senior_managers.sample(n=1)
                        df.loc[idx,'manager_id']=manager.iloc[0]['employee_id']
                        df.loc[manager.index[0],'direct_reports']+=1
        # Bucket the final report counts into a descriptive label.
        df['span_of_control']=df['direct_reports'].apply(lambda x:'None' if x==0 else'Small (1-3)' if x<=3 else'Medium (4-6)' if x<=6 else'Large (7+)')
        return df
    def _calculate_management_compensation_adjustment(self,df):
        if df.empty:return df
        df['management_premium']=0.0
        df['span_premium']=0.0
        senior_mask=df['is_manager']&(df['management_level']=='senior')
        if any(senior_mask):
            df.loc[senior_mask,'management_premium']=(df.loc[senior_mask,'base_salary']*np.random.uniform(0.25,0.35,size=senior_mask.sum())).round(2)
        mid_mask=df['is_manager']&(df['management_level']=='mid')
        if any(mid_mask):
            df.loc[mid_mask,'management_premium']=(df.loc[mid_mask,'base_salary']*np.random.uniform(0.15,0.25,size=mid_mask.sum())).round(2)
        df['span_premium']=(df['direct_reports'].apply(lambda x:0.0 if x==0 else 0.05 if x<=3 else 0.10 if x<=6 else 0.15)*df['base_salary']).round(2)
        df['total_comp']=(df['base_salary']+df['management_premium']+df['span_premium']).round(2)
        return df
    def _update_team_metrics(self,df):
        manager_counts=df['manager_id'].value_counts()
        manager_mask=df['employee_id'].isin(manager_counts.index)
        df.loc[manager_mask,'direct_reports']=df.loc[manager_mask,'employee_id'].map(manager_counts)
        senior_mask=df['level']=='senior'
        mid_mask=df['level']=='mid'
        df.loc[senior_mask,'team_lead_projects']=df.loc[senior_mask,'active_projects'].apply(lambda x:random.randint(1,max(1,x)) if pd.notnull(x) else 0)
        df.loc[mid_mask,'team_lead_projects']=df.loc[mid_mask,'active_projects'].apply(lambda x:random.randint(0,max(0,x-1)) if pd.notnull(x) else 0)
        return df
    def _assign_outliers(self):
        total_outliers=max(len(self.departments),int(self.num_records*0.02))
        self.outlier_assignments={dept:{level:0 for level in self.levels}for dept in self.departments}
        for dept in self.departments:
            level=random.choice(self.levels)
            self.outlier_assignments[dept][level]+=1
            total_outliers-=1
        while total_outliers>0:
            dept=random.choice(self.departments)
            level=random.choice(self.levels)
            self.outlier_assignments[dept][level]+=1
            total_outliers-=1
    def _get_expected_count(self,department,level):
        return int(self.num_records*self.department_weights[department]*self.level_weights[department][level])
    def _calculate_total_compensation(self,df):
        df['total_comp']=df.apply(lambda row:row['base_salary']*(1+(0.2 if row['level']=='senior' else 0.15 if row['level']=='mid' else 0.1)+(0.05 if row['performance_score']>=4.5 else 0)),axis=1)
    def generate_dataset(self):
        try:
            employees=[self.employee_generator.generate_employee() for _ in range(self.num_records)]
            df=pd.DataFrame(employees)
            required_columns=['employee_id','department','level','position','base_salary']
            missing_columns=[col for col in required_columns if col not in df.columns]
            if missing_columns:raise ValueError(f"Missing required columns: {missing_columns}")
            df=self._assign_managers(df)
            df=self._calculate_management_compensation_adjustment(df)
            df=self._update_team_metrics(df)
            return df
        except Exception as e:
            print(f"Error generating dataset: {str(e)}")
            raise
    def _prepare_data_for_export(self):
        df=self.generate_dataset()
        df['hire_date']=df['hire_date'].dt.strftime('%Y-%m-%d')
        for column in df.select_dtypes(include=[np.number]).columns:df[column]=df[column].astype(float)
        list_columns=['industry_expertise','certifications']
        for col in list_columns:df[col]=df[col].apply(lambda x:', '.join(x) if isinstance(x,list) else x)
        return df
    def _save_to_excel(self,df,filepath):
        """Write ``df`` and a summary sheet to an Excel workbook at ``filepath``.

        The workbook has an 'Employee Data' sheet (no index column) and a
        'Summary Stats' sheet combining department counts, mean salary by
        level and mean utilization by department.
        """
        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='Employee Data', index=False)
            department_counts = df['department'].value_counts()
            salary_by_level = df.groupby('level')['base_salary'].mean()
            utilization_by_department = df.groupby('department')['actual_utilization'].mean()
            summary_stats = pd.DataFrame({
                'Department Distribution': department_counts,
                'Average Salary by Level': salary_by_level,
                'Average Utilization': utilization_by_department,
            })
            summary_stats.to_excel(writer, sheet_name='Summary Stats')
    def _save_to_json(self,df,filepath):
        """Write ``df`` to ``filepath`` as a UTF-8 JSON array with one object per row."""
        rows = df.to_dict(orient='records')
        with open(filepath, 'w', encoding='utf-8') as handle:
            json.dump(rows, handle, indent=2, ensure_ascii=False)
    def generate_and_save_data(self,base_filename='consultancy_data'):
        """Generate the dataset and save it as Excel and JSON under ``output/``.

        Args:
            base_filename: File name without extension; ``.xlsx`` and
                ``.json`` are appended.
        """
        df = self._prepare_data_for_export()
        output_dir = 'output'
        os.makedirs(output_dir, exist_ok=True)
        excel_path = os.path.join(output_dir, f'{base_filename}.xlsx')
        json_path = os.path.join(output_dir, f'{base_filename}.json')

        self._save_to_excel(df, excel_path)
        print(f"Excel file created successfully: {excel_path}")

        self._save_to_json(df, json_path)
        print(f"JSON file created successfully: {json_path}")

        print(f"Number of records: {len(df)}")
def main():
    """Generate a 1450-record dataset and save it, reporting success or failure.

    Errors are printed and not re-raised.
    """
    generator = ConsultancyDataGenerator(num_records=1450)
    try:
        generator.generate_and_save_data()
        print("Data generation completed successfully!")
    except Exception as error:
        print(f"An error occurred: {str(error)}")
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()