## Mounting

In [63]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on the drive can be opened by ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd

# Google Drive is mounted at /content/drive/, so the CSV is addressed by its
# full path under that mount point. Edit this path to match where your copy
# of the file is stored.
file_location = "/content/drive/My Drive/intern/08-14.csv"

# Read the whole CSV into the DataFrame that the following cells work on.
df = pd.read_csv(file_location)

In [65]:
# Show the column labels of the loaded data as a plain Python list.
df.columns.tolist()

['Company',
 '# of CUNY Alumni',
 '# of NYC Professionals',
 'Year Founded',
 'Sector',
 'Industry',
 'Job(s) or Opportunities',
 'Major(s)',
 'Skills',
 'Type',
 'Website']

In [66]:
# Number of rows per sector, shown as a one-column table.
df["Sector"].value_counts().to_frame()

Unnamed: 0,Sector
Business,152
Tech,119
Health,74
Law,60


# Functions

## Get major

In [0]:
def get_major(data, sector="Any", unique=False):
  """Collect the individual majors listed in the "Major(s)" column.

  Each cell is converted to a string and split on ", ", and the pieces of all
  cells are gathered into one flat list. A missing cell therefore contributes
  the literal string "nan".

  Parameters:
    data: DataFrame with a "Major(s)" column (and a "Sector" column when
      `sector` is not "Any").
    sector: "Any" to use every row, otherwise only rows whose "Sector"
      equals this value are used.
    unique: when true, repeated majors are dropped, keeping the order of
      first appearance.

  Returns:
    list of str
  """
  rows = data if sector == "Any" else data.loc[data["Sector"] == sector]
  pieces = [
      piece
      for cell in rows["Major(s)"].tolist()
      for piece in str(cell).split(", ")
  ]
  if not unique:
    return pieces
  return pd.Series(pieces).unique().tolist()
#get_major(df, "Business", True)

## Get skill

In [0]:
def get_skill(data, sector="Any", unique=False):
  """Collect the individual skills listed in the "Skills" column.

  Each cell is converted to a string and split on ", ", and the pieces of all
  cells are gathered into one flat list. A missing cell therefore contributes
  the literal string "nan".

  Parameters:
    data: DataFrame with a "Skills" column (and a "Sector" column when
      `sector` is not "Any").
    sector: "Any" to use every row, otherwise only rows whose "Sector"
      equals this value are used.
    unique: when true, repeated skills are dropped, keeping the order of
      first appearance.

  Returns:
    list of str
  """
  rows = data if sector == "Any" else data.loc[data["Sector"] == sector]
  pieces = [
      piece
      for cell in rows["Skills"].tolist()
      for piece in str(cell).split(", ")
  ]
  if not unique:
    return pieces
  return pd.Series(pieces).unique().tolist()


In [0]:
#pd.Series(get_skill(df, "Any", True)).sort_values().tolist()

## Get job title

In [0]:
def get_position(data, sector="Any", unique=False):
  """Collect the individual job titles from "Job(s) or Opportunities".

  Parameters:
    data: DataFrame with "Sector" and "Job(s) or Opportunities" columns.
    sector: a sector name, or "Any" to gather the Health, Law, Business and
      Tech titles (concatenated in that order).
    unique: when true, repeated titles are dropped, keeping the order of
      first appearance.

  Returns:
    list of str. Cells are converted to strings before splitting, so a
    missing cell contributes the literal string "nan".
  """
  if sector == "Any":
    titles = []
    for name in ("Health", "Law", "Business", "Tech"):
      titles.extend(get_position(data, name))
  else:
    # Law rows are split on ". "; every other sector is split on ", ".
    separator = ". " if sector == "Law" else ", "
    cells = data.loc[data["Sector"] == sector]["Job(s) or Opportunities"].tolist()
    titles = [part for cell in cells for part in str(cell).split(separator)]
  return pd.Series(titles).unique().tolist() if unique else titles

In [0]:
# pd.Series(get_position(df, "Tech", True)).value_counts()

## Group Tech

In [0]:
def group_tech(position):
  """Map a Tech-sector job title to the label of the group it belongs to.

  The title must match a listed entry exactly (including case and any
  leading/trailing spaces). Groups are checked in the order written below and
  the first group containing the title wins.

  Parameters:
    position: str -> the job title to classify

  Returns:
    The group label (str), or False when the title is not listed in any
    group. In the latter case the title is also printed as a quoted string
    followed by a comma (presumably so it can be pasted into one of the
    lists -- confirm with the author).
  """
  groups = (
      ("Sales", [
          "Sales Development Associate",
          "Sales",
          "Customer Representative",
          "CRM Analyst",
          "Business Intelligence",
          "Marketing",
          "Sales Development Representative",
          "Digital Marketing",
          "Social Media Specialist",
          "Senior Brand Designer",
          "Growth Analyst",
          "Sales Development",
          "Customer Support",
          "Service Desk Engineer",
          "Associate Account Manager",
          "Account Executive",
          "Senior Presales Engineer",
          "Customer Successs and Implementation Specialist",
          "Technical Account Manager",
          "Client Relations Specialist",
          "Marketing Manager",
          "Business Development",
          "Demand Generation Specialist",
          "Business Analyst",
          "Business Strategy",
          "Marketing Strategy",
          "Business",
          "Associate Account Executive",
          "Demand Generation",
          "Sales Executive",
          "Sales Generation",
          "Customer Success",
          "Solutions Engineer",
          "Sales Engineer",
          "Business Development Representative",
          "Customer Success Operation Manager",
          "Sales Developement Representative",
          "Customer Success Specialist",
          "Client Care Associate",
          "Sales Representative",
          "Business Operations & Strategy Manager",
          "Financial Planning Associate",
          "Demand/MRP Planning Associate",
          "Customer Support Rep.",
          "Manager of Marketing",
          "Sales Manager",
          "Product Marketing Manager",
          "Customer Success Associate",
          "Enterprise Account Manager",
          "Customer Success Manager",
          "Sales Support Specialist",
          "Sales Rep.",
          "Business Insights Analyst",
          "Community Manager",
          "Business Dev. Associate",
          "Enterprise B2B Account Exec.",
          "Financial Operations Analyst",
          "Lead SalesForce Dev.",
          " Lead SalesForce Dev.",
      ]),
      ("Software Engineer", [
          "Software Engineer",
          "Associate Software Developer",
          "Engineer: Storage Application",
          "Software Developer",
          "Fullstack Engineer",
          "Product Engineer",
          "Web Developer",
          "Software Support & Developer",
          "Software Engineer Trainee",
          "Staff Applications Engineer",
          "Python Software Engineer",
          "Java Developer",
          "Full Stack Developer",
          ".Net Developer",
          "C++ Developer",
          "Core Java Software Developer",
          "Full-Stack Engineer",
          "Full Stack Engineer",
          "Platform Software Engineer",
          "BIM Applications Specialist",
          "Fullstack Java Developer",
      ]),
      ("Data Scientist", [
          "Data Scientist",
          "Staff Data Scientist",
          "Data Quality Analyst",
          "Machine Learning Engineer",
          "Collections Analyst",
          "Data Engineer",
          "Data Administrator",
          "Data Analyst",
          "Data Platform Engineer",
          "Data Consultant",
          "Research Analyst",
          "Big Data Developer",
          "Machine Learning Developer",
          "Data Scientist ",
          "Data Warehouse Developer",
          "Senior Data Analyst",
          "Head of Data",
          "VP of Data Science",
      ]),
      ("Miscellaneous", [
          "Reporter",
          "Network Consultant",
          "Site Lead",
          "Consultant",
          "General Counsel",
          "Administrator",
          "Legal Analyst",
          "Operation Manager",
          "Intern",
          "Analyst",
          "Videographers",
          "PR",
          "Communication Associate",
          "Job Captain",
          "Resident Engineer",
          "Survey Technicians",
          "Assistant Engineer",
          "Solution Consultant",
          "None",
          "Coordinator",
          "Office Experience Coordinator",
          "Billing Coodinator",
          "Social Media Manager",
          "People Business Partner",
          "Events Coordinator",
          "Brand Manager",
          "Manager of Wedding Planning Tools",
          "Merchandising Assistant",
          "Counsel Associate",
          "nan",
          "Research Internship",
          "Research Scientist",
          "Internal Communications Director",
          "Murex RISK Business Analyst",
          "Murex Functional Analyst",
          "Murex Configuration and Environment Management",
          "Murex FO",
          "Murex Consultant",
          "Regional Operations Associate",
          "Ticket Distribution Agent",
          "Express Shopper",
          "Outbound Dept. Manager",
          " Corporate Controller",
          "Paid Social Manager",
          "Corporate Controller",
      ]),
      ("DevOps", [
          "DevOps Engineer",
          "Infrastructure Developer",
          "DevOps",
          "Platform Engineer",
          "Integration Engineer",
          "Infrastructure Applications Specialist",
          "Linux DevOps Engineer",
      ]),
      ("Accounting", [
          "Accouting",
          "Accountant",
          "Revenue Accountant",
          "Staff Accountant",
          "Accounts Payable Clerk",
      ]),
      ("HR", [
          "Recruiter",
          "Recruiting Coordinator",
          "HR",
          "Sales Recruiter",
          "Architectural Recruiter",
      ]),
      ("Frontend", [
          "Qualitative UX Researcher",
          "Frontend Developer",
          "Front End Engineer",
          "Frontend Engineer",
          "UX Designer",
          "Graphic Designer",
          "Visual Designer",
          "Front End Developer",
          "React Engineer",
          "Front End Software Engineer",
          "Head of User Experience",
      ]),
      ("IT", [
          "IT Support",
          "Contract Technician",
          "Technician",
          "Support Engineer",
          "IT Support Specialist",
          "Junior Technical Analyst",
          "EDI Senior Specialist",
          "Technical Support Engineer",
          "App. Support Analyst",
          "Telecom/Networking Director",
      ]),
      ("Product Manager", [
          "Product Manager",
          "Head of Product",
          "Product Manager Machine Learning",
          "Project Manager",
          "Product Designer",
          "Engineering Lead",
          "Learning & Dev. Manager",
          "Product Dev. Manager",
          "Platform Product Director",
          "Director of Product Dev.",
          "Technical Product Manager",
          "Self-Service Experiences Product Manager",
          "Product Management Director",
          "Content Platform Product Manager",
          "Portfolio Manager",
      ]),
      ("Quality Assurance", [
          "Quality Assurance Engineer",
          "QA Engineer",
          "QA",
          "QA Automation Engineer",
          "SDET Engineer",
          "QA Analyst",
      ]),
      ("Security", [
          "Senior Security Engineer",
          "Security Engineer",
          "Information Security Specialist",
          "Network Security Engineer",
          "Cyber Risk Analyst",
          "Chief Information Security Officer",
      ]),
      ("Mobile Engineer", [
          "Mobile Engineer",
          "IOS Developer",
          "Mobile Software Engineer",
          "IOS Engineer",
          "Android Developer",
      ]),
      ("Backend Engineer", [
          "Ruby Developer- Back End",
          "Backend Engineer",
          "Platform Reliability Engineer",
          "Backend Engineering Manager",
          "Back End Software Developer",
          "Back End Software Engineer",
          "Backend Software Engineer",
      ]),
      ("Electrical Engineer", [
          "Electrical Engineer",
          "Project/Senior Electrical Engineer",
          "Entry Level Electrical Engineer",
      ]),
      ("Architecture", [
          "Interior Architecture and Design",
          "Architect",
          "Urban Planner",
      ]),
      ("Mechanical Engineer", [
          "Mechanical Engineer",
      ]),
      ("Civil Engineer", [
          "Civil Engineer",
          "Geotechnical Engineer",
          "Construction Engineer",
      ]),
      ("Structural Engineer", [
          "Structural Engineer",
          "Junior Bridge Engineer",
          "Junior Structural Engineer",
      ]),
  )
  for label, titles in groups:
    if position in titles:
      return label
  # Title is not listed anywhere: echo it and signal the miss with False.
  print("\"" + position + "\",")
  return False

## Group Health

In [0]:
def group_health(position):
  """Map a Health-sector job title to the label of the group it belongs to.

  The title must match a listed entry exactly. Groups are checked in the
  order written below and the first group containing the title wins (for
  example "Outreach" is listed under both Coordinator and Social Worker and
  resolves to "Coordinator").

  Parameters:
    position: str -> the job title to classify

  Returns:
    The group label (str), or False when the title is not listed in any
    group. In the latter case the title is also printed as a quoted string
    followed by a comma.
  """
  groups = (
      ("Administrator", [
          "Member Services Administrator",
          "Office Support",
          "Registrar",
          "Front Desk",
          "Admission Clerk",
          "Administrative Assistant",
          "General Counsel",
          "Health Information Clerk",
          "Office Assistant",
          "Unit Clerk",
          "Program Supervisor",
          "Medical Office Front Desk",
          "Member Services Administrator",
      ]),
      ("Marketing", [
          "Digital Marketing Associate",
          "Marketing Director",
          "Media Relation",
          "Customer Success Associate",
          "Marketing",
          "Product Marketing Coordinator",
          "Product Marketing Manager",
      ]),
      ("Human Resources", [
          "Human Resources Coordinator",
          "Payroll Manager",
          "HR",
          "Payroll & Benefit Specialist",
          "Human Resources Coordinator",
          "Payroll Manager",
      ]),
      ("IT", [
          "Technician",
          "IT",
          "IT Technician",
          "IT Service",
          "Technical Support",
          "Technical Support Engineer",
          "IT support",
          "IT Engineer",
          "IT Service",
          "Technical Support",
          "IT Operations Analyst",
          "Finance IT Director",
          "Corporate IT Engineer",
          "Computer Information System",
          "Information System Analyst",
      ]),
      ("Coordinator", [
          "Care Coordinator",
          "Studio Coodinator",
          "Coordinator",
          "Outreach",
          "Studio Coordinator",
          "Weekend Opener",
          "Studio Coodinator",
          "Community Relations Coordinator",
      ]),
      ("Miscellaneous", [
          "Swim Instructor",
          "None",
          "Graphic Designer",
          "Residental Counselor",
          "Intern",
          "Work Study Associate",
          "Swim Instructor",
          "Graphic Designer",
      ]),
      ("health professional", [
          "Patient Care Associate",
          "Physician Assistant",
          "Medical Assistant",
          "Physical Assistant",
          "Pharmacist",
          "Dietition",
          "Health Fellowship",
          "Medical Officer",
          "Child Focused Adaption Services Recruiter",
          "Hospitalist",
          "Nutritionist",
          "Nursing Assistant",
          "Caregiver",
          "305 PWR/FLX Instructor",
          "Instructor",
          "Overnight Custodian",
      ]),
      ("Customer Representative", [
          "Patient Access Representative",
          "Customer Service Representative",
          "Customer Relation",
          "Customer Representative",
          "Sales Development Representative",
          "Customer Support Representative",
          "Sales Development Rep",
          "Customer Care",
          "Customer Relation",
          "Patient Representative",
          "Sales",
          "Overnight Member Services Rep.",
          "Member Services Rep.",
      ]),
      ("Accountant", [
          "Junior Accountant",
      ]),
      ("Therapist", [
          "Massage Therapist",
          "Occupational Therapy Assistant",
          "Occupational Therapist",
          "Physical Therapist",
      ]),
      ("Executive", [
          "Project Manager",
          "Secretary",
          "Vendor Management",
          "Client Success Manager",
          "Account Manager",
          "Strategic Account Management",
          "Office Manager",
          "Revenue Management",
          "Medical Secretary",
          "Development Manager",
          "Case Manager",
          "Assistant Studio Manager",
          "Management Team",
          "Global Infrastructure Director",
          "Customer Acquisition VP",
          "Global Network Services Director",
          "Director of Perioperative Services",
          "Operating Room Nurse Manager",
          "Assistant Director of Marketing and Outreach",
      ]),
      ("Programmer", [
          "Android Developer",
          "DevOps",
          "Web Developer",
          "Software Engineer",
          "Fullstack Engineer",
          "Tech Fellowship",
          "DevOps Engineer",
          "Cloud Operations Engineer",
          "Integrations Developer",
          "Client Platform Engineer",
      ]),
      ("Social Worker", [
          "Social Worker",
          "Educator",
          "Outreach",
          "Social Work Coordinator",
      ]),
      ("Health Tech", [
          "Lab Techonologist",
          "Radiology",
          "Medical Technologist",
          "Biomedical Engineer",
          "Mechanical Inspector",
          "Mechanic",
          "Lab Supervisor",
          "Histotechnologist",
      ]),
      ("Doctor", [
          "Gastroenterologist",
          "Anesthesiologist",
          "Pediatrics",
          "Neuropsychology",
          "Dermatologist",
          "Rheumatologist",
          "Psychiatrist",
          "Attending Physician",
          "Physician",
          "Physicians",
          "Neorosurgeon",
          "Urologist",
      ]),
      ("Nurse", [
          "Nurse",
          "Nurse Practitioner",
          "Practitioner Nurse",
          "Office Nurse",
      ]),
      ("Food Service Worker", [
          "Food Service Worker",
          "Cook",
      ]),
      ("Data Scientist", [
          "Data Analyst",
          "Data Scientist",
          "Customer Data Specialist",
          "Marketing Data Analyst",
          "Data Engineer",
          "Advanced Analyst",
          "Data Visualization Engineer",
          "Data Analyst",
          "Health Information Assistant",
          "Data Sci. Project Manager",
          "Provider Information Specialist",
          "Senior Data Scientist",
      ]),
      ("Business", [
          "Business Associate",
          "Development Operations",
          "Financial Counselor",
          "Business Development",
          "Business Fellowship",
          "Business Associate",
          "Development Operations",
          "Financial Counselor",
          "Media Operations Analyst",
          "Business Analyst",
      ]),
  )
  for label, titles in groups:
    if position in titles:
      return label
  # Title is not listed anywhere: echo it and signal the miss with False.
  print("\"" + position + "\",")
  return False

## Group Law

In [0]:
def group_law(position):
  """Map a Law-sector job title to the label of the group it belongs to.

  The title must match a listed entry exactly; several entries are listed
  both with and without a trailing period, and both spellings are kept.
  Groups are checked in the order written below and the first group
  containing the title wins.

  Parameters:
    position: str -> the job title to classify

  Returns:
    The group label (str), or False when the title is not listed in any
    group. In the latter case the title is also printed as a quoted string
    followed by a comma.
  """
  groups = (
      ("Paralegal", [
          "Intellectual Property Paralegal",
          "Corporate Paralegal",
          "Capital Markets Paralegal",
          "Bankruptcy Paralegal",
          "Paralegal",
          "Real Estate Paralegal",
          "Trusts & Estates Paralegal.",
          "Paralegal.",
      ]),
      ("Associate/Assistant", [
          "Associate.",
          "Legal Assistant.",
          "Associate",
          "Legal Secretary",
          "Law Clerk",
          "Judicial Clerk",
          "Asset Management Legal Assistant",
          "Practice Assistant.",
          "Corporate Legal Assistant",
          "Litigation Legal Assistant",
          "Practice Assistant",
          "Attorney Support Assistant",
          "Case Assistant",
          "Legal Executive Assistant",
          "Practice & Office Development Assistant",
          "Legal Assistant",
      ]),
      ("Intern position", [
          "Summer Associate",
          "Summer Associate, People Advisory Manager",
          "Fellowship.",
          "Fellowship",
          "Pro Bono Intern",
          "IT Intern",
          "HR Intern",
          "Attorney Recruiting & Development Intern",
      ]),
      ("Business Related", [
          "Financial Analyst.",
          "Manager & Assistant Manager of Business Development",
          "Procurement Specialist",
          "M&A Financial Advisory Associate",
          "Tax Associate",
          "Business Development Manager/Specialist",
          "Billing Specialist",
          "International Trade Analyst.",
          "Capital Markets Practice Manager",
          "Manager of Financial Services Business Intelligence",
          "Professional Development Manager",
          "Business Development Analyst",
          "Business Development Specialist/Manager",
          "Business Development Specialist",
          "Senior Pricing Analyst",
          "Financial Systems Analyst.",
          "Billing Supervisor",
          "Business Development Specialist & Assistant",
          "Practice Business Development Manager",
          "Financial Accounting Supervisor",
          "Billing Sepcialist",
          "Marketing& Business Development Specialist.",
          "Treasury Analyst",
          "Sourcing & Procurement Specialist",
          "Pricing Strategist",
          "Internal Auditor",
          "Accountant",
          "Partner Accounting Supervisor",
          "Manager of Business Continuity",
          "Junior Pricing Financial Analyst",
          "Financial Analyst",
          "Finance Systems Analyst",
          "Finance Project Manager",
          "Client Operations Supervisor",
          "Client Development Intern",
          "Business Intelligence Analyst",
          "Business Development Manager/Coordinator",
          "Billing Administration Assistant",
          "Business Intelligence Developer",
          "Client Development Specialist",
          "Pricing Analyst, Manager",
          "Billing Coordinator",
          "Market Intelligence Analyst.",
          "Marketing & Business Development Manager/Director",
      ]),
      ("Tech Related", [
          "Enterprise Applications Engineer",
          "Director of Global IT Operations",
          "SQL Database Administrator",
          "Audio/Visual Specialist.",
          "Desktop Engineer.",
          "Desktop Support Analyst",
          "End User Systems Engineer",
          "Applications Engineer",
          "Sharepoint Architect/Senior Developer",
          "Server/Storage Engineer",
          "Business Systems Engineer",
          "Database Administrator",
          "Digital Services Specialist",
          "IT Practice Support Specialist",
          "IT Support Technician",
          "Storage & Virtualization Manager",
          "Project Specialist, Data Science, Analysis & Investigation.",
          "Systems Analyst",
          "Systems Solutions Analyst",
          "Project Manager",
          "Network Engineer",
          "Assistant Motion Graphics & Digital Designer",
          "Information Governance Assistant",
          "Information Security Analyst",
          "Library Technical Services",
          "SharePoint Application Developer",
      ]),
      ("Administrator", [
          "SharePoint Administrator",
          "Judicial Clerkship",
          "Application Administrator",
          "Receptionist",
          "Facilities Manager",
          "Work Allocation Manager",
          "Office Services Clerk",
          "Administrative Assistant",
          "Collection Management Specialist",
          "General Services Clerk",
          "Assistant Managing Clerk",
          "Managing Clerk",
          "Catering Assistant",
          "Copy Center Operator",
          "Document Imaging/Records Clerk.",
          "Conference Center Administrative Assistant",
          "Records Manager",
          "Administrative Supervisor",
          "Administrative Assistant.",
          "Firm Operations Manager",
          "Document Processing Manager",
          "Administrative Staff.",
          "Business Development Manager, Specialist, Assistant",
          "Duplicating Operator",
          "Word Processing Operator/Desktop Publishing Specialist.",
          "Secretary.",
          "Office Services Specialist",
          "Executive Assistant",
          "Staff.",
          "Professional Staff.",
      ]),
      ("Miscellaneous", [
          "Mid-Level Securities Associate",
          "Media, Technology & Commerical Transactions Associate",
          "Lateral",
          "User Support Specialist",
          "Librarian",
          "Supervisor of Word Processing",
          "Security Officer",
          "Conflicts Analyst",
          "General Lateral Applications",
          "None",
          "US Brand, Creative & Digital Graphic Designer",
          "Client Services Manager",
          "Knowledge Manager",
          "Media Relations & Communications Manager.",
          "Director of Corporate Practice",
          "Manager of Secretarial Services",
          "Research Analyst.",
          "Practice Development Manager.",
          "HRIS Analyst",
          "Litigation Analyst",
          "Junior Conflict Analyst",
          "Knowledge & Digital Services Librarian",
          "Global Pitch & Proposal Panel/RFP Advisor",
          "Research Librarian",
          "Temporary Graphic Designer",
          "Client Development Advisor",
          "Manager of Global Attorney Training & Mentoring",
          "Client Service Specialist",
          "Department Assistant",
          "Legal Project Manager",
          "New Business Proposal Manager",
          "Diversity & Inclusion Manager",
          "Corporate Practice Manager",
          "HRIS Reporting Analyst",
          "MicroStrategy Product Lead",
          "Public Relations Manager, Specialist",
          "Project Management Specialist",
          "Litigation Support Project Manager",
          "Proofreader",
          "Patent Agent",
          "Practice Manager",
          "Proposal Writer.",
          "Litigation Support.",
          "Lateral.",
          "Litigation Business Development.",
      ]),
      ("Coordinator", [
          "Senior Billing Coordinator",
          "CLE Coordinator",
          "Conference Services Assistant",
          "Collections Coordinator.",
          "Electronic Resources Coordinator",
          "Electronic Resources Manager",
          "Public Relations & Communications Coordinator",
          "Litigation Business Development Coordinator",
          "Events Manager",
          "Marketing Coordinator",
          "Lateral., Practice Recruitment & Professional Development Coordinator",
          "Legal Recruiting Coordinator",
          "Business Development Coordinator.",
          "Client Accounting Coordinator",
          "Media Relations Coordinator",
          "E-Billing and Inventory Data Coordinator",
          "E-Billing Coordinator",
          "Directories Coordinator",
          "HR Coordinator",
          "Billing Coordinator.",
          "Legal Recruiting Coordinator.",
          "Litigation Services Coordinator",
          "Global Attorney Development Coordinator",
          "Paralegal Services Coordinator",
          "Collections Coordinator",
          "Administrative Coordinator",
          "Pricing & Billing Rate Coordinator",
      ]),
      ("Human Resources", [
          "Employee Benefits Associate",
          "Payroll Specialist/Coordinator",
          "Recruiting Assistant",
          "Lateral Partner Recruiting Manager",
          "Talent Acquisition Sepcialist",
          "Legal Recruiting Assistant",
          "Attorney Development Manager",
          "Professional Development Assistant",
          "HR Manager",
          "Global Benefits Generalist",
          "HR Business Partner",
          "Director of Global Retirement Benefits",
          "Talent Acquisition & Development Coordinator",
          "HR Coordinator/Director",
          "Employee Benefits Practice Manager",
          "Attorney Development Assistant, Manager, Specialist",
          "Practice Development Manager",
          "Technical Trainer",
          "HR Recruiting Assistant",
      ]),
      ("Counsel / Lawyer", [
          "Corporate M&A Associate/Counsel",
          "Insurance Knowledge Management Lawyer",
          "Investment Funds Knowledge Management Lawyer",
          "Attorney",
          "Contract Attorney",
          "Lawyer",
          "Attorney.",
          "Knowledge Management Lawyer",
      ]),
  )
  for label, titles in groups:
    if position in titles:
      return label
  # Title is not listed anywhere: echo it and signal the miss with False.
  print("\"" + position + "\",")
  return False

## Group Business

In [0]:
def group_business(position):
  """Map a Business-sector job title to its coarse job category.

  The title is compared by exact equality (case-sensitive, no trimming)
  against the hand-curated title groups below. Groups are tested in the
  order they are listed and the first group containing the title wins.
  A few titles are listed in more than one group (for example
  'Account Receivable Analyst' under both Analyst and Accountant,
  'Insurance Advisory' under both Insurance and Consultant); for those the
  earlier group decides the result.

  Misspelled titles such as 'HR Generlist' are kept verbatim on purpose:
  matching is exact, so "correcting" an entry would stop it from matching
  whatever spelling it was copied from.

  Parameters:
    position: the job title to classify. Normally a string; any other
      value simply matches no group.

  Returns:
    The category label (str) of the first matching group, or False when
    the title is in no group. In the False case the title is also printed
    as a quoted, comma-terminated line so it can be pasted into one of
    the groups.
  """
  # (label, titles) pairs in priority order; the order reproduces the
  # original if/elif chain, so results for overlapping titles are unchanged.
  groups = (
    ("Tech Related", (
      'Advisory IT Strategy & Process Manager',
      'Strategic Cloud Services Consulting Manager',
      'Oracle Reporting & Analytics Consultant',
      'Oracle HCM Learning Solutions Consultant',
      'Oracle HCM Cloud Solutions Architect',
      'Oracle HCM Cloud Compensation Solutions Architect',
      'Oracle EPM Cloud Solution Architect',
      'Functional Lead & IT Support Coordinator',
      'Cyber Security Specialist & Penetration Tester',
      'IT Assurance Associate',
      'Web Services/SharePoint Specialist',
      'Dynamics 365 CRM Developer',
      'Cyber Security & Privacy Manager',
      'Data Privacy Associate',
      'IT Helpdesk Analyst',
      'Controls Advisory IT Associate',
      'Cyber Risk Data Protection & Privacy Manager',
      'Cyber Risk Manager',
      'Data Warehouse & Business Intelligence Advisor',
      'Digital Transformation & Management Manager',
      'Microsoft Dynamics 365 ERP Functional Lead',
      'IT Advisory Consultant',
      'Technology Solutions',
      'IT Support and Implementation Coordinator',
      'AWS Technology',
      'Cloud/DevOps Engineer',
      'Cloud Architect',
      'Technology Strategy & Operations',
      'Big Data Software Engineer',
      'Full Stack Developer',
      'Front-End Developer',
      'Freelance Software Engineer',
      'Java Developer',
      'Front-End Developer',
      'Digital Services Leader',
      'Intelligent Transporation Systems Leader',
      'Lead of Technology',
      'Test Engineer',
      'Full-Stack Software Engineer',
      'Fullstack Developer',
      'IT Helpdesk Dispatcher',
      'C# Developer',
      'Test Lead',
      'Front End Engineer',
      'IT Infrastucture & Security Engineer',
      'Software Engineer',
      'Full Stack',
      'Lead Network Engineer',
      'Application Developer',
      'Software Developer',
      'Software Engineer',
      'Network Security Engineer',
      'Insurance Securities Structurer',
      'Multimedia Operations Engineer',
      'Engineering Manager',
      'IT',
      'Product Dev.',
      'Chief Engineer',
      'Senior Software Engineer',
      'Junior Network Engineer',
      'Network Engineer',
    )),
    ("Data Science", (
      'Healthcare Data Scientist',
      'Healthcare Data Analyst',
      'Data Integration Scientist',
      'Data Engineer',
      'Data Visualization',
      'Data Architect',
      'Data Analytics',
      'BIM Designer',
      'Pricing Analytics Lead',
      'Information Architect',
      'Public Relations Data & Analytics Director',
      'Data Scientist',
      'Data Encoder',
      'Data Asscoiate',
      'Data Analyst',
      'Data Architect Consultant',
      'Data Developer',
      'Data',
      'Data Solution Architect',
      'HealthCare Data Analyst',
      'Data Coordinator',
    )),
    ("Intern Position", (
      'Tax Admin Intern',
      'Accounting Internship',
      'Internship',
      'Staff Accountant Intern',
      'Campus Recruiting Intern',
      'IT Help Desk Internship',
      'Tax Intern',
      'Accounting Internship',
      'Internship',
      'Accounting Intern',
      'Summer Design Intern',
      'Analytics & Reporting Intern',
      'Research Intern',
      'Event Marketing Intern',
      'Finance Intern',
      'Cyber Risk Intern',
      'Internship',
      'Intern',
    )),
    ("Insurance", (
      'Forensic Accounting & Investigations Associate',
      'Forensic Accounting & Investigations Manager',
      'Insurance Claims Forensic Accounting Manager',
      'Health Care Consultant',
      'Insurance Advisory',
      'Insurance Analyst',
      'Underwriter',
      'Medical Claims Examiner',
      'Material Damage Specialist',
      'Treaty Underwriter',
      'Compliance',
      'Clinical',
      'Insurance Operations',
    )),
    ("Analyst", (
      'Valuation & Business Analytics Director',
      'Technical Analyst',
      'Healthcare Claims Research Analyst',
      'Retiree Health Analyst',
      'Business Analyst',
      'Marketing Science Analyst',
      'Senior Financial Analyst',
      'AML Analyst',
      'Information Security Analyst',
      'Account Receivable Analyst',
      'Financial Analyst',
      'Investment Reporting Senior Analyst',
      'Real Estate Fund Analyst',
      'Operations Analyst',
      'Risk & Portfolio Analyst',
      'Credit Analyst',
      'Loan Operations Analyst',
      'Risk Analyst',
      'Investor Relation Analyst',
      'Modeling Analyst',
      'Captial Market Analyst',
      'Senior Analyst- Model Validation',
      'Analyst',
      'Risk Analytics Associate',
      'Corporate Systems Analyst',
      'Writer & Analyst',
      'Business Intelligence Analyst',
      'Financial Analyst',
      'Risk',
      'HRIS Analyst',
      'Project Business Analyst',
      'Summer Quantitative Finance Analyst',
      'Quantitative Associate',
      'Analytics Associate',
      'Analysis',
      'Risk',
    )),
    ("Business Development", (
      'STS Research & Development Associate',
      'Business Development Director',
      'Business Restructuring Services Associate',
      'Business Restructuring Services Manager',
      'Business Info Systems Manager',
      'Assurance & Advisory Partner',
      'Business Valuation Senior or Manager',
      'CFO Advisory Services Manager',
      'ERP Consultant',
      'ERP Project Manager',
      'Learning & Development Administrator',
      'Business Intelligence Research Associate',
      'Business Operations Financial Services Director',
      'Finance Transformation Manager',
      'Readiness Planning Project Manager',
      'Strategy Manager',
      'Executive Assistant',
      'CFO Advisory Manager',
      'Service Strategy & Design Manager',
      'Innovation Expert',
      'Organizational Effectiveness',
      'Business Development',
      'Enterprise Improvement',
      'Business Development Specialist',
      'Business Development Manager',
      'Business Development Associate',
      'Corporate Ops & Transformation Product Manager',
      'Business Intelligence Product Manager',
      'Survey Manager',
      'Strategic Solutions Associate',
      'Account Manager',
      'Account Director',
      'PR Account Supervisor',
      'Public Relations Professional',
      'Global Communications Lead',
      'Public Relations Account Supervisor',
      'Health Public Relations Director/Executive',
      'Media Relations Specialist',
      'Research & Business Engagement',
      'Restructuring Advisory',
      'Facilities Coordinator',
      'Team Lead of Financial Products Consolidations',
      'Geller Advisors Family Office CFO',
      'Operations Associate',
      'Research Associate',
      'Financial Advisor',
      'Business Dev',
      'Board Relations',
      'Regional Advising',
      'Portfolio Operations Associate',
      'Renewals Representative',
      'VP of Strategic Partnerships',
      'Finance Associate',
      'Area Vice President',
      'Producer',
      'Account Manager',
      'Coordinator',
      'Business Development Coordinator',
      'Account Coodinator',
      'Program Associate',
      'Provider Relations Specialist',
      'Retention Representative',
      'Strategic Planner',
      'Corporate Strategy & Pricing Associate',
      'Corporate Development Associate',
      'Head of Sourcing & Procurement',
      'Network & Partnerships',
      'Member Care & Service',
      'Business',
      'Managed Care Coordinator',
      'Practice Assistant',
      'Management',
      'Regional',
      'Investor Relations',
      'Project',
      'Loan Compliance',
      'Valuation Services Director',
      'Assitant Manager',
    )),
    ("Miscellaneous", (
      'Proofreading Assistant',
      'Executive Compensation Senior Associate',
      'Operations & Portfolio Strategy Associate',
      'Strategic Sourcing & Contracts Director',
      'Construction Delay Associate',
      'Healthcare Advisory Manager',
      'Nonprofit Associate',
      'Managed Care Director',
      'None',
      'Controller/Director of Finance',
      'Transfer Pricing Manager',
      'Power BI Training Contractor',
      'Managed Care Director',
      'None',
      'Cyber Security Specialist & Penetration Tester. Transaction Advisory Services Director',
      'Administrative Controller',
      'Construction Controller',
      'Financial Proofreader',
      'Advisory Supervisor',
      'Regulatory & Compliance Solutions Manager',
      'A/R Coordinator',
      'Forensics and Litigation Services Director',
      'ERS Director',
      'Nonprofit Advisory Services Associate',
      'Attorney',
      'Legal Counsel',
      'Corporate Paralegal',
      'Plumbing Engineer',
      'Mechanical Engineer',
      'Electrical Engineer',
      'Rail Systems Engineer',
      'Tunneling Engineer',
      'Regulatory Controller',
      'International Controller',
      'Fire Safety',
      'Medical Assistant',
      'Various',
      'Legal Staff Paralegal',
      'Paralegal',
    )),
    ("Real Estate", (
      'Real Estate Assurance Associate',
      'Real Estate Lead',
      'Real Estate Advisory',
      'Building',
      'Property Manager',
      'Real Estate Analyst',
      'Real Estate Assistant',
      'Leasing',
      'Lease Analyst',
      'Retail Leasing Manager',
      'Resident Manager',
    )),
    ("Human Resources", (
      'HR Manager',
      'HR Generalist',
      'HR Director',
      'HR Associate',
      'HR Assistant',
      'Employee Benefits Producer',
      'Information Resources Associate',
      'HR Analyst',
      'HR Associate',
      'Principal',
      'Strategic Staffing',
      'HR Coordinator',
      'Campaign Manager',
      'Benefits Manager',
      'Payroll Analyst',
      'SEO Specialist',
      'HR Generlist',
      'HR',
      'International Benefits',
      'Employee Benefits',
      'HRIS',
      'Benefits',
      'Talent Aquistion Specialist 1',
    )),
    ("Recruiter", (
      'Recruiter',
      'Campus Recruiting Assistant',
      'Placement Specialist',
    )),
    ("Auditor", (
      'Auditor',
      'Audit Manager',
      'China Practice Audit',
      'Financial Services Audit Manager',
      'Audit Associate',
      'Audit Senior',
      'IT Audit Manager',
      'Audit Staff Accountant',
      'Audit Accountant',
      'China Practice Audit Accountant',
      'Assurance Manager',
      'Assurance Associate',
      'Audit Associate',
      'Audit Senior',
      'IT Audit Manager',
      'Audit Staff Accountant',
      'Audit Accountant',
      'China Practice Audit Accountant',
      'Assurance Senior',
      'Assurance Professional',
      'IT Audit Services Associate',
      'Assurance Director',
      'Consulting Internal Auditor',
      'Internal Audit Staff Analyst',
      'Audit',
    )),
    ("Tax Related", (
      'Tax Manager',
      'Tax Senior',
      'International Tax Services',
      'International Tax',
      'Tax',
      'Income Franchise SALT Managing Director',
      'Core Tax Services Manager',
      'SALT Associate',
      'Transfer Pricing Tax Associate',
      'Transfer Pricing Tax Manager',
      'Tax Accountant',
      'International Tax Staff Accountant',
      'Tax Associate',
      'Tax Partner',
      'Financial Services Tax Staff',
      'International',
      'SALT Compliance Manager',
      'Tax Accountant',
      'International Tax Staff Accountant',
      'Trust & Estates Tax Manager',
      'International Tax Manager',
      'SALT Manager',
      'Tax Reporting & Advising Functional Optimization Manager',
      'Tax Operations Administrator',
      'Tax Director',
      'SALT Senior',
      'SALT Tax Partner',
      'Transaction Tax Manager',
      'Senior Tax Associate',
      'Senior Tax Associate',
      'Corporate Tax Associate',
    )),
    ("Marketing", (
      'Marketing Assistant',
      'Healthcare & Life Sciences Marketing Manager',
      'Digital Marketing Specialist',
      'Marketing Representative',
      'Insights Manager',
      'Researcher',
      'Marketing Editor',
      'Digital Experience Management',
      'Bridge Discipline Leader',
      'Marketing Coordinator',
      'Content Manager',
      'Marketing Vice President',
      'Marketing Senior Associate',
      'Digital Marketing Director',
      'Healthcare Content Manager',
      'Content Strategist',
      'Public Relations and Paid Media Director',
      'CRM Marketing Manager',
      'Marketing Director',
      'Digital Workplace Manger',
    )),
    ("Designer", (
      'Instructional Designer',
      'Motion Graphics Designer',
      'Design Director',
      'Senior UI Designer',
      'Senior Designer',
      'Motion Graphics Animator',
      'Motion Graphics Specialist',
      'Motion Graphics Manager',
      'Motion Graphics Assistant',
      'Senior Product Designer',
      'UX/UI Designer',
      'Product Designer',
      'Product',
      'Design',
      'Graphic Designer',
      'Marketing Coordinator/Graphic Designer',
    )),
    ("Accountant", (
      'Bookkeeper',
      'Senior Staff Accountant',
      'Staff Accountant',
      'CFO Advisory Services Consultant',
      'Bookkeeper',
      'Senior Staff Accountant',
      'Accountant',
      'Accounting Advisory Services Manager',
      'Billing Analyst',
      'Forensic Accountant',
      'Account Receivable Analyst',
      'Private Equity/Hedge Fund Accountant',
      'Asscoaite Accountant',
      'Accounting and Finance Manager',
      'Reconcilation Splt 2',
      'Accounts Recivable Collections Specialist',
      'Accounting Clerk',
      'Accounting Specialist',
      'Direct Bill Staff Accountant',
      'Account Recievable Clerk',
      'Accounting Manager',
      'Accounting',
    )),
    ("Administrator", (
      'Administrative Assistant',
      'Receptionist',
      'Administrative Assistant',
      'Receptionist',
      'Office Services Clerk',
      'Assistant Vice President',
      'Corporate Administrator',
      'Accounts Payable Clerk',
      'Finance & Administration Specialist',
      'Vice President of Product',
      'Administrative Professional',
      'Office Services',
      'Vice President',
      'Office Manager',
      'Office Services Assistant',
      'Administration',
      'Executive Assitant',
      'Recruitement Administrator',
      'Office Services',
      'Data Entry Clerk',
      'Account Administrator',
      'Finance & Actuarial Information Assitant',
      'Sales Administration Specialist',
      'Front Desk Administrator',
    )),
    ("Consultant", (
      'Staff Consultant',
      'Business Info Systems Consultant',
      'Nurse Clinical Consultant',
      'Nonprofit Advisory Services Consultant',
      'Financial Services Internal Audit Consultant',
      'Financial Crime Model Validation Consultant',
      'Strategic Sourcing Consultant',
      'Financial Crime Consultant',
      'Management Consultant',
      'Consultant',
      'EECI Auctions',
      'Admin. Health Outcomes Research Consultant',
      'Financial Consultant',
      'Wealth Consultant',
      'Associate Actuarial Consultant',
      'Experience Design Consultant',
      'Operations Consultant',
      'Managing Logistics & Operations Consultant',
      'Blast Consultant',
      'Independent Consultant',
      'Strategic Shareholder Advisory',
      'Transportation & Infrastructure Advisory',
      'Shareholder Activism Advisory',
      'Debt Advisory',
      'Healthcare Advisory',
      'Activism Defense Advisory',
      'Insurance Advisory',
      'Restructuring Advisory',
      'Strategy Management Consultant',
      'Solutions Consultant',
      'Inventory Field Consultant',
    )),
    ("Risk/ Loan/ Fraud", (
      'Risk & Compliance Consultant',
      'Business Risk Services',
      'Loan Review Consultant',
      'Loan Review Manager',
      'Risk & Compliance Data Analytics Management Consultant',
      'Information Security Manager',
      'Investigator',
      'Investigations & Disputes Director',
      'Investment and Market Risk Asscoaite',
      'Senior Quantitative Risk',
    )),
    ("Transaction", (
      'Transactional Advisory Services Associate',
      'Transaction Advisory Services Associate',
      'Transaction Advisory Services Manager',
      'Transaction Advisory Services Director',
      'Transactional Advisory Services Manager',
      'Transaction Advisory Manager',
      'Transaction Advisory Supervisor',
      'Transaction Services Manager',
    )),
    ("Project", (
      'Project Manager',
      'Research Leader',
      'Project Commercial Manager',
    )),
    ("Sales", (
      'Healthcare Revenue Cycle Associate',
      'Oracle Regional Sales Director',
      'Inside Sales Representative',
      'Quality Control Manager',
      'Quality Control Director',
      'Sales Representative',
      'Assistant Account Executive',
      'Account Executive',
      'Professional Liability Producer',
      'Associate Account Executive',
      'Client Service Leader',
      'Client Partner',
      'Consumer Goods & Services Manager',
      'Client Solutions Associate',
      'Consumer Public Relations',
      'Portfolio Management',
      'Sales Executive',
      'Salesforce Developer',
      'Head of Public Equities Events',
      'Equity Research',
      'Equity Capital Markets',
      'Client Service Specialist',
      'Account Specialist',
      'Account Management',
      'Client Services',
      'Client Service Associate',
      'Client Services Representative',
      'Product Sales Person',
      'Sales Manager',
      'Sales Support Specialist',
      'Inside Sales Associate',
      'Employee Benefits Sales',
      'Sales Assitant',
      'Salesperson/Associate Broker',
      'Junior Broker',
      'Broker Account Manager',
    )),
    ("Investment", (
      'Alternative Investment Practice Manager',
      'Alternative Investment Practice Supervisor',
      'Investment Principal',
      'Credit Trader',
      'Investment',
      'Investment Operations',
      'Private Banking Credit Officer',
      'Funding Specialist',
      'Wealth Associate',
      'Finance Investment Associate',
    )),
    ("Actuary", (
      'Medicare Advantage Actuary',
      'Medicare Advantage Consulting Actuary',
      'Retirement Plan Actuary',
      'Actuarial Analyst',
      'Actuary',
    )),
  )

  for label, titles in groups:
    if position in titles:
      return label

  # Unknown title: emit it in copy-paste form for the groups above.
  # str() keeps this branch from raising TypeError when the value is not a
  # string (e.g. a float NaN coming from a missing cell).
  print("\"" + str(position) + "\",")
  return False

## Group Skill

In [0]:
def group_skill(s):
  """Return the skill category that the skill name `s` belongs to.

  `s` is compared by exact equality (case-sensitive) against the category
  tables below, which are consulted from top to bottom; the label of the
  first table containing `s` is returned. If no table contains `s` the
  function returns False (nothing is printed).

  Notes on the tables:
    * 'Oracle' is listed under both "Oracle" and "Development Technology";
      because "Oracle" is consulted first, that is the label it gets.
    * The "Data Related" table is the general data skills followed by the
      two big-data entries.
    * The "None" table holds the string 'nan', i.e. the text a missing
      value turns into once it has been passed through str().
  """
  categories = (
    ("Oracle", (
      'Oracle',
      'Oracle Analytics',
      'Oracle ERP',
    )),
    ("Software/Technology", (
      'AR Software',
      'ATS',
      'Abila',
      'Android',
      'Aurora',
      'Beam',
      'autoCAD',
      'CRM',
      'Cisco Fire Power',
      'Cisco ISE',
      'Confluence',
      'DNS Protocal',
      'DRG',
      'Dynatrace',
      'EHR',
      'EMR',
      'Egnyte',
      'FireEye HX',
      'Google Suite',
      'Hadoop',
      'IOS',
      'IT System',
      'InVision',
      'JAMF',
      'SAP',
      'SAS',
      'Linux/Unix',
      'MDM',
      'MSE',
      'Qlikview',
      'Quickbooks',
      'Salesforce',
      'Sketch',
      'Social Studio',
      'Symantec EP',
      'Tableau',
      'Totango',
      'Zendesk',
      'Magento',
      'Mac',
    )),
    ("Programming Language/Framework", (
      '.NET',
      'Angular.js',
      'BASH',
      'C',
      'C#',
      'C++',
      'COBOL',
      'CSS',
      'Electron',
      'Flask',
      'GoLang',
      'GraphQL',
      'Gulp',
      'HTML',
      'Hugo',
      'JavaScript',
      'JSL',
      'JSON',
      'Java',
      'Kotlin',
      'MatLab',
      'MySQL',
      'NoSQL',
      'Next.js',
      'Node.js',
      'NumPy',
      'Objective C',
      'PHP',
      'Pearl',
      'Perl',
      'Python',
      'R',
      'Pandas',
      'Rails',
      'React',
      'Ruby',
      'SQL',
      'Scala',
      'Spring',
      'Swift',
      'TensorFlow',
      'TypeScript',
      'Vue.js',
      'openCL',
      'jQuery',
      'XML',
      'PyTorch',
      'PySpark',
    )),
    ("Data Related", (
      # general data skills
      'Chartio',
      'D3',
      'Data Analysis',
      'Data Entry',
      'Data Management',
      'Data Mining',
      'Data Structure',
      'Data Visualization',
      'Database',
      'Dataflow',
      'Django',
      'ETL',
      'GGplot',
      'Grafana',
      'Spark',
      # big-data skills
      'BigQuery',
      'Big Data',
    )),
    ("Development Technology", (
      'AJAX',
      'Agile',
      'Automation Testing',
      'BDD',
      'BIM',
      'Babel',
      'Bootstrap',
      'CDNs',
      'Chrome developer tools',
      'Complexity Analysis',
      'Debugging',
      'Distributed System',
      'Docker',
      'ESLint',
      'Fabric',
      'Flowtype',
      'Flux',
      'Front End',
      'Framework',
      'Full Stack',
      'Gatsby',
      'Git/Github',
      'Graphite',
      'JIRA',
      'JVM',
      'Jenkins',
      'Lean',
      'Lerna',
      'MEAN',
      'NPM',
      'OOP',
      'Network',
      'Open Source',
      'Packer',
      'Parquet',
      'Prettier',
      'Process Mining',
      'QA',
      'REST',
      'Runtime Optimization',
      'Jupyter Notebooks',
      'Kafka',
      'Kanban',
      'Kubernetes',
      'MVC',
      'MongoDB',
      'Neo4j',
      'Oracle',
      'Postgres',
      'Redis',
      'Redux',
      'SDKs',
      'Server Rendering/Language',
      'Software Design',
      'Software Licensing',
      'TDD',
      'Terraform',
      'UI',
      'UX',
      'Visual Studio',
      'web security',
      'image optimization',
      'Yarn',
      'Windows',
      'WAN',
      'WLAN and VPN',
      'Web Analytics',
      'Web Programming',
      'WebEx',
      'WebGL',
      'Webflow',
      'Webpack',
      'Scrum',
      'Machine Learning',
      'Heroku',
    )),
    ("Cloud Computing", (
      'AWS',
      'Azure',
      'Beanstalk',
      'CloudForm',
      'CloudWatch',
      'Elasticsearch',
      'GCP',
      'Lambda',
      'Redshift',
      'S3',
      'Snowflake',
    )),
    ("Microsoft Office", (
      'Access',
      'Excel',
      'Microsoft Office',
      'Microsoft Power BI',
      'Office 365',
      'Outlook',
      'PowerPoint',
      'Publisher',
      'SharePoint',
      'Word',
    )),
    ("Adobe Suite", (
      'Adobe',
      'After Effects',
      'Illustrator',
      'InDesign',
      'Photoshop',
    )),
    ("Soft Skill", (
      'Ambitious',
      'Analytical',
      'Bilingual',
      'Client-oriented',
      'Communication',
      'Compassion',
      'Critical Thinking',
      'Curiosity',
      'Customer Service',
      'Dependable',
      'Desire',
      'Detail-Oriented',
      'Determined',
      'Driven',
      'Event Planning',
      'Follow through',
      'Go to meeting',
      'Interpersonal',
      'Leadership',
      'Management',
      'Marketing',
      'Meeting Prep',
      'Microservices',
      'Motivated',
      'Multitask',
      'Negotiation',
      'Networking',
      'Optimism',
      'Organized',
      'Outreach',
      'Passion',
      'Planning',
      'Presentation',
      'Problem Solving',
      'Product Management',
      'Project Management',
      'Public Speaking',
      'Quantitative',
      'Reading',
      'Reliable',
      'Resilient',
      'Responsible',
      'Result-oriented',
      'responsiveness',
      'Sales',
      'Self Motivated',
      'Socially Skilled',
      'Strategic thinking',
      'Supervise',
      'Teamwork',
      'Time Management',
    )),
    ("Business Skill", (
      'SaaS',
      'B2B',
      'Billing',
      'Corporate Finance',
      'Credit',
      'Debit/Credit',
      'Demand Generation',
      'Digital Marketing',
      'Equities',
      'Commodities',
      'FX',
      'Financial Markets',
      'Repos',
      'Risk Management',
      'SEO',
      'SOX Audits',
      'SaltStack',
      'Statistic',
      'Trade Skills',
      'Upsell products',
      'prior sales experience',
      'office work',
      'knowledge of markets and investments',
      'knowledge of purchase process',
      'knowledge of real estate market',
      'Financial Accounting',
      'BASIS',
    )),
    ("Miscellaneous", (
      'Algorithms',
      'Linkedin',
      'BenPlus',
      'Building Code',
      'Catchpoint',
      'Cl',
      'Construction',
      'Extensively walking',
      'Graphic Design',
      'Healthcare',
      'None',
      'SGS',
      'Social Media',
      'Sports',
      'Swim',
      'Tech savvy',
      'Typing',
      'Visiting',
    )),
    ("Health Professional Skill", (
      'CPT',
      'CT scan',
      'Clinical Research',
      'Massage',
      'Physical Therapy',
      'Medicine',
      'Radiology',
      'Rehab',
      'wound treatment',
      'nutrition assessments',
      'X-ray',
    )),
    ("Office Skill", (
      'E-mail',
      'Editing',
      'phone calls',
    )),
    ("None", (
      'nan',
    )),
    ("Research", (
      'Research',
    )),
  )

  for category, members in categories:
    if s in members:
      return category
  return False

# Grouping Job Title

## Grouping job title for Tech

In [0]:
# get_position(df, "Tech", True)

In [0]:
# # category lists used to group Tech-sector job titles (one list per category)
# sales = [
#     'Sales Development Associate',
#     'Sales',
#     'Customer Representative',
#     'CRM Analyst',
#     'Business Intelligence',
#     'Marketing',
#     'Sales Development Representative',
#     'Digital Marketing',
#     'Social Media Specialist',
#     'Senior Brand Designer',
#     'Growth Analyst',
#     'Sales Development',
#     'Customer Support',
#     'Service Desk Engineer',
#     'Associate Account Manager',
#     'Account Executive',
#     'Senior Presales Engineer',
#     'Customer Successs and Implementation Specialist',
#     'Technical Account Manager',
#     'Client Relations Specialist',
#     'Marketing Manager',
#     'Business Development',
#     'Demand Generation Specialist',
#     'Business Analyst',
#     'Business Strategy',
#     'Marketing Strategy',
#     'Business',
#     'Associate Account Executive',
#     'Demand Generation',
#     'Sales Executive',
#     'Sales Generation',
#     'Customer Success',
#     'Solutions Engineer',
#     'Sales Engineer',
#     'Business Development Representative',
#     'Customer Success Operation Manager',
#     'Sales Developement Representative',
#     'Customer Success Specialist',
#     'Client Care Associate',
#     "Sales Representative",
#     "Business Operations & Strategy Manager",
#     "Financial Planning Associate",
#     "Demand/MRP Planning Associate",
#     "Customer Support Rep.",
#     "Manager of Marketing",
#     "Sales Manager",
#     "Product Marketing Manager",
#     "Customer Success Associate",
#     "Enterprise Account Manager",
#     "Customer Success Manager",
#     "Sales Support Specialist",
#     "Sales Rep.",
#     "Business Insights Analyst",
#     "Community Manager",
#     "Business Dev. Associate",
#     "Enterprise B2B Account Exec.",
#     "Financial Operations Analyst",
#     "Lead SalesForce Dev.",
#     " Lead SalesForce Dev.",
# ]
# software_engineer = [
#     'Software Engineer',
#     'Associate Software Developer',
#     'Engineer: Storage Application',
#     'Software Developer',
#     'Fullstack Engineer',
#     'Product Engineer',
#     'Web Developer',
#     'Software Support & Developer',
#     'Software Engineer Trainee',
#     'Staff Applications Engineer',
#     'Python Software Engineer',
#     'Java Developer',
#     "Full Stack Developer",
#     ".Net Developer",
#     "C++ Developer",
#     "Core Java Software Developer",
#     "Full-Stack Engineer",
#     "Full Stack Engineer",
#     "Platform Software Engineer",
#     "BIM Applications Specialist",
#     "Fullstack Java Developer",
    
# ]
# data_scientist = [
#     'Data Scientist',
#     'Staff Data Scientist',
#     'Data Quality Analyst',
#     'Machine Learning Engineer',
#     'Collections Analyst',
#     'Data Engineer',
#     'Data Administrator',
#     'Data Analyst',
#     'Data Platform Engineer',
#     'Data Consultant',
#     'Research Analyst',
#     'Big Data Developer',
#     'Machine Learning Developer',
#     'Data Scientist ',
#     "Data Warehouse Developer",
#     "Senior Data Analyst",
#     "Head of Data",
#     "VP of Data Science",
# ]
# miscellaneous = [
#     'Reporter',
#     'Network Consultant',
#     'Site Lead',
#     'Consultant',
#     'General Counsel',
#     'Administrator',
#     'Legal Analyst',
#     'Operation Manager',
#     'Intern',
#     'Analyst',
#     'Videographers',
#     'PR',
#     'Communication Associate',
#     'Job Captain',
#     'Resident Engineer',
#     'Survey Technicians',
#     'Assistant Engineer',
#     'Solution Consultant',
#     "None",
#     'Coordinator',
#     'Office Experience Coordinator',
#     'Billing Coodinator',
#     "Social Media Manager",
#     "People Business Partner",
#     "Events Coordinator",
#     "Brand Manager",
#     "Manager of Wedding Planning Tools",
#     "Merchandising Assistant",
#     "Counsel Associate",
#     "nan",
#     "Research Internship",
#     "Research Scientist",
#     "Internal Communications Director",
#     "Murex RISK Business Analyst",
#     "Murex Functional Analyst",
#     "Murex Configuration and Environment Management",
#     "Murex FO",
#     "Murex Consultant",
#     "Regional Operations Associate",
#     "Ticket Distribution Agent",
#     "Express Shopper",
#     "Outbound Dept. Manager",
#     " Corporate Controller",
#     "Paid Social Manager",
#     "Corporate Controller",
# ]
# devops = [
#     'DevOps Engineer',
#     'Infrastructure Developer',
#     'DevOps',
#     "Platform Engineer",
#     "Integration Engineer",
#     "Infrastructure Applications Specialist",
#     "Linux DevOps Engineer",
# ]
# accounting = [
#     'Accouting',
#     'Accountant',
#     'Revenue Accountant',
#     'Staff Accountant',
#     'Accounts Payable Clerk'
# ]
# hr = [
#     'Recruiter',
#     'Recruiting Coordinator',
#     'HR',
#     "Sales Recruiter",
#     "Architectural Recruiter",
# ]
# frontend = [
#     'Qualitative UX Researcher',
#     'Frontend Developer',
#     'Front End Engineer',
#     'Frontend Engineer',
#     'UX Designer',
#     'Graphic Designer',
#     'Visual Designer',
#     "Front End Developer",
#     "React Engineer",
#     "Front End Software Engineer",
#     "Head of User Experience",
# ]
# it = [
#     'IT Support',
#     'Contract Technician',
#     'Technician',
#     'Support Engineer',
#     'IT Support Specialist',
#     'Junior Technical Analyst',
#     "EDI Senior Specialist",
#     "Technical Support Engineer",
#     "App. Support Analyst",
#     "Telecom/Networking Director",
# ]
# product_manager = [
#     'Product Manager',
#     'Head of Product',
#     'Product Manager Machine Learning',
#     'Project Manager',
#     'Product Designer',
#     'Engineering Lead',
#     "Learning & Dev. Manager",
#     "Product Dev. Manager",
#     "Platform Product Director",
#     "Director of Product Dev.",
#     "Technical Product Manager",
#     "Self-Service Experiences Product Manager",
#     "Product Management Director",
#     "Content Platform Product Manager",
#     "Portfolio Manager",
# ]
# qa = [
#     'Quality Assurance Engineer',
#     'QA Engineer',
#     'QA',
#     'QA Automation Engineer',
#     "SDET Engineer",
#     "QA Analyst",
# ]
# security = [
#     'Senior Security Engineer',
#     'Security Engineer',
#     'Information Security Specialist',
#     "Network Security Engineer",
#     "Cyber Risk Analyst",
#     "Chief Information Security Officer",
# ]
# mobile_engineer = [
#     'Mobile Engineer',
#     'IOS Developer',
#     'Mobile Software Engineer',
#     'IOS Engineer',
#     'Android Developer',
# ]
# backend = [
#     'Ruby Developer- Back End',
#     'Backend Engineer',
#     "Platform Reliability Engineer",
#     "Backend Engineering Manager",
#     "Back End Software Developer",
#     "Back End Software Engineer",
#     "Backend Software Engineer",
    
# ]
# electrical_engineer = [
#     'Electrical Engineer',
#     'Project/Senior Electrical Engineer',
#     'Entry Level Electrical Engineer',
# ]
# architect = [
#     'Interior Architecture and Design',
#     'Architect',
#     'Urban Planner',
# ]
# mechanical = [
#     'Mechanical Engineer',
# ]
# civil = [
#     'Civil Engineer',
#     'Geotechnical Engineer',
#     "Construction Engineer",
# ]
# structural = [
#     'Structural Engineer',
#     'Junior Bridge Engineer',
#     'Junior Structural Engineer',
# ]

In [0]:
# # Disabled cell: map every raw Tech job title onto one coarse category label.
# # Each title returned by get_position(df, "Tech", False) is tested against the
# # category lists in the order of the if/elif chain below; the first list that
# # contains the title decides the label, so a title listed in two categories
# # always gets the earlier one.
# # NOTE(review): the lists (`sales`, `devops`, `hr`, `it`, ...) are plain
# # notebook-level variables assigned in earlier cells. `hr`, `it` and
# # `miscellaneous` are reassigned by the Health/Law/Business list cells further
# # down, so re-run the Tech list cell right before enabling this loop.
# unclean_position = get_position(df, "Tech", False)
# clean_position = []
# for position in unclean_position:
#   if position in sales:
#     clean_position.append("Sales")
#   elif position in software_engineer:
#     clean_position.append("Software Engineer")
#   elif position in data_scientist:
#     clean_position.append("Data Scientist")
#   elif position in miscellaneous:
#     clean_position.append("Miscellaneous")
#   elif position in devops:
#     clean_position.append("DevOps")
#   elif position in accounting:
#     clean_position.append("Accounting")
#   elif position in hr:
#     clean_position.append("HR")
#   elif position in frontend:
#     clean_position.append("Frontend")
#   elif position in it:
#     clean_position.append("IT")
#   elif position in product_manager:
#     clean_position.append("Product Manager")
#   elif position in qa:
#     clean_position.append("Quality Assurance")
#   elif position in security:
#     clean_position.append("Security")
#   elif position in mobile_engineer:
#     clean_position.append("Mobile Engineer")
#   elif position in backend:
#     clean_position.append("Backend Engineer")
#   elif position in electrical_engineer:
#     clean_position.append("Electrical Engineer")
#   elif position in architect:
#     clean_position.append("Architecture")
#   elif position in mechanical:
#     clean_position.append("Mechanical Engineer")
#   elif position in civil:
#     clean_position.append("Civil Engineer")
#   elif position in structural:
#     clean_position.append("Structural Engineer")
#   else:
#     clean_position.append("Unclassified")
#     # The index is taken after the append, so it is the 1-based position of
#     # the unmatched title within unclean_position.
#     print("Unclassified data: \"" + str(position) + "\", index at: " + str(len(clean_position)))

In [0]:
#  series_position = pd.Series(clean_position)
#  series_position.value_counts()

In [0]:
# series_position.value_counts().plot.bar()

## Group Job Title for Health



In [0]:
# get_position(df, "Health", True)

In [0]:
# admin = [
#     'Member Services Administrator',
#     'Office Support',
#     'Registrar',
#     'Front Desk',
#     'Admission Clerk',
#     'Administrative Assistant', 
#     'General Counsel',
#     "Health Information Clerk",
#     'Office Assistant',
#     'Unit Clerk',
#     'Program Supervisor',
#     'Medical Office Front Desk',
#     "Member Services Administrator"
# ]
# marketing = [
#     'Digital Marketing Associate',
#     'Marketing Director',
#     'Media Relation',
#     "Customer Success Associate",
#     "Marketing",
#     'Product Marketing Coordinator',
#     'Product Marketing Manager'
# ]
# hr = [
#     'Human Resources Coordinator',
#     'Payroll Manager',
#     "HR",
#     'Payroll & Benefit Specialist',
#     "Human Resources Coordinator",
#     "Payroll Manager"
# ]
# it = [
#     'Technician',
#     'IT',
#     'IT Technician',
#     'IT Service',
#     'Technical Support',
#     'Technical Support Engineer',
#     'IT support',
#     "IT Engineer",
#     'IT Service',
#     'Technical Support',
#     'IT Operations Analyst',
#     'Finance IT Director',
#     'Corporate IT Engineer',
#     "Computer Information System",
#     "Information System Analyst"
# ]
# # Raw Health job titles that the Health classification loop labels "Coordinator".
# # NOTE(review): 'Outreach' is also listed in `social_work`; the loop tests
# # `coordinator` before `social_work`, so 'Outreach' is always labelled
# # "Coordinator".
# # NOTE(review): 'Studio Coodinator' is listed twice (harmless for `in` tests).
# # The misspelling presumably mirrors the raw data, since entries must match
# # the titles exactly -- confirm against the CSV before correcting it.
# coordinator = [ 
#     'Care Coordinator',
#     'Studio Coodinator',
#     'Coordinator', 
#     'Outreach',
#     'Studio Coordinator',
#     'Weekend Opener',
#     "Studio Coodinator",
#     "Community Relations Coordinator"
# ]
# miscellaneous = [
#     'Swim Instructor',
#     'None', 
#     'Graphic Designer',
#     'Residental Counselor',
#     'Intern',
#     'Work Study Associate',
#     "Swim Instructor",
#     "Graphic Designer"
# ]
# health_pro = [
#     'Patient Care Associate',
#     'Physician Assistant',
#     'Medical Assistant',
#     'Physical Assistant',
#     'Pharmacist',
#     'Dietition',
#     "Health Fellowship",
#     'Medical Officer',
#     'Child Focused Adaption Services Recruiter',
#     'Hospitalist',
#     'Nutritionist',
#     'Nursing Assistant',
#     'Caregiver',
#     '305 PWR/FLX Instructor',
#     'Instructor',
#     'Overnight Custodian'
# ]
# representative = [ 
#     'Patient Access Representative',
#     'Customer Service Representative', 
#     'Customer Relation',
#     'Customer Representative', 
#     'Sales Development Representative',
#     'Customer Support Representative',
#     'Sales Development Rep',
#     'Customer Care',
#     'Customer Relation',
#     'Patient Representative',
#     'Sales',
#     'Overnight Member Services Rep.',
#     "Member Services Rep."
    
# ]
# accountant = [
#     'Junior Accountant'
# ]
# therapist = [
#     'Massage Therapist',
#     'Occupational Therapy Assistant', 
#     'Occupational Therapist',
#     'Physical Therapist'
# ]
# executive = [
#     'Project Manager',
#     'Secretary',
#     'Vendor Management',
#     'Client Success Manager',
#     'Account Manager',
#     'Strategic Account Management',
#     'Office Manager',
#     'Revenue Management',
#     'Medical Secretary',
#     'Development Manager',
#     'Case Manager',
#     'Assistant Studio Manager',
#     'Management Team',
#     'Global Infrastructure Director',
#     'Customer Acquisition VP',
#     'Global Network Services Director',
#     'Director of Perioperative Services',
#     'Operating Room Nurse Manager',
#     "Assistant Director of Marketing and Outreach"
# ]
# programmer = [
#     'Android Developer',
#     'DevOps',
#     'Web Developer',
#     'Software Engineer',
#     "Fullstack Engineer",
#     "Tech Fellowship",
#     'DevOps Engineer',
#     'Cloud Operations Engineer',
#     'Integrations Developer',
#     'Client Platform Engineer'
# ]
# # Raw Health job titles that the Health classification loop labels "Social Worker".
# # NOTE(review): 'Outreach' is also listed in `coordinator`, which the loop
# # tests before this list, so the 'Outreach' entry here never takes effect.
# # Remove it from one of the two lists to make the intended group explicit.
# social_work = [ 
#     'Social Worker',
#     'Educator',
#     'Outreach',
#     'Social Work Coordinator'
# ]
# health_tech = [
#     'Lab Techonologist',
#     'Radiology',
#     'Medical Technologist', 
#     'Biomedical Engineer',
#     'Mechanical Inspector',
#     'Mechanic',
#     'Lab Supervisor',
#     'Histotechnologist'
# ]
# doctor = [
#     'Gastroenterologist',
#     'Anesthesiologist',
#     'Pediatrics',
#     'Neuropsychology',
#     'Dermatologist',
#     'Rheumatologist',
#     'Psychiatrist',
#     'Attending Physician',
#     'Physician',
#     'Physicians',
#     'Neorosurgeon',
#     'Urologist'
# ]
# nurse = [
#     'Nurse',
#     'Nurse Practitioner',
#     'Practitioner Nurse',
#     'Office Nurse'
# ]
# food_worker = [
#     'Food Service Worker',
#     'Cook'
# ]
# data = [
#     'Data Analyst',
#     'Data Scientist',
#     'Customer Data Specialist',
#     'Marketing Data Analyst',
#     'Data Engineer',
#     "Advanced Analyst",
#     "Data Visualization Engineer",
#     'Data Analyst',
#     'Health Information Assistant',
#     'Data Sci. Project Manager',
#     "Provider Information Specialist",
#     "Senior Data Scientist"
# ]
# business = [
#     'Business Associate',
#     'Development Operations',
#     'Financial Counselor', 
#     'Business Development',
#     "Business Fellowship",
#     'Business Associate',
#     'Development Operations',
#     'Financial Counselor',
#     'Media Operations Analyst',
#     "Business Analyst"
# ]

In [0]:
# # Disabled cell: map every raw Health job title onto one coarse category label.
# # Titles from get_position(df, "Health", False) are tested against the Health
# # category lists in the order of the if/elif chain; the first matching list
# # decides the label, and titles in no list become "Unclassified" and are printed.
# # NOTE(review): `hr`, `it` and `miscellaneous` share their names with the Tech
# # lists, so the Health list cell must be run right before this loop.
# # NOTE(review): the label "health professional" is lower-case while every
# # other label is title-cased; it will show up that way in value_counts().
# unclean_position = get_position(df, "Health", False)
# clean_position = []
# for position in unclean_position:
#   if position in admin:
#     clean_position.append("Administrator")
#   elif position in marketing:
#     clean_position.append("Marketing")
#   elif position in hr:
#     clean_position.append("Human Resources")
#   elif position in it:
#     clean_position.append("IT")
#   elif position in coordinator:
#     clean_position.append("Coordinator")
#   elif position in miscellaneous:
#     clean_position.append("Miscellaneous")
#   elif position in health_pro:
#     clean_position.append("health professional")
#   elif position in representative:
#     clean_position.append("Customer Representative")
#   elif position in accountant:
#     clean_position.append("Accountant")
#   elif position in therapist:
#     clean_position.append("Therapist")
#   elif position in executive:
#     clean_position.append("Executive")
#   elif position in programmer:
#     clean_position.append("Programmer")
#   elif position in social_work:
#     clean_position.append("Social Worker")
#   elif position in health_tech:
#     clean_position.append("Health Tech")
#   elif position in doctor:
#     clean_position.append("Doctor")
#   elif position in nurse:
#     clean_position.append("Nurse")
#   elif position in food_worker:
#     clean_position.append("Food Service Worker")
#   elif position in data:
#     clean_position.append("Data Scientist")
#   elif position in business:
#     clean_position.append("Business")
#   else:
#     clean_position.append("Unclassified")
#     # Index is read after the append, i.e. the 1-based position of the title.
#     print("Unclassified data: \"" + str(position) + "\", index at: " + str(len(clean_position)))
    
# #pd.Series(clean_position).value_counts()

## Grouping job title for Law

In [0]:
# get_position(df, "Law", True)

In [0]:
# paralegal = [
#     'Intellectual Property Paralegal',
#     'Corporate Paralegal',
#     'Capital Markets Paralegal',
#     'Bankruptcy Paralegal',
#     'Paralegal',
#     'Real Estate Paralegal',
#     'Trusts & Estates Paralegal.',
#     'Paralegal.',
# ]
# counsel = [
#     'Corporate M&A Associate/Counsel',
#     'Insurance Knowledge Management Lawyer',
#     'Investment Funds Knowledge Management Lawyer',
#     'Attorney',
#     'Contract Attorney',
#     'Lawyer',
#     'Attorney.',
#     'Knowledge Management Lawyer',
# ]
# # Raw Law job titles that the Law classification loop labels "Human Resources".
# # NOTE(review): 'Practice Development Manager' is listed here, but the same
# # title with a trailing period ('Practice Development Manager.') is listed in
# # `miscellaneous`, so the two spellings end up in different groups.
# # NOTE(review): 'Attorney Development Assistant, Manager, Specialist' contains
# # ", ". If get_position splits titles on ", " the way get_major/get_skill do,
# # this entry can never match -- confirm against get_position.
# hr = [
#     'Employee Benefits Associate',
#     'Payroll Specialist/Coordinator',
#     'Recruiting Assistant',
#     'Lateral Partner Recruiting Manager',
#     'Talent Acquisition Sepcialist',
#     'Legal Recruiting Assistant',
#     'Attorney Development Manager',
#     'Professional Development Assistant',
#     'HR Manager',
#     'Global Benefits Generalist',
#     'HR Business Partner',
#     'Director of Global Retirement Benefits',
#     'Talent Acquisition & Development Coordinator',
#     'HR Coordinator/Director',
#     'Employee Benefits Practice Manager',
#     'Attorney Development Assistant, Manager, Specialist',
#     'Practice Development Manager',
#     'Technical Trainer',
#     "HR Recruiting Assistant",
# ]
# # Raw Law job titles that the Law classification loop labels "Coordinator".
# # Several titles are listed both with and without a trailing period because
# # list membership is an exact string match.
# # NOTE(review): 'Billing Coordinator.' is listed here, but 'Billing Coordinator'
# # (no period) is listed in `business`, so the two spellings are labelled
# # "Coordinator" and "Business Related" respectively.
# # NOTE(review): 'Lateral., Practice Recruitment & Professional Development
# # Coordinator' contains ", "; if get_position splits titles on ", " this entry
# # can never match -- confirm against get_position.
# coordinator = [
#     'Senior Billing Coordinator',
#     'CLE Coordinator',
#     'Conference Services Assistant',
#     'Collections Coordinator.',
#     'Electronic Resources Coordinator',
#     'Electronic Resources Manager',
#     'Public Relations & Communications Coordinator',
#     'Litigation Business Development Coordinator',
#     'Events Manager',
#     'Marketing Coordinator',
#     'Lateral., Practice Recruitment & Professional Development Coordinator',
#     'Legal Recruiting Coordinator',
#     'Business Development Coordinator.',
#     'Client Accounting Coordinator',
#     'Media Relations Coordinator',
#     'E-Billing and Inventory Data Coordinator',
#     'E-Billing Coordinator',
#     'Directories Coordinator',
#     'HR Coordinator',
#     'Billing Coordinator.',
#     'Legal Recruiting Coordinator.',
#     'Litigation Services Coordinator',
#     'Global Attorney Development Coordinator',
#     'Paralegal Services Coordinator',
#     'Collections Coordinator',
#     'Administrative Coordinator',
#     'Pricing & Billing Rate Coordinator',
# ]
# miscellaneous = [
#     'Mid-Level Securities Associate',
#     'Media, Technology & Commerical Transactions Associate',
#     'Lateral',
#     'User Support Specialist',
#     'Librarian',
#     'Supervisor of Word Processing',
#     'Security Officer',
#     'Conflicts Analyst',
#     'General Lateral Applications',
#     'None',
#     'US Brand, Creative & Digital Graphic Designer',
#     'Client Services Manager',
#     'Knowledge Manager',
#     'Media Relations & Communications Manager.',
#     'Director of Corporate Practice',
#     'Manager of Secretarial Services',
#     'Research Analyst.',
#     'Practice Development Manager.',
#     'HRIS Analyst',
#     'Litigation Analyst',
#     'Junior Conflict Analyst',
#     'Knowledge & Digital Services Librarian',
#     'Global Pitch & Proposal Panel/RFP Advisor',
#     'Research Librarian',
#     'Temporary Graphic Designer',
#     'Client Development Advisor',
#     'Manager of Global Attorney Training & Mentoring',
#     'Client Service Specialist',
#     'Department Assistant',
#     'Legal Project Manager',
#     'New Business Proposal Manager',
#     'Diversity & Inclusion Manager',
#     'Corporate Practice Manager',
#     'HRIS Reporting Analyst',
#     'MicroStrategy Product Lead',
#     'Public Relations Manager, Specialist',
#     'Project Management Specialist',
#     'Litigation Support Project Manager',
#     'Proofreader',
#     'Patent Agent',
#     'Practice Manager',
#     'Proposal Writer.',
#     'Litigation Support.',
#     'Lateral.',
#     'Litigation Business Development.',
# ]
# admin = [
#     'SharePoint Administrator',
#     'Judicial Clerkship',
#     'Application Administrator',
#     'Receptionist',
#     'Facilities Manager',
#     'Work Allocation Manager',
#     'Office Services Clerk',
#     'Administrative Assistant',
#     'Collection Management Specialist',
#     'General Services Clerk',
#     'Assistant Managing Clerk',
#     'Managing Clerk',
#     'Catering Assistant',
#     'Copy Center Operator',
#     'Document Imaging/Records Clerk.',
#     'Conference Center Administrative Assistant',
#     'Records Manager',
#     'Administrative Supervisor',
#     'Administrative Assistant.',
#     'Firm Operations Manager',
#     'Document Processing Manager',
#     'Administrative Staff.',
#     'Business Development Manager, Specialist, Assistant',
#     'Duplicating Operator',
#     'Word Processing Operator/Desktop Publishing Specialist.',
#     'Secretary.',
#     'Office Services Specialist',
#     'Executive Assistant',
#     'Staff.',
#     'Professional Staff.'
# ]
# tech = [
#     'Enterprise Applications Engineer',
#     'Director of Global IT Operations',
#     'SQL Database Administrator',
#     'Audio/Visual Specialist.',
#     'Desktop Engineer.',
#     'Desktop Support Analyst',
#     'End User Systems Engineer',
#     'Applications Engineer',
#     'Sharepoint Architect/Senior Developer',
#     'Server/Storage Engineer',
#     'Business Systems Engineer',
#     'Database Administrator',
#     'Digital Services Specialist',
#     'IT Practice Support Specialist',
#     'IT Support Technician',
#     'Storage & Virtualization Manager',
#     'Project Specialist, Data Science, Analysis & Investigation.',
#     'Systems Analyst',
#     'Systems Solutions Analyst',
#     'Project Manager',
#     'Network Engineer',
#     'Assistant Motion Graphics & Digital Designer',
#     'Information Governance Assistant',
#     'Information Security Analyst',
#     'Library Technical Services',
#     'SharePoint Application Developer',
# ]
# business = [
#     'Financial Analyst.',
#     'Manager & Assistant Manager of Business Development',
#     'Procurement Specialist',
#     'M&A Financial Advisory Associate',
#     'Tax Associate',
#     'Business Development Manager/Specialist',
#     'Billing Specialist',
#     'International Trade Analyst.',
#     'Capital Markets Practice Manager',
#     'Manager of Financial Services Business Intelligence',
#     'Professional Development Manager',
#     'Business Development Analyst',
#     'Business Development Specialist/Manager',
#     'Business Development Specialist',
#     'Senior Pricing Analyst',
#     'Financial Systems Analyst.',
#     'Billing Supervisor',
#     'Business Development Specialist & Assistant',
#     'Practice Business Development Manager',
#     'Financial Accounting Supervisor',
#     'Billing Sepcialist',
#     'Marketing& Business Development Specialist.',
#     'Treasury Analyst',
#     'Sourcing & Procurement Specialist',
#     'Pricing Strategist',
#     'Internal Auditor',
#     'Accountant',
#     'Partner Accounting Supervisor',
#     'Manager of Business Continuity',
#     'Junior Pricing Financial Analyst',
#     'Financial Analyst',
#     'Finance Systems Analyst',
#     'Finance Project Manager',
#     'Client Operations Supervisor',
#     'Client Development Intern',
#     'Business Intelligence Analyst',
#     'Business Development Manager/Coordinator',
#     'Billing Administration Assistant',
#     'Business Intelligence Developer',
#     'Client Development Specialist',
#     'Pricing Analyst, Manager',
#     'Billing Coordinator',
#     'Market Intelligence Analyst.',
#     'Marketing & Business Development Manager/Director',
# ]
# intern = [
#     'Summer Associate',
#     'Summer Associate, People Advisory Manager',
#     'Fellowship.',
#     'Fellowship',
#     'Pro Bono Intern',
#     'IT Intern',
#     'HR Intern',
#     'Attorney Recruiting & Development Intern',
# ]
# associate = [
#     'Associate.',
#     'Legal Assistant.',
#     'Associate',
#     'Legal Secretary',
#     'Law Clerk',
#     'Judicial Clerk',
#     'Asset Management Legal Assistant',
#     'Practice Assistant.',
#     'Corporate Legal Assistant',
#     'Litigation Legal Assistant',
#     'Practice Assistant',
#     'Attorney Support Assistant',
#     'Case Assistant',
#     'Legal Executive Assistant',
#     'Practice & Office Development Assistant',
#     'Legal Assistant',
# ]

In [0]:
# # Disabled cell: map every raw Law job title onto one coarse category label.
# # The first (doubly commented) snippet prints the first title that belongs to
# # none of the Law lists; the loop below it builds `clean`, one label per title,
# # using the first list in the if/elif chain that contains the title.
# # catch ungroup
# # unclean = get_position(df,"Law", False)
# # for position in unclean:
# #   if position not in paralegal + associate + intern + business + tech + admin + miscellaneous + coordinator + hr + counsel:
# #     print(position)
# #     break

# # grouping and print out if not being grouped
# unclean = get_position(df, "Law", False)
# clean = []
# for position in unclean:
#   if position in paralegal:
#     clean.append("Paralegal")
#   elif position in associate:
#     clean.append("Associate/Assistant")
#   elif position in intern:
#     clean.append("Intern position")
#   elif position in business:
#     clean.append("Business Related")
#   elif position in tech:
#     clean.append("Tech Related")
#   elif position in admin:
#     clean.append("Administrator")
#   elif position in miscellaneous:
#     clean.append("Miscellaneous")
#   elif position in coordinator:
#     clean.append("Coordinator")
#   elif position in hr:
#     clean.append("Human Resources")
#   elif position in counsel:
#     clean.append("Counsel / Lawyer")
#   else:
#     # NOTE(review): BUG -- this cell accumulates into `clean`, but the two
#     # lines below append to and measure `clean_position`, which is only
#     # assigned by the Tech/Health classification cells. On a fresh kernel this
#     # branch raises NameError; after the Health cell has run it silently
#     # appends to that stale list, leaves `clean` one entry short and prints a
#     # wrong index. Replace `clean_position` with `clean` in both lines before
#     # re-enabling this cell.
#     clean_position.append("Unclassified")
#     print("Unclassified data: \"" + str(position) + "\", index at: " + str(len(clean_position)))

# # series_position = pd.Series(clean)
# # series_position.value_counts()

## Grouping job title for Business

In [0]:
# # Disabled cell: split the unique Business job titles into three consecutive
# # chunks. `three` is the floor of one third of the list length, so the first
# # two parts hold `three` titles each and the third part takes the rest,
# # including any remainder.
# unique_job_title_business = get_position(df, "Business", True)
# three = len(unique_job_title_business)//3
# first_part = unique_job_title_business[:three]
# second_part = unique_job_title_business[three:three+three]
# third_part = unique_job_title_business[three+three:]

In [0]:
# hr = [
#     'HR Manager',
#     'HR Generalist',
#     'HR Director',
#     'HR Associate',
#     'HR Assistant',
#     'Employee Benefits Producer',
#     'Information Resources Associate',
#     'HR Analyst',
#     'HR Associate',
#     'Principal',
#     'Strategic Staffing',
#     'HR Coordinator',
#     'Campaign Manager',
#     'Benefits Manager',
#     'Payroll Analyst',
#     'SEO Specialist',
#     'HR Generlist',
#     'HR',
#     'International Benefits',
#     'Employee Benefits',
#     'HRIS',
#     'Benefits',
#     'Talent Aquistion Specialist 1',
# ]
# marketing = [
#     'Marketing Assistant',
#     'Healthcare & Life Sciences Marketing Manager',
#     'Digital Marketing Specialist',
#     'Marketing Representative',
#     'Insights Manager',
#     'Researcher',
#     'Marketing Editor',
#     'Digital Experience Management',
#     'Bridge Discipline Leader',
#     'Marketing Coordinator',
#     'Content Manager',
#     'Marketing Vice President',
#     'Marketing Senior Associate',
#     'Digital Marketing Director',
#     'Healthcare Content Manager',
#     'Content Strategist',
#     'Public Relations and Paid Media Director',
#     'CRM Marketing Manager',
#     'Marketing Director',
#     'Digital Workplace Manger',
# ]
# tax = [
#     'Tax Manager',
#     'Tax Senior',
#     'International Tax Services',
#     'International Tax',
#     'Tax',
#     'Income Franchise SALT Managing Director',
#     'Core Tax Services Manager',
#     'SALT Associate',
#     'Transfer Pricing Tax Associate',
#     'Transfer Pricing Tax Manager',
#     'Tax Accountant',
#     'International Tax Staff Accountant',
#     'Tax Associate',
#     'Tax Partner',
#     'Financial Services Tax Staff',
#     'International',
#     'SALT Compliance Manager',
#     'Tax Accountant',
#     'International Tax Staff Accountant',
#     'Trust & Estates Tax Manager',
#     'International Tax Manager',
#     'SALT Manager',
#     'Tax Reporting & Advising Functional Optimization Manager',
#     'Tax Operations Administrator',
#     'Tax Director',
#     'SALT Senior',
#     'SALT Tax Partner',
#     'Transaction Tax Manager',
#     'Senior Tax Associate',
#     'Senior Tax Associate',
#     'Corporate Tax Associate',
# ]
# auditor = [
#     'Auditor',
#     'Audit Manager',
#     'China Practice Audit',
#     'Financial Services Audit Manager',
#     'Audit Associate',
#     'Audit Senior',
#     'IT Audit Manager',
#     'Audit Staff Accountant',
#     'Audit Accountant',
#     'China Practice Audit Accountant',
#     'Assurance Manager',
#     'Assurance Associate',
#     'Audit Associate',
#     'Audit Senior',
#     'IT Audit Manager',
#     'Audit Staff Accountant',
#     'Audit Accountant',
#     'China Practice Audit Accountant',
#     'Assurance Senior',
#     'Assurance Professional',
#     'IT Audit Services Associate',
#     'Assurance Director',
#     'Consulting Internal Auditor',
#     'Internal Audit Staff Analyst',
#     'Audit',
# ]
# recruit = [
#     'Recruiter',
#     'Campus Recruiting Assistant',
#     'Placement Specialist',
# ]
# miss = [
#     'Proofreading Assistant',
#     'Executive Compensation Senior Associate',
#     'Operations & Portfolio Strategy Associate',
#     'Strategic Sourcing & Contracts Director',
#     'Construction Delay Associate',
#     'Healthcare Advisory Manager',
#     'Nonprofit Associate',
#     'Managed Care Director',
#     'None',
#     'Controller/Director of Finance',
#     'Transfer Pricing Manager',
#     'Power BI Training Contractor',
#     'Managed Care Director',
#     'None',
#     'Cyber Security Specialist & Penetration Tester. Transaction Advisory Services Director',
#     'Administrative Controller',
#     'Construction Controller',
#     'Financial Proofreader',
#     'Advisory Supervisor',
#     'Regulatory & Compliance Solutions Manager',
#     'A/R Coordinator',
#     'Forensics and Litigation Services Director',
#     'ERS Director',
#     'Nonprofit Advisory Services Associate',
#     'Attorney',
#     'Legal Counsel',
#     'Corporate Paralegal',
#     'Plumbing Engineer',
#     'Mechanical Engineer',
#     'Electrical Engineer',
#     'Rail Systems Engineer',
#     'Tunneling Engineer',
#     'Regulatory Controller',
#     'International Controller',
#     'Fire Safety',
#     'Medical Assistant',
#     'Various',
#      'Legal Staff Paralegal',
#     'Paralegal',
# ]
# intern = [
#     'Tax Admin Intern',
#     'Accounting Internship',
#     'Internship',
#     'Staff Accountant Intern',
#     'Campus Recruiting Intern',
#     'IT Help Desk Internship',
#     'Tax Intern',
#     'Accounting Internship',
#     'Internship',
#     'Accounting Intern',
#     'Summer Design Intern',
#     'Analytics & Reporting Intern',
#     'Research Intern',
#     'Event Marketing Intern',
#     'Finance Intern',
#     'Cyber Risk Intern',
#     'Internship',
#     'Intern',
# ]
# real_estate = [
#     'Real Estate Assurance Associate',
#     'Real Estate Lead',
#     'Real Estate Advisory',
#     'Building',
#     'Property Manager',
#     'Real Estate Analyst',
#     'Real Estate Assistant',
#     'Leasing',
#     'Lease Analyst',
#     'Retail Leasing Manager',
#     'Resident Manager',
# ]
# development = [
#     'STS Research & Development Associate',
#     'Business Development Director',
#     'Business Restructuring Services Associate',
#     'Business Restructuring Services Manager',
#     'Business Info Systems Manager',
#     'Assurance & Advisory Partner',
#     'Business Valuation Senior or Manager',
#     'CFO Advisory Services Manager',
#     'ERP Consultant',
#     'ERP Project Manager',
#     'Learning & Development Administrator',
#     'Business Intelligence Research Associate',
#     'Business Operations Financial Services Director',
#     'Finance Transformation Manager',
#     'Readiness Planning Project Manager',
#     'Strategy Manager',
#     'Executive Assistant',
#     'CFO Advisory Manager',
#     'Service Strategy & Design Manager',
#     'Innovation Expert',
#     'Organizational Effectiveness',
#     'Business Development',
#     'Enterprise Improvement',
#     'Business Development Specialist',
#     'Business Development Manager',
#     'Business Development Associate',
#     'Corporate Ops & Transformation Product Manager',
#     'Business Intelligence Product Manager',
#     'Survey Manager',
#     'Strategic Solutions Associate',
#     'Account Manager',
#     'Account Director',
#     'PR Account Supervisor',
#     'Public Relations Professional',
#     'Global Communications Lead',
#     'Public Relations Account Supervisor',
#     'Health Public Relations Director/Executive',
#     'Media Relations Specialist',
#     'Research & Business Engagement',
#     'Restructuring Advisory',
#     'Facilities Coordinator',
#     'Team Lead of Financial Products Consolidations',
#     'Geller Advisors Family Office CFO',
#     'Operations Associate',
#     'Research Associate',
#     'Financial Advisor',
#     'Business Dev',
#     'Board Relations',
#     'Regional Advising',
#     'Portfolio Operations Associate',
#     'Renewals Representative',
#     'VP of Strategic Partnerships',
#     'Finance Associate',
#     'Area Vice President',
#     'Producer',
#     'Account Manager',
#     'Coordinator',
#     'Business Development Coordinator',
#     'Account Coodinator',
#     'Program Associate',
#     'Provider Relations Specialist',
#     'Retention Representative',
#     'Strategic Planner',
#     'Corporate Strategy & Pricing Associate',
#     'Corporate Development Associate',
#     'Head of Sourcing & Procurement',
#     'Network & Partnerships',
#     'Member Care & Service',
#     'Business',
#     'Managed Care Coordinator',
#     'Practice Assistant',
#     'Management',
#     'Regional',
#     'Investor Relations',
#     'Project',
#     'Loan Compliance',
#     'Valuation Services Director',
#     'Assitant Manager',
# ]
# analyst = [
#     'Valuation & Business Analytics Director',
#     'Technical Analyst',
#     'Healthcare Claims Research Analyst',
#     'Retiree Health Analyst',
#     'Business Analyst',
#     'Marketing Science Analyst',
#     'Senior Financial Analyst',
#     'AML Analyst',
#     'Information Security Analyst',
#     'Account Receivable Analyst',
#     'Financial Analyst', 
#     'Investment Reporting Senior Analyst',
#     'Real Estate Fund Analyst',
#     'Operations Analyst',
#     'Risk & Portfolio Analyst',
#     'Credit Analyst',
#     'Loan Operations Analyst',
#     'Risk Analyst',
#     'Investor Relation Analyst',
#     'Modeling Analyst',
#     'Captial Market Analyst',
#     'Senior Analyst- Model Validation',
#     'Analyst',
#     'Risk Analytics Associate',
#     'Corporate Systems Analyst',
#     'Writer & Analyst',
#     'Business Intelligence Analyst',
#     'Financial Analyst',
#     'Risk',
#     'HRIS Analyst',
#     'Project Business Analyst',
#     'Summer Quantitative Finance Analyst',
#     'Quantitative Associate',
#     'Analytics Associate',
#     'Analysis',
#     'Risk'
# ]
# insurance = [
#     'Forensic Accounting & Investigations Associate',
#     'Forensic Accounting & Investigations Manager',
#     'Insurance Claims Forensic Accounting Manager',
#     'Health Care Consultant',
#     'Insurance Advisory',
#     'Insurance Analyst',
#     'Underwriter',
#     'Medical Claims Examiner',
#     'Material Damage Specialist',
#     'Treaty Underwriter',
#     'Compliance',
#     'Clinical',
#     'Insurance Operations',
# ]
# data = [
#     'Healthcare Data Scientist',
#     'Healthcare Data Analyst',
#     'Data Integration Scientist',
#     'Data Engineer',
#     'Data Visualization',
#     'Data Architect',
#     'Data Analytics',
#     'BIM Designer',
#     'Pricing Analytics Lead',
#     'Information Architect',
#     'Public Relations Data & Analytics Director',
#     'Data Scientist',
#     'Data Encoder',
#     'Data Asscoiate',
#     'Data Analyst',
#     'Data Architect Consultant',
#     'Data Developer',
#     'Data',
#     'Data Solution Architect',
#     'HealthCare Data Analyst',
#     'Data Coordinator',
# ]
# tech = [
#     'Advisory IT Strategy & Process Manager',
#     'Strategic Cloud Services Consulting Manager',
#     'Oracle Reporting & Analytics Consultant',
#     'Oracle HCM Learning Solutions Consultant',
#     'Oracle HCM Cloud Solutions Architect',
#     'Oracle HCM Cloud Compensation Solutions Architect',
#     'Oracle EPM Cloud Solution Architect',
#     'Functional Lead & IT Support Coordinator',
#     'Cyber Security Specialist & Penetration Tester',
#     'IT Assurance Associate',
#     'Web Services/SharePoint Specialist',
#     'Dynamics 365 CRM Developer',
#     'Cyber Security & Privacy Manager',
#     'Data Privacy Associate',
#     'IT Helpdesk Analyst',
#     'Controls Advisory IT Associate',
#     'Cyber Risk Data Protection & Privacy Manager',
#     'Cyber Risk Manager',
#     'Data Warehouse & Business Intelligence Advisor',
#     'Digital Transformation & Management Manager',
#     'Microsoft Dynamics 365 ERP Functional Lead',
#     'IT Advisory Consultant',
#     'Technology Solutions',
#     'IT Support and Implementation Coordinator',
#     'AWS Technology',
#     'Cloud/DevOps Engineer',
#     'Cloud Architect',
#     'Technology Strategy & Operations',
#     'Big Data Software Engineer',
#     'Full Stack Developer',
#     'Front-End Developer',
#     'Freelance Software Engineer',
#     'Java Developer',
#     'Front-End Developer',
#     'Digital Services Leader',
#     'Intelligent Transporation Systems Leader',
#     'Lead of Technology',
#     'Test Engineer',
#     'Full-Stack Software Engineer',
#     'Fullstack Developer',
#     'IT Helpdesk Dispatcher',
#     'C# Developer',
#     'Test Lead',
#     'Front End Engineer',
#     'IT Infrastucture & Security Engineer',
#     'Software Engineer',
#     'Full Stack',
#     'Lead Network Engineer',
#     'Application Developer',
#     'Software Developer',
#     'Software Engineer',
#     'Network Security Engineer',
#     'Insurance Securities Structurer',
#     'Multimedia Operations Engineer',
#     'Engineering Manager',
#     'IT',
#     'Product Dev.',
#     'Chief Engineer',
#     'Senior Software Engineer',
#     'Junior Network Engineer',
#     'Network Engineer',
# ]
# design = [
#     'Instructional Designer',
#     'Motion Graphics Designer',
#     'Design Director',
#     'Senior UI Designer',
#     'Senior Designer',
#     'Motion Graphics Animator',
#     'Motion Graphics Specialist',
#     'Motion Graphics Manager',
#     'Motion Graphics Assistant',
#     'Senior Product Designer',
#     'UX/UI Designer',
#     'Product Designer',
#     'Product',
#     'Design',
#     'Graphic Designer',
#     'Marketing Coordinator/Graphic Designer',
# ]
# accountant = [
#     'Bookkeeper',
#     'Senior Staff Accountant',
#     'Staff Accountant',
#     'CFO Advisory Services Consultant',
#     'Bookkeeper',
#     'Senior Staff Accountant',
#     'Accountant',
#     'Accounting Advisory Services Manager',
#     'Billing Analyst',
#     'Forensic Accountant',
#     'Account Receivable Analyst', 
#     'Private Equity/Hedge Fund Accountant',
#     'Asscoaite Accountant',
#     'Accounting and Finance Manager',
#     'Reconcilation Splt 2',
#     'Accounts Recivable Collections Specialist',
#     'Accounting Clerk',
#     'Accounting Specialist',
#     'Direct Bill Staff Accountant',
#     'Account Recievable Clerk',
#     'Accounting Manager',
#     'Accounting',
# ]
# admin = [
#     'Administrative Assistant',
#     'Receptionist',
#     'Administrative Assistant',
#     'Receptionist',
#     'Office Services Clerk',
#     'Assistant Vice President',
#     'Corporate Administrator',
#     'Accounts Payable Clerk',
#     'Finance & Administration Specialist',
#     'Vice President of Product',
#     'Administrative Professional',
#     'Office Services',
#     'Vice President',
#     'Office Manager',
#     'Office Services Assistant',
#     'Administration',
#     'Executive Assitant',
#     'Recruitement Administrator',
#     'Office Services',
#     'Data Entry Clerk',
#     'Account Administrator',
#     'Finance & Actuarial Information Assitant',
#     'Sales Administration Specialist',
#     'Front Desk Administrator',
# ]
# consulting = [
#     'Staff Consultant',
#     'Business Info Systems Consultant',
#     'Nurse Clinical Consultant',
#     'Nonprofit Advisory Services Consultant',
#     'Financial Services Internal Audit Consultant',
#     'Financial Crime Model Validation Consultant',
#     'Strategic Sourcing Consultant',
#     'Financial Crime Consultant',
#     'Management Consultant',
#     'Consultant',
#     'EECI Auctions',
#     'Admin. Health Outcomes Research Consultant',
#     'Financial Consultant',
#     'Wealth Consultant',
#     'Associate Actuarial Consultant',
#     'Experience Design Consultant',
#     'Operations Consultant',
#     'Managing Logistics & Operations Consultant',
#     'Blast Consultant',
#     'Independent Consultant',
#     'Strategic Shareholder Advisory',
#     'Transportation & Infrastructure Advisory',
#     'Shareholder Activism Advisory',
#     'Debt Advisory',
#     'Healthcare Advisory',
#     'Activism Defense Advisory',
#     'Insurance Advisory',
#     'Restructuring Advisory',
#     'Strategy Management Consultant',
#     'Solutions Consultant',
#     'Inventory Field Consultant',
# ]
# risk = [
#     'Risk & Compliance Consultant',
#     'Business Risk Services',
#     'Loan Review Consultant',
#     'Loan Review Manager',
#     'Risk & Compliance Data Analytics Management Consultant',
#     'Information Security Manager',
#     'Investigator',
#     'Investigations & Disputes Director',
#     'Investment and Market Risk Asscoaite',
#     'Senior Quantitative Risk',
# ]
# investment = [
#     'Alternative Investment Practice Manager',
#     'Alternative Investment Practice Supervisor',
#     'Investment Principal',
#     'Credit Trader',
#     'Investment',
#     'Investment Operations',
#     'Private Banking Credit Officer',
#     'Funding Specialist',
#     'Wealth Associate',
#     'Finance Investment Associate',
# ]
# sales = [
#     'Healthcare Revenue Cycle Associate',
#     'Oracle Regional Sales Director',
#     'Inside Sales Representative',
#     'Quality Control Manager',
#     'Quality Control Director',
#     'Sales Representative',
#     'Assistant Account Executive',
#     'Account Executive',
#     'Professional Liability Producer',
#     'Associate Account Executive',
#     'Client Service Leader',
#     'Client Partner',
#     'Consumer Goods & Services Manager',
#     'Client Solutions Associate',
#     'Consumer Public Relations',
#     'Portfolio Management',
#     'Sales Executive',
#     'Salesforce Developer',
#     'Head of Public Equities Events',
#     'Equity Research',
#     'Equity Capital Markets',
#     'Client Service Specialist',
#     'Account Specialist',
#     'Account Management',
#     'Client Services',
#     'Client Service Associate',
#     'Client Services Representative',
#     'Product Sales Person',
#     'Sales Manager',
#     'Sales Support Specialist',
#     'Inside Sales Associate',
#     'Employee Benefits Sales',
#     'Sales Assitant',
#     'Salesperson/Associate Broker',
#     'Junior Broker',
#     'Broker Account Manager',
# ]
# project = [
#     'Project Manager',
#     'Research Leader',
#     'Project Commercial Manager',
# ]
# transaction = [
#     'Transactional Advisory Services Associate',
#     'Transaction Advisory Services Associate',
#     'Transaction Advisory Services Manager',
#     'Transaction Advisory Services Director',
#     'Transactional Advisory Services Manager',
#     'Transaction Advisory Manager',
#     'Transaction Advisory Supervisor',
#     'Transaction Services Manager',
# ]
# actuary = [
#     'Medicare Advantage Actuary',
#     'Medicare Advantage Consulting Actuary',
#     'Retirement Plan Actuary',
#     'Actuarial Analyst',
#     'Actuary',
# ]

In [0]:
# # print every position that has not been assigned to any group
# unclean = get_position(df,"Business", False)
# for position in first_part + second_part + third_part:
#   if position not in tech + data + intern + insurance + analyst + development + \
#   miss + real_estate + hr + recruit + auditor + tax + marketing + design + accountant + \
#   admin + consulting + risk + transaction + project + sales + investment + actuary:
#     print(position)

## Making Skillsets consistent

In [0]:
# temp = pd.Series(get_skill(df, "Any", True)).sort_values().tolist()
# half = len(temp)//2
# first_part = temp[0:half]
# second_part = temp[half:]

In [0]:
# # Consistent
# organize = [ # Organized
#     'organized',
#     'Organization',
#     'Organized',
#     'Organizational',
# ]
# vue = [ # Vue.js
#     'Vue.js',
#     'VueJS',
# ]
# typescript = [ # TypeScript
#     'TypeScript',
#     'Typescript',
# ]
# tf = [ # TensorFlow
#     'TensorFlow',
#     'Tensorflow',
# ]
# teamwork = [ # Teamwork
#     'Team Player',
#     'Teamwork',
#     'teamwork',
# ]
# stat = [ # Statistic
#     'Statistical Analysis',
#     'Statistical Modeling',
# ]
# server = [ # Server Rendering/Language
#     'Server',
#     'Server Rendering',
#     'Server Side Language,',
# ]
# self_motivated = [ # Self Motivated
#     'Self Motivating',
#     'Self-Motivated',
# ]
# research = [ # Research
#     'Researching',
#     'Research',
# ]
# PowerPoint = [ # PowerPoint
#     'Power Point',
#     'PowerPoint'
# ]
# Problem_Solving = [ # Problem Solving
#     'Problem Solving',
#     'Problem-solving',
#     'problem solving',
# ]
# Public_Speaking = [ # Public Speaking
#     'Public Speaking',
#     'Public Sppeaking',
#     'Verbal',
    
# ]
# QA = [ # QA
#     'Testing',
#     'QA',
#     'QA Automation'
# ]
# REST = [ # REST
#     'REST',
#     'REST APIs',
#     'RESTful',
#     'RESTful API'
# ]
# React = [ #React
#     'React',
#     'React Native',
#     'React.js',
#     'ReactJS',
# ]
# Postgres = [ # Postgres
#     'PostgreSQL',
#     'Postgres'
# ]
# Nodejs = [ # Node.js
#     'Node',
#     'Node.js',
#     'NodeJS',
# ]
# ms = [ # Microsoft Office
#     'MS Office',
#     'MS office',
#     'Microsoft Office',
#     'Microsoft Office Suite',
#     "Microsoft Office Suite ",
# ]
# linux = [ # Linux/Unix
#     'LInux',
#     'Linux',
#     'Unix',
#     'Unix/Linux',
# ]
# jira = [ # JIRA
#     'Jira',
# ]
# js = [ # JavaScript
#     'JS',
#     'JavaScript',
#     'JavaScript Framework',
#     'Javascript',
# ]
# interpersonal = [ # Interpersonal
#     'Internpersonal',
#     'Interpersonal',
#     'Interpersonal Skills',
#     'Interspersonal',
#     'interpersonal',
#     'interpersonal',
# ]
# it = [ # IT System
#     'LAN',
#     'IT',
#     'IT Experience',
#     'IT System',
# ]
# ios = [ # IOS
#     'IOS',
#     'IOS development experience',
# ]
# html = [ # HTML
#     'HTML',
#     'HTML5',
#     'HTMl',
# ]
# gotomeeting = [ # Go to meeting
#     'GoToMeeting',
# ]
# go = [ # GoLang
#     'Go',
#     'GoLang',
#     'GO',
# ]
# git = [ # Git/Github
#     'Git',
#     'GitHub',
# ]
# google = [ # Google Suite
#     'GSuite',
#     'G Suite',
#     'Google',
#     'Google Analytics',
#     'Google Suite',
# ]
# flow = [ # Flowtype
#     'Flow',
#     'FlowType',
# ]
# Customer_Service = [ # Customer Service
#     'Customer Service',
#     'Customer Services',
# ]
# SaaS = [ # SaaS
#     ' SaaS',
# ]
# Net = [ # .NET
#     '.NET',
#     '.NET ',
# ]
# communication = [ # Communication
#     'Commuication',
#     'Communication',
#     'Writing Communication',
#     'Written',
#     'Written & Oral Communication',
#     'Writing',
# ]
# done = [
#     'AJAX',
#     'AR Software',
#     'ATS',
#     'AWS',
#     'Abila',
#     'Access',
#     'Adobe',
#     'Agile',
#     'After Effects',
#     'Algorithms',
#     'Ambitious',
#     'Android',
#     'Aurora',
#     'Azure',
#     'B2B',
#     'BASH',
#     'BASIS',
#     'BDD',
#     'BIM',
#     'Babel',
#     'Beam',
#     'Beanstalk',
#     'BenPlus',
#     'Big Data',
#     'BigQuery',
#     'Bilingual',
#     'Billing',
#     'Bootstrap',
#     'Building Code',
#     'C',
#     'C#',
#     'C++',
#     'CDNs',
#     'COBOL',
#     'CPT',
#     'CRM',
#     'CSS',
#     'CT scan',
#     'Catchpoint',
#     'Chartio',
#     'Chrome developer tools',
#     'Cisco Fire Power',
#     'Cisco ISE',
#     'Cl',
#     'Client-oriented',
#     'Clinical Research',
#     'CloudForm',
#     'CloudWatch',
#     'Commodities',
#     'Compassion',
#     'Complexity Analysis',
#     'Confluence',
#     'Construction',
#     'Corporate Finance',
#     'Credit',
#     'Critical Thinking',
#     'Curiosity',
#     'D3',
#     'DNS Protocal',
#     'DRG',
#     'Data Analysis',
#     'Data Entry',
#     'Data Management',
#     'Data Mining',
#     'Data Structure',
#     'Data Visualization',
#     'Database',
#     'Dataflow',
#     'Debit/Credit',
#     'Debugging',
#     'Demand Generation',
#     'Dependable',
#     'Desire',
#     'Detail-Oriented',
#     'Determined',
#     'Digital Marketing',
#     'Distributed System',
#     'Django',
#     'Docker',
#     'Driven',
#     'Dynatrace',
#     'E-mail',
#     'EHR',
#     'EMR',
#     'ESLint',
#     'ETL',
#     'Editing',
#     'Egnyte',
#     'Elasticsearch',
#     'Electron',
#     'Event Planning',
#     'Excel',
#     'Extensively walking',
#     'FX',
#     'Fabric',
#     'Financial Markets',
#     'FireEye HX',
#     'Flask',
#     'Flux',
#     'Follow through',
#     'Framework',
#     'Front End',
#     'Full Stack',
#     'GCP',
#     'GGplot',
#     'Gatsby',
#     'Grafana',
#     'GraphQL',
#     'Graphic Design',
#     'Graphite',
#     'Gulp',
#     'Hadoop',
#     'Healthcare',
#     'Heroku',
#     'Hugo',
#     'Illustrator',
#     'InDesign',
#     'InVision',
#     'JAMF',
#     'JIRA',
#     'JSL',
#     'JSON',
#     'JVM',
#     'Java',
#     'Jenkins',
#     'Jupyter Notebooks',
#     'Kafka',
#     'Kanban',
#     'Kotlin',
#     'Kubernetes',
#     'Lambda',
#     'Leadership',
#     'Lean',
#     'Lerna',
#     'Linkedin',
#     'MDM',
#     'MEAN',
#     'MSE',
#     'MVC',
#     'None',
#     'Mac',
#     'Machine Learning',
#     'Magento',
#     'Management',
#     'Marketing',
#     'Massage',
#     'MatLab',
#     'Medicine',
#     'Meeting Prep',
#     'Microservices',
#     'Microsoft Power BI',
#     'MongoDB',
#     'Motivated',
#     'Multitask',
#     'NPM',
#     'Negotiation',
#     'Neo4j',
#     'NoSQL',
#     'NumPy',
#     'OOP',
#     'Network',
#     'Networking',
#     'Objective C',
#     'Office 365',
#     'Open Source',
#     'Optimism',
#     'Oracle',
#     'Oracle Analytics',
#     'Oracle ERP',
#     'Outlook',
#     'Outreach',
#     'PHP',
#     'Packer',
#     'Pandas',
#     'Parquet',
#     'Passion',
#     'Pearl',
#     'Perl',
#     'Photoshop',
#     'Physical Therapy',
#     'Planning',
#     'Presentation',
#     'Prettier',
#     'Process Mining',
#     'Product Management',
#     'Project Management',
#     'Publisher',
#     'PySpark',
#     'PyTorch',
#     'Python',
#     'Qlikview',
#     'Quantitative',
#     'Quickbooks',
#     'R',
#     'Radiology',
#     'Rails',
#     'Reading',
#     'Redis',
#     'Redshift',
#     'Redux',
#     'Rehab',
#     'MySQL',
#     'Reliable',
#     'Repos',
#     'Research',
#     'Resilient',
#     'Responsible',
#     'Result-oriented',
#     'Risk Management',
#     'Ruby',
#     'Runtime Optimization',
#     'S3',
#     'SDKs',
#     'SEO',
#     'SGS',
#     'SOX Audits',
#     'SQL',
#     'SaaS',
#     'Sales',
#     'Salesforce',
#     'SaltStack',
#     'Scala',
#     'Scrum',
#     'SharePoint',
#     'Sketch',
#     'Snowflake',
#     'Social Media',
#     'Social Studio',
#     'Socially Skilled',
#     'Software Design',
#     'Software Licensing',
#     'Spark',
#     'Sports',
#     'Spring',
#     'Strategic thinking',
#     'Supervise',
#     'Swift',
#     'Swim',
#     'Symantec EP',
#     'TDD',
#     'Tableau',
#     'Tech savvy',
#     'Terraform',
#     'Time Management',
#     'Totango',
#     'Trade Skills',
#     'Typing',
#     'UI',
#     'UX',
#     'Upsell products',
#     'Visiting',
#     'Visual Studio',
#     'Word',
#     'wound treatment',
#     'web security',
#     'responsiveness',
#     'prior sales experience',
#     'openCL',
#     'office work',
#     'nutrition assessments',
#     'knowledge of markets and investments',
#     'knowledge of purchase process',
#     'knowledge of real estate market',
#     'jQuery',
#     'image optimization',
#     'Zendesk',
#     'Yarn',
#     'XML',
#     'X-ray',
#     'Windows',
#     'WAN',
#     'WLAN and VPN',
#     'Web Analytics',
#     'Web Programming',
#     'WebEx',
#     'WebGL',
#     'Webflow',
#     'Webpack',
#     'nan',
#     'Next.js',
#     'SAS',
#     "Equities",
#     "SAP",
#     'phone calls',
#     'Financial Accounting',
# ]
# analytical = [ # Analytical
#     'Analysis',
#     'Analyst',
#     'Analytical',
#     'Analytics',
# ]
# angular = [ # Angular.js
#     'Angular',
#     'AngularJS',
# ]
# automation_testing = [ # Automation Testing
#     'Automated Testing',
#     'Automation Tools',
# ]
# autocad = [ # autoCAD
#     'CAD',
# ]
# to_remove = [
#     'BA Degree',
#     'BA/BS',
#     'BA/BSc',
#     'BASIS',
#     'BS/BA degree',
#     'Dell and/or HP certified',
#     'English',
#     'Experiences',
#     'Fixed Income',
#     'HP Quality Center',
#     'IC-9',
#     'Infrastructure',
#     'Inspecting',
#     'Invoice',
#     'Japanese',
#     'Knowledge in tech',
#     'Layout',
#     'Long/Short',
#     'New relic',
#     'Play',
#     'Principle',
#     'Standardizing',
#     'Valuation',
#     'fast.ai',
#     'experience',
#     'Zuora',
#     'Zeplin',
# ]

## Grouping Skillset

In [0]:
# # Group

# oracle = [
#     'Oracle',
#     'Oracle Analytics',
#     'Oracle ERP'
# ]
# computer_tech = [
#     'AR Software',
#     'ATS',
#     'Abila',
#     'Android',
#     'Aurora',
#     'Beam',
#     'autoCAD',
#     'CRM',
#     'Cisco Fire Power',
#     'Cisco ISE',
#     'Confluence',
#     'DNS Protocal',
#     'DRG',
#     'Dynatrace',
#     'EHR',
#     'EMR',
#     'Egnyte',
#     'FireEye HX',
#     'Google Suite',
#     'Hadoop',
#     'IOS',
#     'IT System',
#     'InVision',
#     'JAMF',
#     'SAP',
#     'SAS',
#     'Linux/Unix',
#     'MDM',
#     'MSE',
#     'Qlikview',
#     'Quickbooks',
#     'Salesforce',
#     'Sketch',
#     'Social Studio',
#     'Symantec EP',
#     'Tableau',
#     'Totango',
#     'Zendesk',
#     "Magento",
#     "Mac",
# ]
# computer_language = [
#     '.NET',
#     'Angular.js',
#     'BASH',
#     'C',
#     'C#',
#     'C++',
#     'COBOL',
#     'CSS',
#     'Electron',
#     'Flask',
#     'GoLang',
#     'GraphQL',
#     'Gulp',
#     'HTML',
#     'Hugo',
#     'JavaScript',
#     'JSL',
#     'JSON',
#     'Java',
#     'Kotlin',
#     'MatLab',
#     'MySQL',
#     'NoSQL',
#     'Next.js', 
#     'Node.js',
#     'NumPy',
#     'Objective C',
#     'PHP',
#     'Pearl',
#     'Perl',
#     'Python',
#     'R',
#     'Pandas',
#     'Rails',
#     'React',
#     'Ruby',
#     'SQL',
#     'Scala',
#     'Spring',
#     'Swift',
#     'TensorFlow',
#     'TypeScript',
#     'Vue.js',
#     'openCL',
#     'jQuery',
#     'XML',
#     "PyTorch",
#     "PySpark",
# ]
# data = [
#     'Chartio',
#     'D3',
#     'Data Analysis',
#     'Data Entry',
#     'Data Management',
#     'Data Mining',
#     'Data Structure',
#     'Data Visualization',
#     'Database',
#     'Dataflow',
#     'Django',
#     'ETL',
#     'GGplot',
#     'Grafana',
#     'Spark',
# ]
# dev_tech = [
#     'AJAX',
#     'Agile',
#     'Automation Testing',
#     'BDD',
#     'BIM',
#     'Babel',
#     'Bootstrap',
#     'CDNs',
#     'Chrome developer tools',
#     'Complexity Analysis',
#     'Debugging',
#     'Distributed System',
#     'Docker',
#     'ESLint',
#     'Fabric',
#     'Flowtype',
#     'Flux',
#     'Front End',
#     'Framework',
#     'Full Stack',
#     'Gatsby',
#     'Git/Github',
#     'Graphite',
#     'JIRA',
#     'JVM',
#     'Jenkins',
#     'Lean',
#     'Lerna',
#     'MEAN',
#     'NPM',
#     'OOP',
#     'Network',
#     'Open Source',
#     'Packer',
#     'Parquet',
#     'Prettier',
#     'Process Mining',
#     'QA',
#     'REST',
#     'Runtime Optimization',
#     'Jupyter Notebooks',
#     'Kafka',
#     'Kanban',
#     'Kubernetes',
#     'MVC',
#     'MongoDB',
#     'Neo4j',
#     'Oracle',
#     'Postgres',
#     'Redis',
#     'Redux',
#     'SDKs',
#     'Server Rendering/Language',
#     'Software Design',
#     'Software Licensing',
#     'TDD',
#     'Terraform',
#     'UI',
#     'UX',
#     'Visual Studio',
#     'web security',
#     'image optimization',
#     'Yarn',
#     'Windows',
#     'WAN',
#     'WLAN and VPN',
#     'Web Analytics',
#     'Web Programming',
#     'WebEx',
#     'WebGL',
#     'Webflow',
#     'Webpack',
#     "Scrum",
#     "Machine Learning",
#     "Heroku",
# ]
# cloud_computing = [
#     'AWS',
#     'Azure',
#     'Beanstalk',
#     'CloudForm',
#     'CloudWatch',
#     'Elasticsearch',
#     'GCP',
#     'Lambda',
#     'Redshift',
#     'S3',
#     'Snowflake',
# ]
# big_data = [
#     'BigQuery',
#     'Big Data',
# ]
# MS = [
#     'Access',
#     'Excel',
#     'Microsoft Office',
#     'Microsoft Power BI',
#     'Office 365',
#     'Outlook',
#     'PowerPoint',
#     'Publisher',
#     'SharePoint',
#     'Word',
# ]
# adobe = [
#     'Adobe',
#     'After Effects',
#     'Illustrator',
#     'InDesign',
#     'Photoshop',
# ]
# soft_skill = [
#     'Algorithms',
#     'Ambitious',
#     'Analytical',
#     'Bilingual',
#     'Client-oriented',
#     'Communication',
#     'Compassion',
#     'Critical Thinking',
#     'Curiosity',
#     'Customer Service',
#     'Dependable',
#     'Desire',
#     'Detail-Oriented',
#     'Determined',
#     'Driven',
#     'Event Planning',
#     'Follow through',
#     'Go to meeting',
#     'Interpersonal',
#     'Leadership',
#     'Management',
#     'Marketing',
#     'Meeting Prep',
#     'Microservices',
#     'Motivated',
#     'Multitask',
#     'Negotiation',
#     'Networking',
#     'Optimism',
#     'Organized',
#     'Outreach',
#     'Passion',
#     'Planning',
#     'Presentation',
#     'Problem Solving',
#     'Product Management',
#     'Project Management',
#     'Public Speaking',
#     'Quantitative',
#     'Reading',
#     'Reliable',
#     'Resilient',
#     'Responsible',
#     'Result-oriented',
#     'responsiveness',
#     'Sales',
#     'Self Motivated',
#     'Socially Skilled',
#     'Strategic thinking',
#     'Supervise',
#     'Teamwork',
#     'Time Management',
# ]
# business = [
#     'SaaS',
#     'B2B',
#     'Billing',
#     'Corporate Finance',
#     'Credit',
#     'Debit/Credit',
#     'Demand Generation',
#     'Digital Marketing',
#     'Equities',
#     'Commodities',
#     'FX',
#     'Financial Markets',
#     'Repos',
#     'Risk Management',
#     'SEO',
#     'SOX Audits',
#     'SaltStack',
#     'Statistic',
#     'Trade Skills',
#     'Upsell products',
#     'prior sales experience',
#     'office work',
#     'knowledge of markets and investments',
#     'knowledge of purchase process',
#     'knowledge of real estate market',
#     'Financial Accounting',
#     "BASIS",
# ]
# miss = [
#     'Linkedin',
#     'BenPlus',
#     'Building Code',
#     'Catchpoint',
#     'Cl',
#     'Construction',
#     'Extensively walking',
#     'Graphic Design',
#     'Healthcare',
#     'None',
#     'SGS',
#     'Social Media',
#     'Sports',
#     'Swim',
#     'Tech savvy',
#     'Typing',
#     'Visiting',
# ]
# research = [
#     'Research',
# ]
# health_pro = [
#     'CPT',
#     'CT scan',
#     'Clinical Research',
#     'Massage',
#     'Physical Therapy',
#     'Medicine',
#     'Radiology',
#     'Rehab',
#     'wound treatment',
#     'nutrition assessments',
#     'X-ray'
# ]
# office = [
#     'E-mail',
#     'Editing',
#     'phone calls',
# ]

## Replace for consistency

In [0]:
# # replace every skill with the canonical name of its consistency group
# # from 0 -> 404
# for i in range(405):
#   skills = df.iloc[i][8]
#   split_skills = [x for x in str(skills).split(", ")]
#   temp = ""
#   for split_skill in split_skills:
#     if split_skill in vue:
#       temp += "Vue.js, "
#     elif split_skill in typescript:
#       temp += "TypeScript, "
#     elif split_skill in tf:
#       temp += "TensorFlow, "
#     elif split_skill in teamwork:
#       temp += "Teamwork, "
#     elif split_skill in stat:
#        temp += "Statistic, "
#     elif split_skill in server:
#       temp += "Server Rendering/Language, "
#     elif split_skill in self_motivated:
#       temp += "Self Motivated, "
#     elif split_skill in research:
#       temp += "Research, "
#     elif split_skill in PowerPoint:
#       temp += "PowerPoint, "
#     elif split_skill in Problem_Solving:
#       temp += "Problem Solving, "
#     elif split_skill in Public_Speaking:
#       temp += "Public Speaking, "
#     elif split_skill in QA:
#       temp += "QA, "
#     elif split_skill in REST:
#       temp += "REST, "
#     elif split_skill in React:
#       temp += "React, "
#     elif split_skill in Postgres:
#       temp += "Postgres, "
#     elif split_skill in Nodejs:
#       temp += "Node.js, "
#     elif split_skill in ms:
#       temp += "Microsoft Office, "
#     elif split_skill in linux:
#       temp += "Linux/Unix, "
#     elif split_skill in jira:
#       temp += "JIRA, "
#     elif split_skill in js:
#       temp += "JavaScript, "
#     elif split_skill in interpersonal:
#       temp += "Interpersonal, "
#     elif split_skill in it:
#       temp += "IT System, "
#     elif split_skill in ios:
#       temp += "IOS, "
#     elif split_skill in html:
#       temp += "HTML, "
#     elif split_skill in gotomeeting:
#       temp += "Go to meeting, "
#     elif split_skill in go:
#       temp += "GoLang, "
#     elif split_skill in git:
#       temp += "Git/Github, "
#     elif split_skill in google:
#       temp += "Google Suite, "
#     elif split_skill in flow:
#       temp += "Flowtype, "
#     elif split_skill in Customer_Service:
#       temp += "Customer Service, "
#     elif split_skill in SaaS:
#       temp += "SaaS, "
#     elif split_skill in Net:
#       temp += ".NET, "
#     elif split_skill in communication:
#       temp += "Communication, "
#     elif split_skill in done:
#       temp += split_skill
#       temp += ", "
#     elif split_skill in analytical:
#       temp += "Analytical, "
#     elif split_skill in angular:
#       temp += "Angular.js, "
#     elif split_skill in automation_testing:
#       temp += "Automation Testing, "
#     elif split_skill in autocad:
#       temp += "autoCAD, "
#     elif split_skill in organize:
#       temp += "Organized, "
#     elif split_skill in to_remove:
#       continue
#     elif split_skill in typo:
#       continue
#     else:
#       print("\"" + split_skill + "\",")
#   df.at[i, "Skills"] = temp[:-2]

    

In [0]:
# df.to_csv("consistent.csv", index=False)

# Random

## Remove

In [0]:
# # collect the positions that do not belong to any group (unwanted items)
# unwant = []
# unclean = get_position(df,"Business", False)
# for position in first_part + second_part:
#   if position not in tech + data + intern + insurance + analyst + development + \
#   miss + real_estate + hr + recruit + auditor + tax + marketing + design + accountant + \
#   admin + consulting + risk + transaction + project + sales + investment + actuary:
#     unwant.append(position)

In [0]:
# for i in range(152):
#   job_titles = df.iloc[i,6]
#   split_job_titles = [x for x in job_titles.split(", ")]
#   temp = []
#   for split_job_title in split_job_titles:
#     if split_job_title in unwant: continue
#     else: temp.append(split_job_title)
#     #if split_job_title == "Risk & Portfolio Analyst": print(i)
#   comma = ", "
#   df.iloc[i, 6] = comma.join(temp)

In [0]:
# df.to_csv("madified.csv")

In [0]:
# df.sample(10)

In [0]:
# # normalize every missing value ('None', 'NaN', NaN) to the string 'None'
# no_na = df.replace(['None','NaN'], 'None')
# no_na.fillna('None',inplace=True)

In [0]:
# df = no_na

## Grouping and Expanding skillsets

In [0]:
# df.to_csv("master_original.csv", index=False)

In [0]:
# Column-wise accumulator for the skill-expanded table: each output column
# starts as its own empty list and is filled one value per (company, skill).
new_data = {
    column: []
    for column in (
        'Company',
        'CUNY Alumni Count',
        'NYC Professional Count',
        'Year Founded',
        'Sector',
        'Industry',
        'Job',
        'Major',
        'Skill Label',
        'Skill',
        'Type',
        'Website',
    )
}

In [0]:
# Expand the master table to one record per (company, skill): every entry of
# the comma-separated "Skills" cell becomes its own row, tagged with the group
# label returned by group_skill(). Records are appended to `new_data`.
for i in range(len(df)):  # cover every row rather than a hard-coded 405
  row = df.iloc[i]  # fetch the row once instead of once per column
  company = row['Company']
  # Values copied unchanged onto every expanded row of this company.
  shared = {
      'Company': company,
      'CUNY Alumni Count': row['# of CUNY Alumni'],
      'NYC Professional Count': row['# of NYC Professionals'],
      'Year Founded': row['Year Founded'],
      'Sector': row['Sector'],
      'Industry': row['Industry'],
      'Job': row['Job(s) or Opportunities'],
      'Major': row['Major(s)'],
      'Type': row['Type'],
      'Website': row['Website'],
  }
  # str() lets a missing Skills cell (NaN) pass through as the entry "nan".
  for s in str(row['Skills']).split(", "):
    label = group_skill(s)
    if not label:
      # Report skills no group claims; str(company) keeps the report from
      # raising when the company name itself is missing.
      print("Skill: \"" + str(s) + "\", at company: " + "\"" + str(company) + "\"")
      label = "None"
    for column, value in shared.items():
      new_data[column].append(value)
    new_data['Skill Label'].append(label)
    new_data['Skill'].append(s)

In [0]:
# Materialize the skill-expanded records as a DataFrame (one row per skill).
new_df = pd.DataFrame(new_data)
# new_df.to_csv("master_skill_expand.csv", index=False)

In [0]:
# temp = new_df.loc[new_df["Sector"]=="Law"]
# temp.head(10)

In [0]:
# temp.loc[temp["Skill Label"]=="Miscellaneous"]["Skill"].value_counts()

In [0]:
# Skill-label frequencies for Business-sector rows.
new_df.loc[new_df["Sector"] == "Business", "Skill Label"].value_counts()

Soft Skill                        287
Microsoft Office                  118
Programming Language/Framework     41
Miscellaneous                      38
Business Skill                     24
Software/Technology                17
Research                            6
Development Technology              5
Oracle                              4
Office Skill                        2
Data Related                        1
Health Professional Skill           1
Cloud Computing                     1
Name: Skill Label, dtype: int64

In [0]:
# Skill-label frequencies for Tech-sector rows.
new_df.loc[new_df["Sector"] == "Tech", "Skill Label"].value_counts()

Programming Language/Framework    281
Soft Skill                        183
Development Technology            138
Software/Technology                57
Microsoft Office                   49
Miscellaneous                      30
Cloud Computing                    29
Data Related                       23
Business Skill                     15
Adobe Suite                         5
Research                            3
Health Professional Skill           2
Office Skill                        1
Name: Skill Label, dtype: int64

In [0]:
# Skill-label frequencies for Health-sector rows.
new_df.loc[new_df["Sector"] == "Health", "Skill Label"].value_counts()

Soft Skill                        136
Programming Language/Framework     31
Microsoft Office                   27
Miscellaneous                      18
Software/Technology                17
Development Technology             15
Data Related                       15
Health Professional Skill           8
Cloud Computing                     7
Business Skill                      5
Adobe Suite                         4
Research                            1
Name: Skill Label, dtype: int64

In [0]:
# Skill-label frequencies for Law-sector rows.
new_df.loc[new_df["Sector"] == "Law", "Skill Label"].value_counts()

Soft Skill                        240
Research                           60
Microsoft Office                   60
Programming Language/Framework      1
Name: Skill Label, dtype: int64

In [0]:
# tech_expand = new_df.loc[new_df["Sector"] == "Tech"]
# business_expand = new_df.loc[new_df["Sector"] == "Business"]
# health_expand = new_df.loc[new_df["Sector"] == "Health"]
# law_expand = new_df.loc[new_df["Sector"] == "Law"]
# tech_expand.to_csv("tech_skill_expand.csv", index=False)
# business_expand.to_csv("business_skill_expand.csv", index=False)
# health_expand.to_csv("health_skill_expand.csv", index=False)
# law_expand.to_csv("law_skill_expand.csv", index=False)

## Grouping and Expanding Jobs

In [0]:
# old_df = df
# list(old_df)

In [0]:
# Column-wise accumulator for the job-expanded table: each output column
# starts as its own empty list and is filled one value per (company, job).
new_data = {
    column: []
    for column in (
        'Company',
        'CUNY Alumni Count',
        'NYC Professional Count',
        'Year Founded',
        'Sector',
        'Industry',
        'Group',
        'Job',
        'Major',
        'Skill',
        'Type',
        'Website',
    )
}

In [0]:
# Expand the master table to one record per (company, job title): every entry
# of the "Job(s) or Opportunities" cell becomes its own row, tagged with the
# group returned by the sector-specific group_* function. Records are
# appended to `new_data`.
for i in range(len(df)):  # cover every row rather than a hard-coded 405
  row = df.iloc[i]  # fetch the row once instead of once per column
  sector = row['Sector']
  # str() keeps a missing job cell (NaN) from raising on .split, matching how
  # majors and skills are handled elsewhere in this notebook.
  job = str(row['Job(s) or Opportunities'])
  # Law rows are split on ". "; every other sector is split on ", ".
  separator = ". " if sector == "Law" else ", "
  # Values copied unchanged onto every expanded row of this company.
  shared = {
      'Company': row['Company'],
      'CUNY Alumni Count': row['# of CUNY Alumni'],
      'NYC Professional Count': row['# of NYC Professionals'],
      'Year Founded': row['Year Founded'],
      'Sector': sector,
      'Industry': row['Industry'],
      'Major': row['Major(s)'],
      'Skill': row['Skills'],
      'Type': row['Type'],
      'Website': row['Website'],
  }
  for p in job.split(separator):
    if sector == "Tech":
      label = group_tech(p)
    elif sector == "Business":
      label = group_business(p)
    elif sector == "Health":
      label = group_health(p)
    elif sector == "Law":
      label = group_law(p)
    else:
      print("Sector not specified at row: " + str(i))
      label = "Error"
    if not label:
      # The falsy label is still stored as-is; this only flags the row.
      print("Position not being grouped at row: " + str(i))

    for column, value in shared.items():
      new_data[column].append(value)
    new_data['Group'].append(label)
    new_data['Job'].append(p)
  
  

In [0]:
# Assemble the accumulated lists into a DataFrame, one column per new_data key.
# Note: this rebinds new_df, which the "Skill Label" cells above used for a
# different frame.
new_df = pd.DataFrame(new_data)

In [0]:
# new_df.to_csv("master_job_expand.csv", index=False)

In [0]:
# # old_df.to_csv("master.csv", index=False)
# tech_expand = new_df.loc[new_df["Sector"] == "Tech"]
# business_expand = new_df.loc[new_df["Sector"] == "Business"]
# health_expand = new_df.loc[new_df["Sector"] == "Health"]
# law_expand = new_df.loc[new_df["Sector"] == "Law"]
# tech_expand.to_csv("tech_job_expand.csv", index=False)
# business_expand.to_csv("business_job_expand.csv", index=False)
# health_expand.to_csv("health_job_expand.csv", index=False)
# law_expand.to_csv("law_job_expand.csv", index=False)

In [95]:
# Number of major entries listed for Law-sector companies (duplicates kept,
# since get_major is called with its default unique=False).
pd.Series(get_major(df, sector='Law')).count()

150

In [96]:
# Number of major entries listed for Tech-sector companies (duplicates kept,
# since get_major is called with its default unique=False).
pd.Series(get_major(df, sector='Tech')).count()

388

In [97]:
# Number of major entries listed for Business-sector companies (duplicates
# kept, since get_major is called with its default unique=False).
pd.Series(get_major(df, sector='Business')).count()

422

In [98]:
# Number of major entries listed for Health-sector companies (duplicates kept,
# since get_major is called with its default unique=False).
pd.Series(get_major(df, sector='Health')).count()

292

In [99]:
# Number of skill entries listed for Law-sector companies (duplicates kept,
# since get_skill is called with its default unique=False).
pd.Series(get_skill(df, sector='Law')).count()

361

In [100]:
# Number of skill entries listed for Business-sector companies (duplicates
# kept, since get_skill is called with its default unique=False).
pd.Series(get_skill(df, sector='Business')).count()

545

In [101]:
# Number of skill entries listed for Health-sector companies (duplicates kept,
# since get_skill is called with its default unique=False).
pd.Series(get_skill(df, sector='Health')).count()

284

In [102]:
# Number of skill entries listed for Tech-sector companies (duplicates kept,
# since get_skill is called with its default unique=False).
pd.Series(get_skill(df, sector='Tech')).count()

816

In [104]:
# Count the non-null entries in the list get_position returns for the Law sector.
pd.Series(get_position(df,'Law')).count()

367

In [105]:
# Count the non-null entries in the list get_position returns for the Business sector.
pd.Series(get_position(df,'Business')).count()

647

In [106]:
# Count the non-null entries in the list get_position returns for the Health sector.
pd.Series(get_position(df,'Health')).count()

245

In [107]:
# Count the non-null entries in the list get_position returns for the Tech sector.
pd.Series(get_position(df,'Tech')).count()

375