In [10]:
import pandas as pd
from collections import Counter
import csv

# Skill labels to leave out of the counts; converted to a set and passed as
# the exclusion set to count_skills further down.
exclude_skills = [
    "Disabilities",
    "Levelling",
    "Equalization",
    "Maintainability",
    "Activism",
    "Medic",
    "Survey Data Analysis",
    "Survey Data Collection",
    "Additives",
    "Industrialization",
    "Coloring",
    "Accessioning",
    "Minimum Data Set",
    "Tooling",
    "Dashboard",
    "Personalization",
    "Dataset",
    "Source Data",
    "Executable",
    "Limiter",
    "Collections",
    "Visualization",
    "Job Descriptions",
    "Digitization",
    "Centering",
    "Receivables",
    "Data Analysis",
    "Data Science",
    "Metadata",
    "Algorithms",
    "Computer Science",
    "Vaccination",
    "Finance",
    "Statistics",
    "Data Quality",
    "Resourcing",
    "Automation",
    "Market Data",
    "Analytics",
    "Financial Data",
    "Banking",
    "Physics",
    "Validations",
    "Sustainability",
    "Commercialization",
    "Claims Processing",
    "E (Programming Language)",
    "Tracking (Commercial Airline Flight)",
    "Track (Rail Transport)",
    "Consumables",
    "Life Insurance Sales",
    "Component Object Model (COM)",
    "Sage SAFE X3",
    "Google Ads",
    "E-Commerce",
    "Surveys",
    "Hostile Work Environment",
    "Genetics",
    "Hospitality",
    "Operations",
    "Sales",
    "Integration",
    "Consulting",
    "Management",
]

def clean_skills(skills, exclude_set):
    """Return the entries of ``skills`` that are not in ``exclude_set``.

    The relative order of the surviving entries (and any duplicates among
    them) is kept as in ``skills``.
    """
    kept = []
    for candidate in skills:
        if candidate in exclude_set:
            continue
        kept.append(candidate)
    return kept

# Load the dataset; its 'hard_skills' and 'soft_skills' columns are read
# further down when the skill counts are computed.
file_path = 'updated_dataset.csv' 
data = pd.read_csv(file_path)

def count_skills(column, exclude_set, top_n=10):
    """Count in how many rows each skill appears and return the most common.

    Every string entry of ``column`` is treated as a ``', '``-separated list
    of skill names. A skill is counted at most once per row, names found in
    ``exclude_set`` are ignored, and non-string entries (for example the NaN
    of a missing cell) are skipped.

    Args:
        column: Iterable of row values, e.g. a pandas Series.
        exclude_set: Container of skill names to leave out of the count.
        top_n: How many entries to return; ``None`` returns every skill.
            Defaults to 10.

    Returns:
        A list of ``(skill, count)`` tuples in descending count order. Skills
        with equal counts keep the order in which they were first seen.
    """
    skills_counter = Counter()

    for row in column:
        if not isinstance(row, str):
            continue

        # Strip stray whitespace and drop empty tokens (e.g. from a trailing
        # ", ") so they are not counted as separate skills.
        tokens = (token.strip() for token in row.split(', '))

        # De-duplicate within the row while keeping first-seen order. A plain
        # set would make the tie order of most_common() depend on string hash
        # randomization and therefore vary between runs.
        unique_skills = dict.fromkeys(token for token in tokens if token)

        skills_counter.update(
            skill for skill in unique_skills if skill not in exclude_set
        )

    return skills_counter.most_common(top_n)


# Rank the skills of each category, ignoring the excluded labels.
top_hard_skills = count_skills(data['hard_skills'], set(exclude_skills))
top_soft_skills = count_skills(data['soft_skills'], set(exclude_skills))

# Print both rankings, hard skills first.
for heading, ranking in (
    ("Top 10 Hard Skills:", top_hard_skills),
    ("\nTop 10 Soft Skills:", top_soft_skills),
):
    print(heading)
    for skill, count in ranking:
        print(f"{skill}: {count}")

# Tabulate the rankings and write each one to its own CSV file.
hard_skills_df = pd.DataFrame(top_hard_skills, columns=['Skill', 'Count'])
soft_skills_df = pd.DataFrame(top_soft_skills, columns=['Skill', 'Count'])

for frame, destination in (
    (hard_skills_df, 'top_10_hard_skills.csv'),
    (soft_skills_df, 'top_10_soft_skills.csv'),
):
    frame.to_csv(destination, index=False, encoding='utf-8')



Top 10 Hard Skills:
SQL (Programming Language): 8430
Python (Programming Language): 5101
Tableau (Business Intelligence Software): 4764
Data Visualization: 3641
R (Programming Language): 3320
Power BI: 3059
Business Intelligence: 2578
Quantitative Data Analysis: 2484
Ad Hoc Testing: 2424
Data Modeling: 2334

Top 10 Soft Skills:
Positivity: 8391
Communications: 7648
Collaboration: 6721
Presentations: 5389
Planning: 4980
Decisiveness: 4060
Innovation: 3991
Verbal Communication Skills: 3939
Problem Solving: 3743
Research: 3469
