# 数据预处理

## 编码格式转换

In [6]:
pip install chardet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [19]:
import os
import chardet
import pandas as pd

def get_encoding(filename):
    """
    检测并返回文件的编码格式。
    """
    with open(filename, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']

def convert_to_utf8(filename):
    """
    将文件转换为UTF-8编码格式。
    """
    encoding = get_encoding(filename)  # 获取文件的原始编码
    try:
        # 尝试使用检测到的编码读取文件
        df = pd.read_csv(filename, encoding=encoding)
    except UnicodeDecodeError:
        # 如果检测到的编码失败，尝试使用常见的编码
        try:
            df = pd.read_csv(filename, encoding='gbk')  # 尝试使用GBK编码
        except UnicodeDecodeError:
            df = pd.read_csv(filename, encoding='gb18030')  # 尝试使用GB18030编码

    # 生成新的文件名
    base, ext = os.path.splitext(filename)
    new_filename = f"{base}_utf8{ext}"

    # 保存为UTF-8编码的新文件
    df.to_csv(new_filename, encoding='utf-8', index=False)
    print(f"文件已转换为UTF-8编码并保存为：{new_filename}")

def batch_convert_to_utf8(path, ext_name='csv'):
    """
    批量转换指定目录下所有指定后缀的文件为UTF-8编码。
    """
    for filename in os.listdir(path):
        if filename.endswith('.' + ext_name):
            full_path = os.path.join(path, filename)
            convert_to_utf8(full_path)

# 使用示例
if __name__ == "__main__":
    # 使用当前工作目录
    path = os.getcwd()  # 获取当前工作目录
    print(f"当前工作目录：{path}")
    batch_convert_to_utf8(path)  # 调用批量转换函数

当前工作目录：/Users/os/Desktop/mdm2222
文件已转换为UTF-8编码并保存为：/Users/os/Desktop/mdm2222/Cleaned_DS_Jobs_utf8_utf8.csv
文件已转换为UTF-8编码并保存为：/Users/os/Desktop/mdm2222/Cleaned_DS_Jobs_utf8_utf8_utf8.csv
文件已转换为UTF-8编码并保存为：/Users/os/Desktop/mdm2222/DataAnalyst_utf8_utf8.csv
文件已转换为UTF-8编码并保存为：/Users/os/Desktop/mdm2222/DataAnalyst_utf8_utf8_utf8.csv


## 数据整合

In [1]:
# 导入必要库
import pandas as pd
import numpy as np
import re

# # 设置 Pandas 显示所有列
# pd.set_option('display.max_columns', None)  # 设置为 None，表示显示所有列
# pd.set_option('display.max_colwidth', None)  # 设置为 None，表示显示完整的列宽
# pd.set_option('display.width', None)  # 设置为 None，表示显示完整的宽度


# 1. 文件读取

# 读取两个CSV文件
df_ds = pd.read_csv('Cleaned_DS_Jobs_utf8.csv')
df_da = pd.read_csv('DataAnalyst_utf8.csv')

# 计算公司成立年份
df_ds['Founded'] = 2020 - df_ds['company_age']
df_ds.drop(columns=['company_age'], inplace=True)

# 处理公司名称中的评分信息（如"Vera Institute of Justice\n3.2"）
df_da['Company Name'] = df_da['Company Name'].str.extract(r'(.+?)(?:\n(\d+\.\d+))?$')[0]
df_da.drop(columns=['Unnamed: 0'], inplace=True)

In [3]:
df_da

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,"New York, NY","New York, NY",201 to 500 employees,1961,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD),-1,TRUE
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,"New York, NY","New York, NY",10000+ employees,1893,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,-1
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We__e looking for a Senior Data Analyst who ha...,3.4,Squarespace,"New York, NY","New York, NY",1001 to 5000 employees,2003,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,GoDaddy,-1
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,"New York, NY","McLean, VA",201 to 500 employees,2002,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD),-1,-1
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,"New York, NY","New York, NY",501 to 1000 employees,2009,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD),DraftKings,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2248,RQS - IHHA - 201900004460 -1q Data Security An...,$78K-$104K (Glassdoor est.),Maintains systems to protect data from unautho...,2.5,"Avacend, Inc.","Denver, CO","Alpharetta, GA",51 to 200 employees,-1,Company - Private,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable,-1,-1
2249,Senior Data Analyst (Corporate Audit),$78K-$104K (Glassdoor est.),Position:\nSenior Data Analyst (Corporate Audi...,2.9,Arrow Electronics,"Centennial, CO","Centennial, CO",10000+ employees,1935,Company - Public,Wholesale,Business Services,$10+ billion (USD),"Avnet, Ingram Micro, Tech Data",-1
2250,"Technical Business Analyst (SQL, Data analytic...",$78K-$104K (Glassdoor est.),"Title: Technical Business Analyst (SQL, Data a...",-1.0,Spiceorb,"Denver, CO",-1,-1,-1,-1,-1,-1,-1,-1,-1
2251,"Data Analyst 3, Customer Experience",$78K-$104K (Glassdoor est.),Summary\n\nResponsible for working cross-funct...,3.1,Contingent Network Services,"Centennial, CO","West Chester, OH",201 to 500 employees,1984,Company - Private,Enterprise Software & Network Solutions,Information Technology,$25 to $50 million (USD),-1,-1


In [5]:
# =====================
# 第四步：薪资标准化
# =====================

# 定义标准化薪资的函数
def standardize_salary(s):
    if isinstance(s, str):
        # 检查是否只包含 '-1'
        if s.strip() == '-1':
            return [np.nan, np.nan, np.nan]  # 直接返回无效数据标记
        
        # 处理 df_da 格式: $37K-$66K (Glassdoor est.)
        numbers = re.findall(r'\$(\d+)K', s)
        if len(numbers) == 2:
            try:
                # 将提取的数字转换为整数，并计算平均值
                numbers = [int(n) * 1000 for n in numbers]
                avg = (numbers[0] + numbers[1]) / 2  # 使用浮点除法
                return numbers + [avg]
            except ValueError as e:
                print(f"Error processing '{s}': {e}")
                return [np.nan, np.nan, np.nan]
        
        # 处理 df_ds 格式: 56-97
        elif '-' in s:
            numbers = s.split('-')
            if len(numbers) == 2:
                try:
                    # 将提取的数字转换为整数，并计算平均值
                    numbers = [int(n) * 1000 for n in numbers]
                    avg = (numbers[0] + numbers[1]) / 2  # 使用浮点除法
                    return numbers + [avg]
                except ValueError:
                    print(f"Error processing '{s}': Invalid number format")
                    return [np.nan, np.nan, np.nan]
            else:
                print(f"Invalid format for '{s}'")
                return [np.nan, np.nan, np.nan]
        else:
            print(f"Invalid format for '{s}'")
            return [np.nan, np.nan, np.nan]
    else:
        print(f"Invalid input type: {type(s)}")
        return [np.nan, np.nan, np.nan]
# 应用标准化函数
salary_data_ds = df_ds['Salary Estimate'].apply(standardize_salary).apply(pd.Series)
salary_data_ds.columns = ['Salary_Min', 'Salary_Max', 'Salary_Avg']

# 将处理后的数据合并到原始 DataFrame 中
df_ds = pd.concat([df_ds, salary_data_ds], axis=1)
# 删除原有数据
df_ds.drop(columns=['Salary Estimate','min_salary', 'max_salary', 'avg_salary'], inplace=True)


# 应用标准化函数
salary_data_da = df_da['Salary Estimate'].apply(standardize_salary).apply(pd.Series)
salary_data_da.columns = ['Salary_Min', 'Salary_Max', 'Salary_Avg']

# 将处理后的数据合并到原始 DataFrame 中
df_da = pd.concat([df_da, salary_data_da], axis=1)
df_da.drop(columns=['Salary Estimate'], inplace=True)


In [7]:
# =====================
# 第五步：地理位置标准化
# =====================

def parse_location(loc):
    """将"New York, NY"分解为城市和州"""
    if pd.notna(loc):
        parts = loc.split(', ')
        if len(parts) == 2:
            return parts[0], parts[1]
    return np.nan, np.nan

for df in [df_ds, df_da]:
    df[['City', 'State']] = df['Location'].apply(parse_location).apply(pd.Series)
    df.drop(columns=['Location'], inplace=True)

In [17]:
# 定义统一的列结构
unified_columns = [
    # 工作基础信息
    'Job Title', 'Job Description', 'City', 'State', 'job_state', 'same_state', 'job_simp', 'seniority', 'Easy Apply', 'Competitors',
    # 公司信息
    'Company Name', 'Headquarters', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Founded', 'Rating',
    # 技能信息
    'python', 'excel', 'hadoop', 'spark', 'aws', 'tableau', 'big_data',
    # 工资信息
    'Salary_Min', 'Salary_Max', 'Salary_Avg'
]

# 获取两个数据集的列名
columns_ds = set(df_ds.columns)
columns_da = set(df_da.columns)

# 找出 df_ds 中存在但 df_da 中不存在的列
missing_in_da = columns_ds - columns_da

# 找出 df_da 中存在但 df_ds 中不存在的列
missing_in_ds = columns_da - columns_ds

# =====================
# 第三步：处理 df_da 特有列
# =====================
# 添加 df_ds 中缺失的列到 df_da，并填充默认值
for col in missing_in_da:
    df_da[col] = np.nan  # 标记为待后续处理

# =====================
# 第四步：处理 df_ds 特有列
# =====================
# 添加 df_da 中缺失的列到 df_ds，并填充默认值
for col in missing_in_ds:
    df_ds[col] = np.nan

# =====================
# 第五步：确保两个 DataFrame 的列顺序一致
# =====================
# 重新排列列顺序，确保两个 DataFrame 的列顺序一致
df_ds = df_ds[unified_columns]
df_da = df_da[unified_columns]

# =====================
# 第六步：最终合并
# =====================
# 合并数据集
combined_df = pd.concat([df_ds, df_da], ignore_index=True)


# 保存结果
combined_df.to_csv('./outputs/combined_jobs_dataset.csv', index=False)

In [13]:
combined_df

Unnamed: 0,Job Title,Job Description,City,State,job_state,same_state,job_simp,seniority,Easy Apply,Competitors,...,python,excel,hadoop,spark,aws,tableau,big_data,Salary_Min,Salary_Max,Salary_Avg
0,Sr Data Scientist,Description\n\nThe Senior Data Scientist is re...,New York,NY,NY,1.0,data scientist,senior,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,137000.0,171000.0,154000.0
1,Data Scientist,"Secure our Nation, Ignite your Future\n\nJoin ...",Chantilly,VA,VA,0.0,data scientist,na,,,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,137000.0,171000.0,154000.0
2,Data Scientist,Overview\n\n\nAnalysis Group is one of the lar...,Boston,MA,MA,1.0,data scientist,na,,,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,137000.0,171000.0,154000.0
3,Data Scientist,JOB DESCRIPTION:\n\nDo you have a passion for ...,Newton,MA,MA,0.0,data scientist,na,,,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,137000.0,171000.0,154000.0
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,New York,NY,NY,1.0,data scientist,na,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,137000.0,171000.0,154000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2908,RQS - IHHA - 201900004460 -1q Data Security An...,Maintains systems to protect data from unautho...,Denver,CO,,,,,-1,-1,...,,,,,,,,78000.0,104000.0,91000.0
2909,Senior Data Analyst (Corporate Audit),Position:\nSenior Data Analyst (Corporate Audi...,Centennial,CO,,,,,-1,"Avnet, Ingram Micro, Tech Data",...,,,,,,,,78000.0,104000.0,91000.0
2910,"Technical Business Analyst (SQL, Data analytic...","Title: Technical Business Analyst (SQL, Data a...",Denver,CO,,,,,-1,-1,...,,,,,,,,78000.0,104000.0,91000.0
2911,"Data Analyst 3, Customer Experience",Summary\n\nResponsible for working cross-funct...,Centennial,CO,,,,,-1,-1,...,,,,,,,,78000.0,104000.0,91000.0


## 数据处理