In [10]:
import pandas as pd
import numpy as np

In [2]:
# Column labels, in order, that are handed to the CSV reader as `names`.
fields = (
    "age workclass fnlwgt education education_num "
    "marital_status occupation relationship race sex "
    "capital_gain capital_loss hours_per_week native_country income"
).split()

In [3]:
# Location of the input file, relative to the working directory.
file_path = 'data/adult.data'

# Column names are supplied through `fields`; cells equal to "?" are read as
# missing values and whitespace following each delimiter is skipped.
df = pd.read_csv(
    file_path,
    names=fields,
    skipinitialspace=True,
    na_values="?",
)

In [4]:
# Collect the distinct values found in the marital_status column and show them.
marital_status_column = df['marital_status']
unique_marital_status = marital_status_column.unique()
print(
    "Unique marital status values in the dataset:",
    unique_marital_status,
)

Unique marital status values in the dataset: ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']


In [5]:
# Preview the first rows of the freshly loaded frame.
print(df.head())
# DataFrame.info() writes its summary itself and returns None, so it is called
# bare; wrapping it in print() appended a stray "None" line to the output.
df.info()

   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [6]:
# Collapse the seven detailed marital_status categories into two coarse labels.
marital_status_mapping = {
    'Married-civ-spouse': 'Married',
    'Widowed': 'Married',
    'Married-spouse-absent': 'Married',
    'Married-AF-spouse': 'Married',
    'Separated': 'Married',
    'Divorced': 'Unmarried',
    'Never-married': 'Unmarried'
}
# Series.replace leaves values that are not keys of the mapping untouched, so
# running this cell a second time is a no-op. A .map(...) call here would turn
# every non-key value (including the already-collapsed 'Married'/'Unmarried')
# into NaN and blank the whole column on a re-run.
df['marital_status'] = df['marital_status'].replace(marital_status_mapping)

In [7]:
# Preview the first rows again to check the recoded marital_status column.
print(df.head())
# info() prints its own report and returns None; calling it without print()
# avoids emitting an extra "None" line after the summary.
df.info()

   age         workclass  fnlwgt  education  education_num marital_status  \
0   39         State-gov   77516  Bachelors             13      Unmarried   
1   50  Self-emp-not-inc   83311  Bachelors             13        Married   
2   38           Private  215646    HS-grad              9      Unmarried   
3   53           Private  234721       11th              7        Married   
4   28           Private  338409  Bachelors             13        Married   

          occupation   relationship   race     sex  capital_gain  \
0       Adm-clerical  Not-in-family  White    Male          2174   
1    Exec-managerial        Husband  White    Male             0   
2  Handlers-cleaners  Not-in-family  White    Male             0   
3  Handlers-cleaners        Husband  Black    Male             0   
4     Prof-specialty           Wife  Black  Female             0   

   capital_loss  hours_per_week native_country income  
0             0              40  United-States  <=50K  
1             0 

In [8]:
# Write the current frame to disk as bare CSV rows: no header line, no index column.
df.to_csv(
    'pro_adult_data.data',
    header=False,
    index=False,
)

In [11]:
# Split the rows into consecutive, fixed-size partitions and record each row's
# partition number in a new `partition_id` column.
total_rows = len(df)
partition_size = 6514
num_partitions = total_rows // partition_size  # number of complete partitions

# Row position // partition_size yields ids 0, 1, 2, ...; when total_rows is
# not a multiple of partition_size, the leftover rows form one final, smaller
# partition whose id equals num_partitions.
df['partition_id'] = np.arange(total_rows) // partition_size

# NOTE(review): a clamp on `partition_id > num_partitions` used to follow here.
# The largest id is (total_rows - 1) // partition_size <= num_partitions, so it
# could never match a row and has been dropped. If the leftover rows should be
# folded into the last complete partition instead, that needs an explicit remap
# of id == num_partitions to num_partitions - 1 -- confirm intent.


In [12]:
# Write the frame, now carrying the partition_id column, as bare CSV rows:
# no header line, no index column.
df.to_csv(
    'par_adult_data.data',
    header=False,
    index=False,
)