In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# jupyter notebook --generate-config

In [3]:
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income'
]
df = pd.read_csv('adult/adult.data', header=None, names=column_names)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
df = df.drop(columns=['education-num'])

In [5]:
# Preprocessing age ang convert init age categories to 10s
def clip_age(x):
    result = np.clip(int(x) / 10, 1, 9)
    return int(result)


df['age'] = df['age'].map(clip_age)

In [6]:
# Encode categorical columns
def encode_categorical(df, column, mapping):
    def encode(x):
        if pd.isna(x) or x.strip() == '?':
            # all unknown values are mapped to 100
            return -1
        else:
            return mapping.index(x.strip())

    df[column] = df[column].map(encode).astype(int)
    # df[column].map(encode).astype(pd.Int64Dtype())
    return df[column]


workclass_map = ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay',
                 'Never-worked']
education_map = ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th',
                 '7th-8th', '12th', 'Masters', '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool']
marital_status_map = ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
                      'Married-spouse-absent', 'Married-AF-spouse']
occupation_map = ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty',
                  'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving',
                  'Priv-house-serv', 'Protective-serv', 'Armed-Forces']
relationship_map = ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried']
race_map = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']
sex_map = ['Female', 'Male']
native_country_map = ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
                      'Outlying-US(Guam-USVI-etc)',
                      'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines',
                      'Italy',
                      'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France',
                      'Dominican-Republic',
                      'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua',
                      'Scotland',
                      'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong',
                      'Holand-Netherlands']
income_map = ['<=50K', '>50K']

df['workclass'] = encode_categorical(df, 'workclass', workclass_map)
df['education'] = encode_categorical(df, 'education', education_map)
df['marital-status'] = encode_categorical(df, 'marital-status', marital_status_map)
df['occupation'] = encode_categorical(df, 'occupation', occupation_map)
df['relationship'] = encode_categorical(df, 'relationship', relationship_map)
df['race'] = encode_categorical(df, 'race', race_map)
df['sex'] = encode_categorical(df, 'sex', sex_map)
df['native-country'] = encode_categorical(df, 'native-country', native_country_map)
df['income'] = encode_categorical(df, 'income', income_map)

In [7]:
# Encoding of capital gain and capital loss
def encode_capital_gain(x):
    if pd.isna(x) or x == '?':
        return 100
    elif x < 1000:
        return 0
    else:
        return int(x / 1000)


df['capital-gain'] = df['capital-gain'].map(encode_capital_gain)
df['capital-loss'] = df['capital-loss'].map(encode_capital_gain)

In [8]:
def encode_fnlwgt(x):
    if pd.isna(x) or x == '?':
        return 100
    elif x < 20000:
        return 0
    else:
        return int(x / 20000)


df['fnlwgt'] = df['fnlwgt'].map(encode_fnlwgt)

In [9]:
df

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,3,5,3,0,2,8,3,0,1,2,0,40,0,0
1,5,1,4,0,0,4,2,0,1,0,0,13,0,0
2,3,0,10,3,1,6,3,0,1,0,0,40,0,0
3,5,0,11,2,0,6,2,4,1,0,0,40,0,0
4,2,0,16,0,0,5,0,4,0,0,0,40,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,2,0,12,5,0,0,0,0,0,0,0,38,0,0
32557,4,0,7,3,0,7,2,0,1,0,0,40,0,1
32558,5,0,7,3,4,8,5,0,0,0,0,40,0,0
32559,2,0,10,3,2,8,1,0,1,0,0,20,0,0


In [10]:
df.to_csv('../adult.csv', index=False)

In [154]:
# Bin the Age column and map the bin labels to the encoding dictionary
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, np.inf]
# labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99']
# df['age'] = pd.cut(df['age'], bins=bins, labels=labels)
#
# # Manually define the bins and labels
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, np.inf]
# labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']
#
# # Apply the function and create the age category column
# df['age'] = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)
# df['age'] = df['age'].map(lambda x: int(np.clip(int(x) / 10, 1, 9)))


# Define the age bin encoding dictionary
# age_bin_encoding = {0: '0-9', 1: '10-19', 2: '20-29', 3: '30-39', 4: '40-49', 5: '50-59', 6: '60-69', 7: '70-79',
#                     8: '80-89', 9: '90-99'}
# df['age'] = df['age'].map(age_bin_encoding)

In [152]:
# def encode_features(data, features):
#     """
#     Encodes numerical features by dividing by a factor and clipping within a range.
#
#     Args:
#         data: A pandas DataFrame containing the features.
#         features: A list of feature names to encode.
#
#     Returns:
#         A new DataFrame with encoded features.
#     """
#     encoded_data = data.copy()  # Create a copy to avoid modifying original data
#     for feature in features:
#         # Get maximum value for the feature
#         max_value = data[feature].max()
#         # Define scaling factor and clipping range (adjust as needed)
#         scaling_factor = 100  # Adjust scaling factor based on value range
#         clipping_range = (1, 10)  # Adjust clipping range as needed
#         encoded_data[feature] = np.clip(data[feature] / scaling_factor, clipping_range[0], clipping_range[1])
#     return encoded_data
#
#
# # Example usage
# encoded_data = encode_features(df.copy(), features=['fnlwgt', 'education-num', 'capital-gain', 'capital-loss'])


In [107]:
def encode_workclass(x):
    workclass_map = ['Never-worked', 'Without-pay', 'Self-emp-inc', 'Self-emp-not-inc', 'Private', 'Local-gov',
                     'State-gov', 'Federal-gov']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return workclass_map.index(x.strip())


df['workclass'] = df['workclass'].map(encode_workclass)

AttributeError: 'float' object has no attribute 'strip'

In [36]:
def encode_education(x):
    education_map = ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th',
                     '7th-8th', '12th', 'Masters', '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return education_map.index(x.strip())


df['education'] = df['education'].map(encode_education)

In [37]:
def encode_marital_status(x):
    # "Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse"
    marital_status_map = ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
                          'Married-spouse-absent', 'Married-AF-spouse']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return marital_status_map.index(x.strip())


df['marital-status'] = df['marital-status'].map(encode_marital_status)

In [38]:
def encode_occupation(x):
    occupation_map = ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty',
                      'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving',
                      'Priv-house-serv', 'Protective-serv', 'Armed-Forces']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return occupation_map.index(x.strip())


df['occupation'] = df['occupation'].map(encode_occupation)

In [39]:
def encode_relationship(x):
    relationship_map = ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return relationship_map.index(x.strip())


df['relationship'] = df['relationship'].map(encode_relationship)

In [40]:
def encode_race(x):
    race_map = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return race_map.index(x.strip())


df['race'] = df['race'].map(encode_race)

In [41]:
def encode_sex(x):
    sex_map = ['Female', 'Male']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return sex_map.index(x.strip())


df['sex'] = df['sex'].map(encode_sex)

In [42]:
def encode_native_country(x):
    native_country_map = ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
                          'Outlying-US(Guam-USVI-etc)',
                          'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines',
                          'Italy',
                          'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France',
                          'Dominican-Republic',
                          'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua',
                          'Scotland',
                          'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong',
                          'Holand-Netherlands']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return native_country_map.index(x.strip())


df['native-country'] = df['native-country'].map(encode_native_country)

In [43]:
def encode_income(x):
    income_map = ['<=50K', '>50K']
    if pd.isna(x) or x.strip() == '?':
        return np.nan
    else:
        return income_map.index(x.strip())
    # return income_map.index(x.strip())


df['income'] = df['income'].map(encode_income)

In [45]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,3,6.0,77516,0,13,2,8.0,3,0,1,2,0,40,0.0,0
1,5,3.0,83311,0,13,0,4.0,2,0,1,0,0,13,0.0,0
2,3,4.0,215646,3,9,1,6.0,3,0,1,0,0,40,0.0,0
3,5,4.0,234721,2,7,0,6.0,2,4,1,0,0,40,0.0,0
4,2,4.0,338409,0,13,0,5.0,0,4,0,0,0,40,12.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,2,4.0,257302,5,12,0,0.0,0,0,0,0,0,38,0.0,0
32557,4,4.0,154374,3,9,0,7.0,2,0,1,0,0,40,0.0,1
32558,5,4.0,151910,3,9,4,8.0,5,0,0,0,0,40,0.0,0
32559,2,4.0,201490,3,9,2,8.0,1,0,1,0,0,20,0.0,0


In [111]:
df.to_csv('')

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the Adult dataset (you can replace this with your own data loading code)
# Assuming the dataset is in a CSV file named 'adult.csv'
# data = pd.read_csv('adult.csv')

# Define features and target variable
X = df.drop(columns=['income'])  # Features
y = df['income']  # Target variable

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_features = ['age', 'hours-per-week']
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessor on the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Now you can use X_train_preprocessed and X_test_preprocessed for training your ML model
# Remember to handle fairness-aware techniques as needed (e.g., reweighting, bias mitigation)

# Example: Train a logistic regression model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train_preprocessed, y_train)

# Evaluate model performance
accuracy = model.score(X_test_preprocessed, y_test)
print(f"Accuracy on test set: {accuracy:.2f}")

# Note: You can further explore fairness metrics and adjust preprocessing steps based on your requirements.


Accuracy on test set: 0.83
