In [1]:
from explainability.structured.core.structured_manipulator import StructuredManipulator
import pandas as pd

In [2]:
# Read the first 100 rows of the UCI Adult income data. The file has no header
# row, so the column names are supplied directly to the reader. Columns that
# this walkthrough does not use are dropped straight away.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry",
        "Income",
    ],
    nrows=100,
).drop(
    columns=[
        "fnlwgt", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race",
        "NativeCountry",
    ]
)
# Name of the target column handed to the manipulator later on.
label_column = "Income"

In [3]:
# Create a new structured manipulator around the loaded DataFrame, telling it
# which column ("Income") holds the label.
sm = StructuredManipulator(df, label_column)
# View the first rows of the data held by the manipulator (sm.df).
sm.df.head()

Unnamed: 0,Age,WorkClass,Education,Gender,CapitalGain,CapitalLoss,HoursPerWeek,Income
0,48,Private,Assoc-acdm,Female,0,0,40,<=50K
1,32,Federal-gov,HS-grad,Male,0,0,40,<=50K
2,40,Private,Doctorate,Male,0,0,60,>50K
3,31,Local-gov,7th-8th,Male,0,0,40,<=50K
4,34,Local-gov,Bachelors,Male,0,0,40,>50K


In [4]:
# Add two copies of the "Age" column, named "Age_cat" and "Age_replace";
# later cells modify the copies while "Age" itself stays untouched.
sm.duplicate_features(
    column="Age",
    num_dups=2,
    dup_col_names=["Age_cat", "Age_replace"],
)
# Preview the frame: the two new columns appear after "Income".
sm.df.head()

Unnamed: 0,Age,WorkClass,Education,Gender,CapitalGain,CapitalLoss,HoursPerWeek,Income,Age_cat,Age_replace
0,48,Private,Assoc-acdm,Female,0,0,40,<=50K,48,48
1,32,Federal-gov,HS-grad,Male,0,0,40,<=50K,32,32
2,40,Private,Doctorate,Male,0,0,60,>50K,40,40
3,31,Local-gov,7th-8th,Male,0,0,40,<=50K,31,31
4,34,Local-gov,Bachelors,Male,0,0,40,>50K,34,34


In [5]:
# Commands can also be chained.
# First bin "Age_cat" into interval categories, then overwrite a random share
# of "Age_replace" with a single replacement value.
# proportion=0.5 is the value recorded for this run: the trace printed further
# down reports 'proportion': 0.5 for replace_random_values, and the preview
# below shows only part of the rows replaced.
(
    sm
    .categorize(column="Age_cat")
    .replace_random_values(column="Age_replace", proportion=0.5)
)

# Preview the result of both operations.
sm.df.head()

Unnamed: 0,Age,WorkClass,Education,Gender,CapitalGain,CapitalLoss,HoursPerWeek,Income,Age_cat,Age_replace
0,48,Private,Assoc-acdm,Female,0,0,40,<=50K,"(18.0, 48.5]",48.0
1,32,Federal-gov,HS-grad,Male,0,0,40,<=50K,"(18.0, 48.5]",32.0
2,40,Private,Doctorate,Male,0,0,60,>50K,"(18.0, 48.5]",38.39
3,31,Local-gov,7th-8th,Male,0,0,40,<=50K,"(18.0, 48.5]",31.0
4,34,Local-gov,Bachelors,Male,0,0,40,>50K,"(18.0, 48.5]",38.39


In [6]:
# Relabel a share (proportion=0.75) of the " Male" entries in "Gender" as "M",
# so that one category ends up represented by two different values.
# NOTE(review): the leading space in " Male" presumably matches the raw CSV
# values as loaded (no whitespace stripping on read) - confirm before changing.
sm.split_category_value(
    column="Gender",
    dup_value=" Male",
    new_value="M",
    proportion=0.75,
)
# Preview the frame: "Gender" now mixes "Male" and "M".
sm.df.head()

Unnamed: 0,Age,WorkClass,Education,Gender,CapitalGain,CapitalLoss,HoursPerWeek,Income,Age_cat,Age_replace
0,48,Private,Assoc-acdm,M,0,0,40,<=50K,"(18.0, 48.5]",48.0
1,32,Federal-gov,HS-grad,M,0,0,40,<=50K,"(18.0, 48.5]",32.0
2,40,Private,Doctorate,M,0,0,60,>50K,"(18.0, 48.5]",38.39
3,31,Local-gov,7th-8th,Male,0,0,40,<=50K,"(18.0, 48.5]",31.0
4,34,Local-gov,Bachelors,M,0,0,40,>50K,"(18.0, 48.5]",38.39


In [7]:
# View list of performed operations.
# The printed trace shows each operation name followed by the arguments
# recorded for it, in the order the operations were applied.
print(sm.trace)


duplicate_features
{'column': 'Age', 'dup_col_names': ['Age_cat', 'Age_replace'], 'num_dups': 2}


categorize
{'bin_names': [(18.0, 48.5], (48.5, 79.0]]
Categories (2, interval[float64, right]): [(18.0, 48.5] < (48.5, 79.0]],
 'bins': array([18. , 48.5, 79. ]),
 'column': 'Age_cat',
 'num_bins': 2}


replace_random_values
{'column': 'Age_replace', 'proportion': 0.5, 'value': 38.39}


split_category_value
{'column': 'Gender', 'dup_value': ' Male', 'new_value': 'M'}



In [8]:
# Sorting before splitting matters: the train-test split respects the sort
# order, so train and test can end up with different feature distributions.
sm.sort_values(column="CapitalLoss", ascending=False)
X_train, y_train, X_test, y_test = sm.train_test_split()
# Compare the mean "CapitalLoss" of the train features against the test features.
tuple(features["CapitalLoss"].mean() for features in (X_train, X_test))

(110.3375, 0.0)