# Feature importance

Today we'll talk about feature importance and feature selection. We use the same dataset as in week 5: the [Adult income dataset](https://archive.ics.uci.edu/dataset/2/adult).


In [1]:
import urllib
import urllib.request
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', None)

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
url = "https://archive.ics.uci.edu/static/public/2/adult.zip"
zip_path = "adult.zip"
data_dir = "adults"

# Download only when no valid archive is on disk, so re-running the notebook
# does not fetch the dataset again. is_zipfile() returns False for a missing
# or truncated file, so both cases trigger a fresh download.
if not zipfile.is_zipfile(zip_path):
    urllib.request.urlretrieve(url, zip_path)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(data_dir)

columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# the dataset is pre-split into train and test
# I want to do the split myself, so let's join the datasets
# (the files are read from the directory the archive was extracted into)
df1 = pd.read_csv(f"{data_dir}/adult.data", header=None, names=columns)
# first row of adult.test is weird, so skip it while parsing; this keeps it
# out of pandas' dtype inference instead of slicing it off afterwards
df2 = pd.read_csv(f"{data_dir}/adult.test", header=None, names=columns, skiprows=1)
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of both files
df = pd.concat([df1, df2], ignore_index=True)

In [3]:
# Helper function for plotting
def plot_importances(importances, features, xlabel="importance"):
    """Draw a horizontal bar chart of features ranked by importance.

    Parameters
    ----------
    importances : array-like
        One score per feature.
    features : array-like
        Feature labels, given in the same order as ``importances``.
    xlabel : str, default "importance"
        Text used for the x-axis label.

    A new 8x5 inch figure is created on every call; nothing is returned.
    """
    ranked = pd.DataFrame(
        {"feature": features, "importance": importances}
    ).sort_values("importance", ascending=False)

    # Bars follow the row order of `ranked`, i.e. highest importance on top
    _, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=ranked, x="importance", y="feature", ax=ax)
    ax.set_xlabel(xlabel)


## Preprocessing

In [4]:
# Copying preprocessing from week 5
# NOTE: this cell overwrites `df`, so it is meant to run once per kernel
# (a second run fails because "fnlwgt" has already been dropped).
df = df.drop(columns=["fnlwgt"])
df = df.rename(columns=lambda x: x.replace("-", "_"))
# Replace "?" with NaN in all columns.
# np.nan is passed explicitly: with value=None, pandas releases before 1.4
# ignore the argument and forward-fill the previous row's value instead of
# inserting a missing value.
df = df.replace(" ?", np.nan)

# map target to more usable 0/1
# regex=False makes "." a literal dot regardless of the pandas version's
# default for `regex` (as a regex, "." would match every character).
df["income"] = df["income"].str.strip().str.replace(".", "", regex=False)
df["income"] = df["income"].map({"<=50K": 0, ">50K": 1})
# .map() turns any label missing from the dict into NaN; fail loudly here
# rather than letting NaN targets reach the models.
assert df["income"].notna().all(), "income contains labels other than <=50K / >50K"

# Convert age column to integer type
df["age"] = df["age"].astype(int)


In [5]:
# Numeric columns kept as model inputs for the rest of the notebook
numerical_features = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]


In [6]:
# Keep only the numeric predictors plus the target column
df = df.loc[:, [*numerical_features, "income"]]

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["income"]), df["income"], test_size=0.2, random_state=10
)

# Carve a fixed validation set out of the remaining rows
# (25% of the remaining 80% = 20% of the full dataset)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=10
)
