# Feature Engineering

Since the dataset has no duplicates or null values, we can simply import it again and do our feature engineering from there.

In [50]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Relative path to the folder that holds the CSV files.
DATA_DIR = "../Data/"

# Read the train split first, then the test split.
train_data, test_data = (
    pd.read_csv(DATA_DIR + file_name) for file_name in ("train.csv", "test.csv")
)



def preprocess_data(
    train_data: pd.DataFrame, test_data: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Drop the ``id`` column and one-hot encode the categorical columns.

    Categorical columns are the object-dtype columns of ``train_data``. The
    category levels are learned from the training frame and re-used for the
    test frame, so both outputs get the same dummy columns (and the same
    dropped reference level) even when a level is missing from one split.
    A test value that never occurs in training is encoded as all zeros.

    The input frames are not modified.

    Args:
        train_data: Raw training frame; must contain an ``id`` column.
        test_data: Raw test frame; must contain an ``id`` column.

    Returns:
        ``(train, test)`` with ``id`` removed and every categorical column
        replaced by integer 0/1 indicator columns (first level dropped).

    Raises:
        KeyError: If either frame has no ``id`` column.
    """
    # DataFrame.drop returns a new frame, so the caller's data keeps its "id"
    # column and this function can be re-run on the same inputs.
    train_data = train_data.drop(columns="id")
    test_data = test_data.drop(columns="id")

    cat_columns = list(train_data.select_dtypes(include="object").columns)
    # A categorical column may exist only in train; encode what test has.
    test_cat_columns = [col for col in cat_columns if col in test_data.columns]

    for col in cat_columns:
        levels = pd.Categorical(train_data[col]).categories
        train_data[col] = pd.Categorical(train_data[col], categories=levels)
        if col in test_data.columns:
            # Blank out levels unseen in training before pinning the categories.
            known = test_data[col].where(test_data[col].isin(levels))
            test_data[col] = pd.Categorical(known, categories=levels)

    # `columns=` must be passed by keyword: the second positional parameter of
    # get_dummies is `prefix`, not `columns`.
    train_data = pd.get_dummies(
        train_data, columns=cat_columns, drop_first=True, dtype=int
    )
    test_data = pd.get_dummies(
        test_data, columns=test_cat_columns, drop_first=True, dtype=int
    )

    return train_data, test_data

In [51]:
def feature_engineer(
    train_data: pd.DataFrame, test_data: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Add the engineered features to the train and test frames.

    Currently adds one column, ``income_per_year_emp``, computed as
    ``person_income / (1 + person_emp_length)``. The ``1 +`` keeps the
    denominator non-zero when ``person_emp_length`` is 0.

    The input frames are not modified; new frames are returned.

    Args:
        train_data: Frame with ``person_income`` and ``person_emp_length``.
        test_data: Frame with ``person_income`` and ``person_emp_length``.

    Returns:
        ``(train, test)`` copies with ``income_per_year_emp`` appended.

    Raises:
        KeyError: If a required column is missing from either frame.
    """

    def with_income_per_year_emp(frame: pd.DataFrame) -> pd.DataFrame:
        # One definition of the feature so train and test cannot diverge.
        ratio = frame["person_income"] / (1 + frame["person_emp_length"])
        return frame.assign(income_per_year_emp=ratio)

    return with_income_per_year_emp(train_data), with_income_per_year_emp(test_data)

In [52]:
# Run the pipeline: clean and encode first, then add the engineered features.
preprocessed_train, preprocessted_test = preprocess_data(train_data, test_data)
eng_train, eng_test = feature_engineer(preprocessed_train, preprocessted_test)

# Build the output paths from DATA_DIR so the input and output locations
# cannot drift apart.
# NOTE(review): to_csv also writes the row index as an unnamed first column;
# pass index=False if downstream readers do not expect it — confirm.
eng_train.to_csv(DATA_DIR + "processed_train.csv")
eng_test.to_csv(DATA_DIR + "processed_test.csv")