## No FE Dataframe

We'll create a dataframe without any Feature Engineering (FE) or Exploratory Data Analysis (EDA), so that we can compare results and understand the importance of FE/EDA.

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Base directory for the relative data paths. '__file__' is a quoted string
# here, not the module attribute, so abspath() resolves it against the current
# working directory and dirname() yields that directory.
notebook_dir = os.path.dirname(os.path.abspath('__file__'))

# Raw dataset (no feature engineering applied)
pre_data_path = os.path.join(
    notebook_dir,
    '../data/raw/car_prices.csv',
)
df = pd.read_csv(pre_data_path)

# Dataset produced after feature engineering, loaded for comparison
pos_data_path = os.path.join(
    notebook_dir,
    '../data/processed/car_prices_after_FE.csv',
)
processed_df = pd.read_csv(pos_data_path)

In [None]:
# Summary (columns, non-null counts, dtypes) of the raw dataframe as loaded
df.info()

In [None]:
# Same summary for the feature-engineered dataframe, for comparison with the raw one
processed_df.info()

In [None]:
# To maintain the same structure as the processed dataframe, we need an 'age' column.
# 'saledate' is split on whitespace and the token at position 3 is used as the
# sale year (assumes that token holds the year -- TODO confirm against the raw data).
saledate = df['saledate'].str.split(expand=True)

# Fallback year for rows where no token could be extracted at position 3
# (presumably the dataset's typical sale year -- verify).
DEFAULT_SALE_YEAR = 2015

# Assign the columns directly instead of merging on the index and renaming.
# The resulting columns are the same, but re-running this cell now overwrites
# 'salesyear' instead of adding a duplicate 'salesyear' column.
df = df.assign(
    salesyear=saledate.iloc[:, 3].fillna(DEFAULT_SALE_YEAR).astype(np.int64)
)

# Age of the car at sale time: sale year minus the 'year' column
df = df.assign(age=df['salesyear'] - df['year'])

In [None]:
# Columns considered not useful for this baseline; 'year' and 'salesyear'
# were only needed to compute 'age'.
columns_to_drop = [
    'vin',
    'seller',
    'saledate',
    'year',
    'salesyear',
]

# errors='ignore' skips any listed column that is not present
df = df.drop(labels=columns_to_drop, axis='columns', errors='ignore')

In [None]:
# Remove every row that has at least one missing value so the model
# does not fail on NaNs
df = df.dropna(how='any')

In [None]:
# Summary of the baseline dataframe after adding 'age', dropping columns and removing NaN rows
df.info()

In [None]:
# Feature-engineered dataframe (not modified since loading), shown again to compare structure
processed_df.info()

In [None]:
# Persist the baseline (no feature engineering) dataframe in the processed-data folder
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
save_path = os.path.join(
    notebook_dir,
    '../data/processed/car_prices_without_FE.csv',
)

# index=False: do not write the row index as an extra column
df.to_csv(path_or_buf=save_path, index=False)