In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the raw train/test CSV files from the working directory.
# Sizes noted by the author: train.csv has 188533 rows and 13 columns (price is
# the last column); test.csv has 125690 rows.
training_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

def encode_columns(df):
    """Derive engine/model features and drop the raw helper columns.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``engine``, ``model``, ``clean_title`` and ``id``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without ``engine`` and ``id`` and with these changes:

        * ``hp_bin``   - index of the horsepower quantile bin (NaN when the
          engine text has no ``<number>HP`` token).
        * ``cylinder`` - the number preceding " Cylinder" in the engine text
          (NaN when absent).
        * ``got_V``    - 1 when the model text contains ``<number> V``, else 0.
        * ``clean_title`` - missing values replaced by the string 'unknown'.

    Notes
    -----
    The bin edges are the 0 %, 4 %, ..., 100 % quantiles of the horsepower found
    in *this* frame, so two frames encoded separately get their own edges.
    """
    # Work on a copy: the column assignments below would otherwise add columns
    # to the caller's frame as a side effect.
    df = df.copy()

    # expand=False makes str.extract return a Series rather than a 1-column frame.
    df['hp'] = df['engine'].str.extract(r'(\d+\.?\d*)HP', expand=False).astype(float)

    # 26 evenly spaced quantiles -> at most 25 bins. Tied quantiles (small or
    # heavily repeated data) would make pd.cut raise "Bin edges must be unique",
    # so duplicates and NaN edges are removed before cutting.
    quantiles = np.linspace(0.0, 1.0, 26)
    bin_edges = np.unique(df['hp'].quantile(quantiles).dropna().to_numpy())
    if len(bin_edges) >= 2:
        df['hp_bin'] = pd.cut(df['hp'], bins=bin_edges, labels=False, include_lowest=True)
    else:
        # Zero or one distinct horsepower value: every known value shares bin 0,
        # unknown horsepower stays NaN.
        df['hp_bin'] = df['hp'].where(df['hp'].isna(), 0.0)

    df['cylinder'] = df['engine'].str.extract(r'(\d+\.?\d*) Cylinder', expand=False).astype(float)

    # The raw engine text and the un-binned horsepower are no longer needed.
    df = df.drop(columns=['engine', 'hp'])

    # Flag models whose name contains a number followed by " V".
    df['got_V'] = df['model'].str.extract(r'(\d+\.?\d*) V', expand=False).notna().astype(int)

    df['clean_title'] = df['clean_title'].fillna('unknown')

    df = df.drop(columns=['id'])
    return df

# Run the same feature engineering over both splits (train first, then test).
# NOTE(review): encode_columns derives the hp_bin edges from the quantiles of
# whichever frame it receives, so a given hp_bin code can cover different
# horsepower ranges in train and in test.
# NOTE(review): this cell rebinds its own inputs; running it a second time
# fails because the 'engine' column has already been dropped.
training_data, test_data = map(encode_columns, (training_data, test_data))

In [11]:
# Remember where the training rows end so the stacked frame can be split again
# without hard-coding the row counts of the CSV files.
n_train = len(training_data)

# Stack train and test so the missing-value handling below is applied to both
# in exactly the same way. The test rows get a placeholder price of 0 that is
# dropped again after the split.
all_data = pd.concat([training_data, test_data.assign(price=0)], ignore_index=True)

# 'nans' is a bitmask of which of these columns were missing in a row:
# bit 0 = fuel_type, bit 1 = accident, bit 2 = hp_bin, bit 3 = cylinder.
nan_cols = ['fuel_type', 'accident', 'hp_bin', 'cylinder']
all_data['nans'] = 0
for bit, nan_col in enumerate(nan_cols):
    all_data['nans'] += all_data[nan_col].isna().astype(int) * (2 ** bit)

# Replace the missing values by explicit sentinels (after the mask was computed).
all_data = all_data.fillna({'fuel_type': 'unknown', 'accident': 'unknown', 'hp_bin': -1, 'cylinder': -1})

# Split positionally. The test part is re-indexed from 0 so that positional and
# label-based row access agree downstream; keeping the labels n_train.. makes
# index-aligned arithmetic against a freshly built frame produce only NaN.
y_train = all_data['price'].iloc[:n_train]
training_data = all_data.iloc[:n_train].drop(columns=['price'])
test_data = all_data.iloc[n_train:].drop(columns=['price']).reset_index(drop=True)

In [12]:
prior_val_weight = 1
num_iterations = 9

In [14]:
import numpy as np
import pandas as pd

def ordered_target_encode(train, test, y_train, categorical_features, prior_val_weight, num_iterations,
                          random_state=None):
    """Replace categorical columns by an ordered (leak-free) target encoding.

    For every pass the training rows are visited in a random order. A row's
    encoding for a column is::

        (sum of targets of EARLIER rows with the same category + w * global_mean)
        / (number of EARLIER rows with the same category + w)

    with ``w = prior_val_weight``, so a row never sees its own target. The
    encodings of ``num_iterations`` independent passes are averaged. Test rows
    are encoded with the statistics of the complete training set; categories
    never seen in training fall back to the global mean (for ``w > 0``).
    Missing values are treated as one category of their own.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames. Every column listed in ``categorical_features`` must be
        present in both; all remaining columns must be numeric.
    y_train : array-like
        One target per row of ``train``, matched by position (not by index label).
    categorical_features : iterable of str
        Names of the columns to encode.
    prior_val_weight : float
        Weight of the global-mean prior in the smoothed mean.
    num_iterations : int
        Number of shuffled passes to average; must be at least 1.
    random_state : int or None, optional
        Seed for the shuffles. With ``None`` (default) the permutations are
        drawn from NumPy's global random state, as before.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Float-valued copies of ``train`` and ``test`` with the same columns and
        the same index as the inputs. Encoded columns hold the averaged
        encodings; all other columns are passed through as floats.

    Raises
    ------
    ValueError
        If ``num_iterations`` is smaller than 1 or ``y_train`` does not have one
        entry per training row.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    n_train = len(train)
    # Positional array: label-based y_train[i] breaks for any non 0..n-1 index.
    targets = np.asarray(y_train, dtype=float)
    if targets.shape != (n_train,):
        raise ValueError("y_train must hold exactly one target per row of train")

    global_mean = targets.mean()
    prior_total = prior_val_weight * global_mean

    # One permutation per pass, shared by all columns of that pass.
    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation
    permutations = [permute(n_train) for _ in range(num_iterations)]

    encoded_cols = set(categorical_features)

    # Start from copies so the inputs stay untouched and the callers' indexes
    # are kept. All values are written positionally: adding frames with "+="
    # aligns on index labels and yields NaN when the labels differ.
    new_train = train.copy()
    new_test = test.copy()
    for frame in (new_train, new_test):
        for col in frame.columns:
            if col not in encoded_cols:
                frame[col] = frame[col].astype(float)

    for col in categorical_features:
        # Integer codes shared by train and test; factorize marks missing values
        # with -1, which is mapped to an extra code of its own.
        combined = pd.concat([train[col], test[col]], ignore_index=True)
        codes, uniques = pd.factorize(combined)
        missing_code = len(uniques)
        codes = np.where(codes < 0, missing_code, codes)
        train_codes, test_codes = codes[:n_train], codes[n_train:]

        train_total = np.zeros(n_train)
        for order in permutations:
            seen_codes = train_codes[order]
            seen_targets = targets[order]
            per_category = pd.Series(seen_targets).groupby(seen_codes)
            # Running statistics of the rows visited BEFORE the current one.
            earlier_sum = per_category.cumsum().to_numpy() - seen_targets
            earlier_count = per_category.cumcount().to_numpy()
            # Scatter back through the permutation so that every encoding lands
            # on the row it was computed for.
            train_total[order] += (earlier_sum + prior_total) / (earlier_count + prior_val_weight)
        new_train[col] = train_total / num_iterations

        # Full-training statistics do not depend on the visiting order, so the
        # test encoding is identical in every pass and is computed once.
        category_sum = np.bincount(train_codes, weights=targets, minlength=missing_code + 1)
        category_count = np.bincount(train_codes, minlength=missing_code + 1)
        new_test[col] = (category_sum[test_codes] + prior_total) / (category_count[test_codes] + prior_val_weight)

    return new_train, new_test
# Columns that are replaced by their ordered target encoding.
categorical_features = [
    'brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col',
    'accident', 'clean_title', 'hp_bin', 'cylinder', 'got_V', 'nans',
]

# Effective hyperparameters of the run: weight of the global-mean prior and
# number of shuffled passes that are averaged.
prior_val_weight = 10
num_iterations = 9

# The encoder shuffles the training rows using NumPy's global random state.
# Seed it so that the exported features are identical on every run.
np.random.seed(0)

training_data, test_data = ordered_target_encode(
    training_data, test_data, y_train, categorical_features, prior_val_weight, num_iterations
)

In [15]:
# Persist the encoded feature frames as CSV files in the working directory.
# The row index is not written.
training_data.to_csv(path_or_buf='training_data.csv', index=False)
test_data.to_csv(path_or_buf='test_data.csv', index=False)