In [81]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder, CountEncoder
from sklearn.feature_extraction import FeatureHasher
from category_encoders import BinaryEncoder
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [10]:
# Sample DataFrame
data = {
    'numeric_1': np.random.rand(1000),
    'numeric_2': np.random.rand(1000),
    'numeric_3': np.random.rand(1000),
    'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], size=1000),
    'price': np.random.rand(1000) * 1000
}
df = pd.DataFrame(data)

In [14]:
df.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,category,price
0,0.051844,0.357047,0.447131,B,162.144258
1,0.036138,0.561187,0.213785,A,596.175435
2,0.490587,0.414976,0.111666,A,656.44138
3,0.213039,0.674542,0.408711,D,197.604708
4,0.736692,0.234375,0.071663,D,864.775032


In [37]:
# Get the number of unique categories
num_unique_categories = df['category'].nunique()

print(f"Number of unique categories: {num_unique_categories}")

# Count occurrences of each category
category_counts = df['category'].value_counts()

print("Count of each category:")
print(category_counts)

Number of unique categories: 5
Count of each category:
category
A    204
B    201
E    201
D    197
C    197
Name: count, dtype: int64


In [105]:
# Function to perform cross-validation and return mean squared error for different regression models
def evaluate_encoding_regression(df, encoding):
    X = df.drop(columns=['price'])
    if encoding=='Mean Encoding' or encoding=='Frequency Encoding'or encoding == 'One-Hot Encoding':
        X = df.drop(columns=['category'])
    y = df['price']
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = {'Linear Regression': [], 'Random Forest Regression': [], 'Ridge Regression': []}
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
    # Converting column names to strings
        X_train.columns = X_train.columns.astype(str)
        X_test.columns = X_test.columns.astype(str)
    
    # Handling Feature Hashing for X_train and X_test
        if encoding == 'Feature Hashing':
            hasher = FeatureHasher(n_features=5, input_type='string')
            X_train_hashed = hasher.fit_transform(X_train.values.astype(str))
            X_test_hashed = hasher.transform(X_test.values.astype(str))
        else:
            X_train_hashed, X_test_hashed = X_train, X_test
        
        
        # One-Hot Encoding
        if encoding == 'One-Hot Encoding':
            one_hot_encoder = OneHotEncoder(sparse=False)
            X_train_encoded = one_hot_encoder.fit_transform(X_train[['category_encoded']])
            X_test_encoded = one_hot_encoder.transform(X_test[['category_encoded']])
        else:
            X_train_encoded, X_test_encoded = X_train, X_test
        
        # Label Encoding
        if encoding == 'Label Encoding':
            label_encoder = LabelEncoder()
            X_train_encoded['category'] = label_encoder.fit_transform(X_train_encoded['category'])
            X_test_encoded['category'] = label_encoder.transform(X_test_encoded['category'])
                
        
        
        # Linear Regression
        lr_model = LinearRegression()
        lr_model.fit(X_train, y_train)
        y_pred_lr = lr_model.predict(X_test)
        mse_lr = mean_squared_error(y_test, y_pred_lr)
        mse_scores['Linear Regression'].append(mse_lr)
        
        # Random Forest Regression
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        y_pred_rf = rf_model.predict(X_test)
        mse_rf = mean_squared_error(y_test, y_pred_rf)
        mse_scores['Random Forest Regression'].append(mse_rf)
        
        # Ridge Regression
        ridge_model = Ridge(alpha=1.0)
        ridge_model.fit(X_train, y_train)
        y_pred_ridge = ridge_model.predict(X_test)
        mse_ridge = mean_squared_error(y_test, y_pred_ridge)
        mse_scores['Ridge Regression'].append(mse_ridge)
        
    return {model: np.mean(mse_scores[model]) for model in mse_scores}

In [74]:
# Mean Encoding
mean_encoder = TargetEncoder(cols=['category'])
df_mean_encoded = df.copy()
df_mean_encoded['category_encoded'] = mean_encoder.fit_transform(df_mean_encoded['category'], df_mean_encoded['price'])
mean_scores = evaluate_encoding_regression(df_mean_encoded, 'Mean Encoding')


In [75]:
df_mean_encoded.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,category,price,category_encoded
0,0.051844,0.357047,0.447131,B,162.144258,475.941388
1,0.036138,0.561187,0.213785,A,596.175435,490.791852
2,0.490587,0.414976,0.111666,A,656.44138,490.791852
3,0.213039,0.674542,0.408711,D,197.604708,504.698291
4,0.736692,0.234375,0.071663,D,864.775032,504.698291


In [77]:
# Frequency Encoding
frequency_encoder = CountEncoder(cols=['category'])
df_freq_encoded = df.copy()
df_freq_encoded['category_encoded'] = frequency_encoder.fit_transform(df_freq_encoded['category'])
freq_scores = evaluate_encoding_regression(df_freq_encoded, 'Frequency Encoding')

In [78]:
# Feature Hashing
hasher = FeatureHasher(n_features=5, input_type='string')
X_category = df[['category']].values.astype(str)
hashed_features = hasher.fit_transform(X_category)
df_hashed = pd.concat([df.drop(columns=['category']), pd.DataFrame(hashed_features.toarray())], axis=1)
df_hashed.head()
hash_scores = evaluate_encoding_regression(df_hashed, 'Feature Hashing')

In [79]:
# Binary Encoding
binary_encoder = BinaryEncoder(cols=['category'])
df_binary_encoded = binary_encoder.fit_transform(df['category'])
df_binary_encoded = pd.concat([df.drop(columns=['category']), df_binary_encoded], axis=1)
binary_scores = evaluate_encoding_regression(df_binary_encoded, 'Binary Encoding')

In [97]:
# Convert categorical column to numeric using LabelEncoder
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])
df_one_hot_encoded = df.copy()

# Drop the original categorical column
df_one_hot_encoded.drop(columns=['category'], inplace=True)


print(df_one_hot_encoded.head())

# Now, perform one-hot encoding
one_hot_scores = evaluate_encoding_regression(df, 'One-Hot Encoding')

   numeric_1  numeric_2  numeric_3       price  category_encoded
0   0.051844   0.357047   0.447131  162.144258                 1
1   0.036138   0.561187   0.213785  596.175435                 0
2   0.490587   0.414976   0.111666  656.441380                 0
3   0.213039   0.674542   0.408711  197.604708                 3
4   0.736692   0.234375   0.071663  864.775032                 3




In [106]:
# Label Encoding

label_scores = evaluate_encoding_regression(df, 'Label Encoding')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_encoded['category'] = label_encoder.fit_transform(X_train_encoded['category'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_encoded['category'] = label_encoder.transform(X_test_encoded['category'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_encoded['category'] = label_e

In [94]:
df.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,category,price,category_encoded
0,0.051844,0.357047,0.447131,B,162.144258,1
1,0.036138,0.561187,0.213785,A,596.175435,0
2,0.490587,0.414976,0.111666,A,656.44138,0
3,0.213039,0.674542,0.408711,D,197.604708,3
4,0.736692,0.234375,0.071663,D,864.775032,3


In [107]:
# Print MSE scores for each encoding and regression type
print("Mean Squared Error (MSE) for each encoding method and regression type:")
print("Mean Encoding:", mean_scores)
print("Frequency Encoding:", freq_scores)
print("Feature Hashing:", hash_scores)
print("Binary Encoding:", binary_scores)
print("One-Hot Encoding:", one_hot_scores)
print("Label Encoding:", label_scores)

Mean Squared Error (MSE) for each encoding method and regression type:
Mean Encoding: {'Linear Regression': 8.86258523581127e-26, 'Random Forest Regression': 1.2649102567503177, 'Ridge Regression': 1.8801897569757232e-11}
Frequency Encoding: {'Linear Regression': 1.4407076462324776e-26, 'Random Forest Regression': 1.2173414255371515, 'Ridge Regression': 1.8786298305712438e-11}
Feature Hashing: {'Linear Regression': 86296.69852713795, 'Random Forest Regression': 95883.06551500736, 'Ridge Regression': 86280.98289225053}
Binary Encoding: {'Linear Regression': 86227.88702141985, 'Random Forest Regression': 96480.63665082862, 'Ridge Regression': 86210.12903217121}
One-Hot Encoding: {'Linear Regression': 7.044914489562142e-26, 'Random Forest Regression': 1.2639635055782876, 'Ridge Regression': 1.8783867162015626e-11}
Label Encoding: {'Linear Regression': 86173.71820622984, 'Random Forest Regression': 95352.99911310928, 'Ridge Regression': 86139.28657113982}


In [109]:
# Create a list of dictionaries to store the scores
scores_list = []

# Append mean scores for each encoding technique
scores_list.append({'Encoding': 'Mean Encoding', 
                    'Linear Regression': mean_scores['Linear Regression'], 
                    'Random Forest Regression': mean_scores['Random Forest Regression'], 
                    'Ridge Regression': mean_scores['Ridge Regression']})

scores_list.append({'Encoding': 'Frequency Encoding', 
                    'Linear Regression': freq_scores['Linear Regression'], 
                    'Random Forest Regression': freq_scores['Random Forest Regression'], 
                    'Ridge Regression': freq_scores['Ridge Regression']})

scores_list.append({'Encoding': 'Feature Hashing', 
                    'Linear Regression': hash_scores['Linear Regression'], 
                    'Random Forest Regression': hash_scores['Random Forest Regression'], 
                    'Ridge Regression': hash_scores['Ridge Regression']})

scores_list.append({'Encoding': 'Binary Encoding', 
                    'Linear Regression': binary_scores['Linear Regression'], 
                    'Random Forest Regression': binary_scores['Random Forest Regression'], 
                    'Ridge Regression': binary_scores['Ridge Regression']})

scores_list.append({'Encoding': 'One-Hot Encoding', 
                              'Linear Regression': one_hot_scores['Linear Regression'], 
                              'Random Forest Regression': one_hot_scores['Random Forest Regression'], 
                              'Ridge Regression': one_hot_scores['Ridge Regression']})

scores_list.append({'Encoding': 'Label Encoding', 
                              'Linear Regression': label_scores['Linear Regression'], 
                              'Random Forest Regression': label_scores['Random Forest Regression'], 
                              'Ridge Regression': label_scores['Ridge Regression']})

# Create a DataFrame from the list of dictionaries
scores_df = pd.DataFrame(scores_list)

print(scores_df)


             Encoding  Linear Regression  Random Forest Regression  \
0       Mean Encoding       8.862585e-26                  1.264910   
1  Frequency Encoding       1.440708e-26                  1.217341   
2     Feature Hashing       8.629670e+04              95883.065515   
3     Binary Encoding       8.622789e+04              96480.636651   
4    One-Hot Encoding       7.044914e-26                  1.263964   
5      Label Encoding       8.617372e+04              95352.999113   

   Ridge Regression  
0      1.880190e-11  
1      1.878630e-11  
2      8.628098e+04  
3      8.621013e+04  
4      1.878387e-11  
5      8.613929e+04  


In [110]:
# Dictionary to store the best encoding method for each regression type
best_encodings = {}

# Iterate over the DataFrame rows
for index, row in scores_df.iterrows():
    encoding = row['Encoding']
    lr_score = row['Linear Regression']
    rf_score = row['Random Forest Regression']
    ridge_score = row['Ridge Regression']
   
    
    # Identify the best encoding technique for Linear Regression
    if 'Linear Regression' not in best_encodings or lr_score < best_encodings['Linear Regression'][1]:
        best_encodings['Linear Regression'] = (encoding, lr_score)

    # Identify the best encoding technique for Random Forest Regression
    if 'Random Forest Regression' not in best_encodings or rf_score < best_encodings['Random Forest Regression'][1]:
        best_encodings['Random Forest Regression'] = (encoding, rf_score)

    # Identify the best encoding technique for Ridge Regression
    if 'Ridge Regression' not in best_encodings or ridge_score < best_encodings['Ridge Regression'][1]:
        best_encodings['Ridge Regression'] = (encoding, ridge_score)

# Print the best encoding technique for each regression type
print("\nBest encoding technique for each regression type:")
for regression, (encoding, score) in best_encodings.items():
    print(f"{regression}: {encoding} (MSE: {score})")




Best encoding technique for each regression type:
Linear Regression: Frequency Encoding (MSE: 1.4407076462324776e-26)
Random Forest Regression: Frequency Encoding (MSE: 1.2173414255371515)
Ridge Regression: One-Hot Encoding (MSE: 1.8783867162015626e-11)
