In [1]:
import pandas as pd
import numpy as np
import os

# Input and output locations, relative to the notebook's working directory
processed_data_folder = "data_processed"  # cleaned CSVs are written here
train_data_path = os.path.join("data", "train.csv")
test_data_path = os.path.join("data", "test.csv")

# Make sure the output folder is present; an existing folder is not an error
os.makedirs(processed_data_folder, exist_ok=True)

# Read both raw CSVs. Any failure is reported and then re-raised so the
# notebook stops here instead of carrying on without data.
try:
    df_train = pd.read_csv(train_data_path)
    df_test = pd.read_csv(test_data_path)
    print("Train and Test datasets loaded successfully.")
    print(f"Original train shape: {df_train.shape}")
    print(f"Original test shape: {df_test.shape}")
except FileNotFoundError as missing_file:
    print(f"ERROR: {missing_file}")
    print("Make sure train.csv and test.csv are in the 'data' folder.")
    raise
except Exception as load_error:
    print(f"An error occurred during loading: {load_error}")
    raise

# Remember the shapes as loaded so later cells can compare against them
original_train_shape, original_test_shape = df_train.shape, df_test.shape

# Dump dtypes and non-null counts for both frames (train first, then test)
for heading, frame in (
    ("\nInitial Train Data Info:", df_train),
    ("\nInitial Test Data Info:", df_test),
):
    print(heading)
    frame.info()

Train and Test datasets loaded successfully.
Original train shape: (132379, 14)
Original test shape: (33095, 14)

Initial Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132379 entries, 0 to 132378
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   year                     132379 non-null  int64 
 1   month                    132379 non-null  int64 
 2   day                      132379 non-null  int64 
 3   order                    132379 non-null  int64 
 4   country                  132379 non-null  int64 
 5   session ID               132379 non-null  int64 
 6   page 1 (main category)   132379 non-null  int64 
 7   page 2 (clothing model)  132379 non-null  object
 8   colour                   132379 non-null  int64 
 9   location                 132379 non-null  int64 
 10  model photography        132379 non-null  int64 
 11  price                    132379 non-null  i

In [2]:
def standardize_columns(df, column_map=None):
    """Rename the columns of ``df`` in place to lowercase snake_case.

    Each label is converted with ``str()``, lowercased, every space, ``/``,
    ``(`` and ``)`` is turned into ``_``, runs of underscores are collapsed to
    one, and leading/trailing underscores are dropped.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten (the frame is
            modified in place and also returned).
        column_map: Optional dict that is updated in place with
            ``{original_label: new_label}`` for every label that changed.
            A new dict is created when ``None`` is passed.

    Returns:
        tuple: ``(df, column_map)`` - the same DataFrame object with its new
        column names, and the (possibly newly created) mapping.

    Raises:
        ValueError: If two different original labels would end up with the
            same standardized name. The check runs before ``df`` is touched,
            so the frame is left unchanged in that case.
    """
    # Translation table: every separator character becomes an underscore.
    to_underscore = str.maketrans(" /()", "____")

    original_columns = df.columns.tolist()
    new_columns = []
    for label in original_columns:
        # str() lets non-string labels (e.g. integers) through the same rules.
        text = str(label).lower().translate(to_underscore)
        # Splitting on "_" and dropping empty pieces collapses repeated
        # underscores and strips leading/trailing ones in a single step.
        new_columns.append("_".join(piece for piece in text.split("_") if piece))

    # Refuse to create duplicate column names out of distinct originals;
    # labels that were already duplicated in the input are left alone.
    sources_by_name = {}
    for orig, new in zip(original_columns, new_columns):
        sources_by_name.setdefault(new, []).append(orig)
    collisions = {
        new: origs for new, origs in sources_by_name.items() if len(set(origs)) > 1
    }
    if collisions:
        raise ValueError(
            f"Standardizing would merge distinct columns into one name: {collisions}"
        )

    df.columns = new_columns

    # Create or update the column map
    if column_map is None:
        column_map = {}
    column_map.update(
        {orig: new for orig, new in zip(original_columns, new_columns) if orig != new}
    )

    print("Column names standardized.")
    return df, column_map

# Run the same normalisation over train and test, collecting every rename
# in one shared mapping (starts out empty)
column_rename_map = {}
df_train, column_rename_map = standardize_columns(df_train, column_rename_map)
df_test, column_rename_map = standardize_columns(df_test, column_rename_map)

# Show the resulting column index of each frame, train first
for heading, columns in (
    ("\nStandardized Train Columns:", df_train.columns),
    ("\nStandardized Test Columns:", df_test.columns),
):
    print(heading)
    print(columns)

print("\nColumn Rename Mapping (Original -> Standardized):")
print(column_rename_map)

# Final expression of the cell: preview of the first rows under the new names
print("\nTrain data head with standardized columns:")
df_train.head()

Column names standardized.
Column names standardized.

Standardized Train Columns:
Index(['year', 'month', 'day', 'order', 'country', 'session_id',
       'page_1_main_category', 'page_2_clothing_model', 'colour', 'location',
       'model_photography', 'price', 'price_2', 'page'],
      dtype='object')

Standardized Test Columns:
Index(['year', 'month', 'day', 'order', 'country', 'session_id',
       'page_1_main_category', 'page_2_clothing_model', 'colour', 'location',
       'model_photography', 'price', 'price_2', 'page'],
      dtype='object')

Column Rename Mapping (Original -> Standardized):
{'session ID': 'session_id', 'page 1 (main category)': 'page_1_main_category', 'page 2 (clothing model)': 'page_2_clothing_model', 'model photography': 'model_photography', 'price 2': 'price_2'}

Train data head with standardized columns:


Unnamed: 0,year,month,day,order,country,session_id,page_1_main_category,page_2_clothing_model,colour,location,model_photography,price,price_2,page
0,2008,6,22,21,29,15648,3,C20,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,B26,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,C13,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,B11,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,B31,9,5,1,57,1,2


In [3]:
print("\n--- Data Type Conversion ---")

# --- Identify Columns by Type ---
# Columns that stay numeric.
numeric_cols = ['year', 'month', 'day', 'order', 'price', 'page']
# Columns treated as categories/identifiers even where they are stored as
# integer codes. 'page_2_clothing_model' has many unique values but is still
# categorical; 'session_id' is treated as an identifier.
categorical_cols = [
    'country', 'session_id', 'page_1_main_category', 'page_2_clothing_model',
    'colour', 'location', 'model_photography', 'price_2'
]

# --- Convert Price ---
# Each frame is inspected on its own (train first, then test): the two CSVs
# are parsed independently, so the dtype of 'price' in one frame says nothing
# about the other. Every successful path ends in float so both frames agree.
for frame in (df_train, df_test):
    price_values = frame['price']
    if pd.api.types.is_object_dtype(price_values) or pd.api.types.is_string_dtype(price_values):
        print("Converting 'price' column to numeric...")
        # errors='coerce' turns values that cannot be parsed into NaN
        frame['price'] = pd.to_numeric(price_values, errors='coerce').astype(float)
        print("'price' converted to numeric.")
    elif pd.api.types.is_numeric_dtype(price_values):
        # Already numeric: only make sure the dtype is float
        frame['price'] = price_values.astype(float)
        print("'price' is already numeric. Ensured it is float type.")
    else:
        print("'price' column type is unexpected. Please inspect.")

# --- Convert Categorical Columns ---
# 'object' is used rather than 'category' for broad compatibility; 'category'
# can save memory but needs more care in merges and similar operations.
print("Converting categorical columns to 'object' type (or 'category')...")
for col in categorical_cols:
    if col in df_train.columns:
        df_train[col] = df_train[col].astype(object)
        if col in df_test.columns:  # the test set may lack the column
            df_test[col] = df_test[col].astype(object)
        print(f"- Converted '{col}' to object type.")
    else:
        print(f"- Warning: Column '{col}' not found in DataFrame.")

# --- Verify Conversions ---
print("\nTrain Data Info after Type Conversion:")
df_train.info()
print("\nTest Data Info after Type Conversion:")
df_test.info()


--- Data Type Conversion ---
'price' is already numeric. Ensured it is float type.
Converting categorical columns to 'object' type (or 'category')...
- Converted 'country' to object type.
- Converted 'session_id' to object type.
- Converted 'page_1_main_category' to object type.
- Converted 'page_2_clothing_model' to object type.
- Converted 'colour' to object type.
- Converted 'location' to object type.
- Converted 'model_photography' to object type.
- Converted 'price_2' to object type.

Train Data Info after Type Conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132379 entries, 0 to 132378
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   year                   132379 non-null  int64  
 1   month                  132379 non-null  int64  
 2   day                    132379 non-null  int64  
 3   order                  132379 non-null  int64  
 4   country                132379 n

In [4]:
print("\n--- Handling Missing Values ---")

# --- Calculate Imputation Values from Training Data ONLY ---
# Fill values are always computed on df_train and then applied to both
# frames, so nothing from the test set influences them.
imputation_values = {}

# Numerical Imputation (using median)
# A column is selected when EITHER frame has gaps: a column that is complete
# in train can still contain NaN in test.
numeric_cols_to_impute = [
    col for col in numeric_cols
    if df_train[col].isnull().any()
    or (col in df_test.columns and df_test[col].isnull().any())
]
print(f"Numerical columns with missing values: {numeric_cols_to_impute}")

for col in numeric_cols_to_impute:
    median_val = df_train[col].median()
    imputation_values[col] = median_val
    print(f"Calculated median for '{col}': {median_val}")
    # Assign the result back instead of fillna(inplace=True) on a selected
    # column: that form is chained assignment and does not update the frame
    # when pandas Copy-on-Write is active.
    df_train[col] = df_train[col].fillna(median_val)
    if col in df_test.columns:  # Apply to test set if column exists
        df_test[col] = df_test[col].fillna(median_val)  # Use TRAIN median

# Categorical Imputation (using mode)
categorical_cols_to_impute = [
    col for col in categorical_cols
    if df_train[col].isnull().any()
    or (col in df_test.columns and df_test[col].isnull().any())
]
print(f"\nCategorical columns with missing values: {categorical_cols_to_impute}")

for col in categorical_cols_to_impute:
    train_modes = df_train[col].mode()
    if train_modes.empty:
        # No non-missing training value exists, so there is nothing to fill with
        print(f"No mode available for '{col}' (all training values missing); left unchanged.")
        continue
    # mode() can return several equally frequent values; take the first one
    mode_val = train_modes.iloc[0]
    imputation_values[col] = mode_val
    print(f"Calculated mode for '{col}': {mode_val}")
    df_train[col] = df_train[col].fillna(mode_val)
    if col in df_test.columns:  # Apply to test set if column exists
        df_test[col] = df_test[col].fillna(mode_val)  # Use TRAIN mode

# --- Verify Imputation ---
print("\nMissing values after imputation (Train):")
print(df_train.isnull().sum().sum())  # Should be 0 if all handled

print("\nMissing values after imputation (Test):")
print(df_test.isnull().sum().sum())  # Should also be 0 if all handled

# Display imputation values used (good for reference)
print("\nImputation values used (derived from train data):")
print(imputation_values)


--- Handling Missing Values ---
Numerical columns with missing values: []

Categorical columns with missing values: []

Missing values after imputation (Train):
0

Missing values after imputation (Test):
0

Imputation values used (derived from train data):
{}


In [5]:
print("\n--- Post-Processing Checks ---")

# First rows of the processed training frame
print("Train data head after preprocessing:")
train_preview = df_train.head()
print(train_preview)

# Dtypes and non-null counts
print("\nTrain data info after preprocessing:")
df_train.info()

# Summary statistics of the numeric columns (describe() default)
print("\nTrain data numerical summary after preprocessing:")
numeric_summary = df_train.describe()
print(numeric_summary)

# Summary of the object-typed columns, i.e. the categorical features
print("\nTrain data categorical summary after preprocessing:")
object_summary = df_train.describe(include=['object'])
print(object_summary)

# Current shapes next to the shapes recorded at load time, to spot dropped rows
train_shape_now = df_train.shape
test_shape_now = df_test.shape
print(f"\nShape check - Train: {train_shape_now} (Original: {original_train_shape})")
print(f"Shape check - Test: {test_shape_now} (Original: {original_test_shape})")


--- Post-Processing Checks ---
Train data head after preprocessing:
   year  month  day  order country session_id page_1_main_category  \
0  2008      6   22     21      29      15648                    3   
1  2008      5   19      6      29      10018                    2   
2  2008      7   15      2      29      19388                    3   
3  2008      5    2      2      29       7181                    2   
4  2008      6    9     16      29      13493                    2   

  page_2_clothing_model colour location model_photography  price price_2  page  
0                   C20     13        1                 2   48.0       1     2  
1                   B26     13        3                 1   57.0       1     2  
2                   C13      9        5                 1   48.0       1     1  
3                   B11      2        4                 1   43.0       2     1  
4                   B31      9        5                 1   57.0       1     2  

Train data info after p

In [6]:
# Define output file paths
train_cleaned_path = os.path.join(processed_data_folder, "train_cleaned.csv")
test_cleaned_path = os.path.join(processed_data_folder, "test_cleaned.csv")

# Save the cleaned DataFrames (index=False: the row index is not written).
# NOTE: CSV carries no dtype information, so columns cast to object earlier
# will have their dtypes re-inferred by read_csv when these files are loaded;
# pass dtype= when reading them back if the distinction matters.
try:
    df_train.to_csv(train_cleaned_path, index=False)
    df_test.to_csv(test_cleaned_path, index=False)
    print(f"\nCleaned training data saved to: {train_cleaned_path}")
    print(f"Cleaned test data saved to: {test_cleaned_path}")
except Exception as e:
    print(f"Error saving cleaned files: {e}")
    # Re-raise, as the loading cell does: the notebook must not appear to
    # finish successfully when one or both output files were not written.
    raise


Cleaned training data saved to: data_processed\train_cleaned.csv
Cleaned test data saved to: data_processed\test_cleaned.csv
