In [1]:
import pandas as pd
import numpy as np
import os

# Optional display tweaks, left disabled on purpose. Uncomment either line to
# change how many columns/rows pandas shows before truncating its output.
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', 100)

# Visible confirmation that the import statements above ran without error.
print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# Location of the training split, relative to the current working directory
train_data_path = os.path.join("data", "train.csv")

# Read the CSV into df_train. Any failure is reported and then re-raised so
# the notebook halts here instead of running later cells without data.
try:
    df_train = pd.read_csv(train_data_path)
except FileNotFoundError:
    # Most likely cause: the upstream splitting notebook has not produced the file
    print(f"ERROR: train.csv not found at {train_data_path}")
    print("Please ensure you ran the previous data splitting notebook correctly.")
    raise
except Exception as load_error:
    # Anything else (parse errors, permissions, ...) is surfaced as-is
    print(f"An error occurred while loading train.csv: {load_error}")
    raise
else:
    print(f"Training data loaded successfully from: {train_data_path}")

Training data loaded successfully from: data\train.csv


In [3]:
# Preview the top of the training frame; the bare last expression is
# rendered by the notebook as a table.
print("First 5 rows of the training data:")
df_train.head(5)

First 5 rows of the training data:


Unnamed: 0,year,month,day,order,country,session ID,page 1 (main category),page 2 (clothing model),colour,location,model photography,price,price 2,page
0,2008,6,22,21,29,15648,3,C20,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,B26,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,C13,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,B11,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,B31,9,5,1,57,1,2


In [4]:
# Row and column counts of the training frame
rows = len(df_train.index)
columns = len(df_train.columns)
print(f"The training dataset has {rows} rows and {columns} columns.")

# Per-column overview: names, non-null counts and dtypes
print("DataFrame Information (Columns, Non-Null Counts, Data Types):")
df_train.info()

The training dataset has 132379 rows and 14 columns.
DataFrame Information (Columns, Non-Null Counts, Data Types):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132379 entries, 0 to 132378
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   year                     132379 non-null  int64 
 1   month                    132379 non-null  int64 
 2   day                      132379 non-null  int64 
 3   order                    132379 non-null  int64 
 4   country                  132379 non-null  int64 
 5   session ID               132379 non-null  int64 
 6   page 1 (main category)   132379 non-null  int64 
 7   page 2 (clothing model)  132379 non-null  object
 8   colour                   132379 non-null  int64 
 9   location                 132379 non-null  int64 
 10  model photography        132379 non-null  int64 
 11  price                    132379 non-null  int64 
 12  price 2      

In [5]:
# Count nulls in every column, then report only the columns that have any
print("\nMissing values per column:")
missing_values = df_train.isna().sum()
has_missing = missing_values.gt(0)
print(missing_values.loc[has_missing])  # empty Series when nothing is missing


Missing values per column:
Series([], dtype: int64)


In [6]:
# Count, mean, std, min, quartiles and max for the numerical columns
print("\nSummary statistics for numerical columns:")
numeric_summary = df_train.describe()
numeric_summary


Summary statistics for numerical columns:


Unnamed: 0,year,month,day,order,country,session ID,page 1 (main category),colour,location,model photography,price,price 2,page
count,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0,132379.0
mean,2008.0,5.582759,14.507671,9.811314,26.949629,12038.722063,2.400426,6.227393,3.261106,1.260026,43.788191,1.488484,1.712137
std,0.0,1.328064,8.829106,13.458937,7.153071,7012.460866,1.145004,4.238354,1.714058,0.43865,12.53939,0.499869,0.983699
min,2008.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,18.0,1.0,1.0
25%,2008.0,4.0,7.0,2.0,29.0,5905.0,1.0,3.0,2.0,1.0,33.0,1.0,1.0
50%,2008.0,5.0,14.0,6.0,29.0,11931.0,2.0,4.0,3.0,1.0,43.0,1.0,1.0
75%,2008.0,7.0,22.0,12.0,29.0,18212.0,3.0,9.0,5.0,2.0,52.0,2.0,2.0
max,2008.0,8.0,31.0,195.0,47.0,24026.0,4.0,14.0,6.0,2.0,82.0,2.0,5.0


In [7]:
# Count, unique, top and freq for the object-dtype (string-like) columns
print("\nSummary statistics for categorical (object) columns:")
# Caveat: only object-dtype columns are selected here. Categorical features
# that were loaded as integers (int64) are excluded and may need a dtype
# conversion first (planned for Phase 2).
object_summary = df_train.describe(include=[object])
object_summary


Summary statistics for categorical (object) columns:


Unnamed: 0,page 2 (clothing model)
count,132379
unique,216
top,B4
freq,2824
