In [1]:
import pandas as pd
import os

print("Pandas version:", pd.__version__)


Pandas version: 2.2.3


In [10]:
from sklearn.model_selection import train_test_split

In [3]:
# Define the path to the downloaded data file
# Make sure this matches the name of the file you downloaded into the 'data' subfolder
raw_data_filename = "e-shop clothing 2008.csv" # <-- *** REPLACE THIS WITH YOUR ACTUAL FILENAME ***
raw_data_path = os.path.join("data", raw_data_filename)

# Check if the file exists
if os.path.exists(raw_data_path):
    print(f"Found data file at: {raw_data_path}")
else:
    print(f"ERROR: Data file not found at: {raw_data_path}")
    print("Please make sure the file is in the 'data' folder and the filename is correct.")
    # Stop execution if file not found (optional, but good practice)
    raise FileNotFoundError(f"Data file not found: {raw_data_path}")

Found data file at: data\e-shop clothing 2008.csv


In [4]:
# Try loading with default settings (comma delimiter)
try:
    df_raw = pd.read_csv(raw_data_path)
    print("Successfully loaded with default settings. Displaying first 5 rows:")
    print(df_raw.head())
    print("\nDataFrame Info:")
    df_raw.info()
except Exception as e:
    print(f"Default loading failed. Error: {e}")
    print("\nTrying common alternatives...")

Successfully loaded with default settings. Displaying first 5 rows:
  year;month;day;order;country;session ID;page 1 (main category);page 2 (clothing model);colour;location;model photography;price;price 2;page
0                 2008;4;1;1;29;1;1;A13;1;5;1;28;2;1                                                                                         
1                 2008;4;1;2;29;1;1;A16;1;6;1;33;2;1                                                                                         
2                 2008;4;1;3;29;1;2;B4;10;2;1;52;1;1                                                                                         
3                 2008;4;1;4;29;1;2;B17;6;6;2;38;2;1                                                                                         
4                  2008;4;1;5;29;1;2;B8;4;3;2;52;1;1                                                                                         

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165474 entrie

In [5]:
try:
    df_raw = pd.read_csv(raw_data_path, sep=';')
    print("Successfully loaded with semicolon delimiter. Displaying first 5 rows:")
    print(df_raw.head())
    print("\nDataFrame Info:")
    df_raw.info()
except Exception as e:
    print(f"Loading with semicolon delimiter failed. Error: {e}")

Successfully loaded with semicolon delimiter. Displaying first 5 rows:
   year  month  day  order  country  session ID  page 1 (main category)  \
0  2008      4    1      1       29           1                       1   
1  2008      4    1      2       29           1                       1   
2  2008      4    1      3       29           1                       2   
3  2008      4    1      4       29           1                       2   
4  2008      4    1      5       29           1                       2   

  page 2 (clothing model)  colour  location  model photography  price  \
0                     A13       1         5                  1     28   
1                     A16       1         6                  1     33   
2                      B4      10         2                  1     52   
3                     B17       6         6                  2     38   
4                      B8       4         3                  2     52   

   price 2  page  
0        2     1  
1

In [6]:
try:
    # Adjust the number '15' based on how many lines you need to skip
    df_raw = pd.read_csv(raw_data_path, sep=',', skiprows=15) # Adjust sep if needed
    print("Successfully loaded skipping initial rows. Displaying first 5 rows:")
    print(df_raw.head())
    # IMPORTANT: Check if the header row is now correct. If not, you might need header=None
    print("\nDataFrame Info:")
    df_raw.info()
except Exception as e:
    print(f"Loading with skiprows failed. Error: {e}")
    

Successfully loaded skipping initial rows. Displaying first 5 rows:
    2008;4;1;6;29;2;1;A10;3;4;1;38;2;1
0   2008;4;1;7;29;2;2;B27;2;3;1;57;1;2
1    2008;4;1;8;29;2;4;P1;3;1;1;38;1;1
2   2008;4;1;9;29;2;4;P34;9;6;2;48;1;2
3  2008;4;1;10;29;2;4;P33;9;5;1;43;1;2
4   2008;4;1;1;21;3;2;B17;6;6;2;38;2;1

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165459 entries, 0 to 165458
Data columns (total 1 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   2008;4;1;6;29;2;1;A10;3;4;1;38;2;1  165459 non-null  object
dtypes: object(1)
memory usage: 1.3+ MB


In [7]:
# Define column names based on the description you provided
column_names = [
    'YEAR', 'MONTH', 'DAY', 'ORDER', 'COUNTRY', 'SESSION ID',
    'PAGE 1 (MAIN CATEGORY)', 'PAGE 2 (CLOTHING MODEL)', 'COLOUR',
    'LOCATION', 'MODEL PHOTOGRAPHY', 'PRICE', 'PRICE 2', 'PAGE'
]

try:
    # Example: Skip 15 description lines, no header in data, use semicolon delimiter
    df_raw = pd.read_csv(raw_data_path, sep=';', skiprows=15, header=None, names=column_names)
    print("Successfully loaded with specified names, skipping rows, no header. Displaying first 5 rows:")
    print(df_raw.head())
    print("\nDataFrame Info:")
    df_raw.info()
except Exception as e:
    print(f"Loading with specified names failed. Error: {e}")

Successfully loaded with specified names, skipping rows, no header. Displaying first 5 rows:
   YEAR  MONTH  DAY  ORDER  COUNTRY  SESSION ID  PAGE 1 (MAIN CATEGORY)  \
0  2008      4    1      6       29           2                       1   
1  2008      4    1      7       29           2                       2   
2  2008      4    1      8       29           2                       4   
3  2008      4    1      9       29           2                       4   
4  2008      4    1     10       29           2                       4   

  PAGE 2 (CLOTHING MODEL)  COLOUR  LOCATION  MODEL PHOTOGRAPHY  PRICE  \
0                     A10       3         4                  1     38   
1                     B27       2         3                  1     57   
2                      P1       3         1                  1     38   
3                     P34       9         6                  2     48   
4                     P33       9         5                  1     43   

   PRICE 2  PAGE 

In [8]:
# Define column names based on the description you provided
column_names = [
    'YEAR', 'MONTH', 'DAY', 'ORDER', 'COUNTRY', 'SESSION ID',
    'PAGE 1 (MAIN CATEGORY)', 'PAGE 2 (CLOTHING MODEL)', 'COLOUR',
    'LOCATION', 'MODEL PHOTOGRAPHY', 'PRICE', 'PRICE 2', 'PAGE'
]

try:
    # Example: Skip 15 description lines, no header in data, use semicolon delimiter
    df_raw = pd.read_csv(raw_data_path, sep=';', skiprows=15, header=None, names=column_names)
    print("Successfully loaded with specified names, skipping rows, no header. Displaying first 5 rows:")
    print(df_raw.head())
    print("\nDataFrame Info:")
    df_raw.info()
except Exception as e:
    print(f"Loading with specified names failed. Error: {e}")

Successfully loaded with specified names, skipping rows, no header. Displaying first 5 rows:
   YEAR  MONTH  DAY  ORDER  COUNTRY  SESSION ID  PAGE 1 (MAIN CATEGORY)  \
0  2008      4    1      6       29           2                       1   
1  2008      4    1      7       29           2                       2   
2  2008      4    1      8       29           2                       4   
3  2008      4    1      9       29           2                       4   
4  2008      4    1     10       29           2                       4   

  PAGE 2 (CLOTHING MODEL)  COLOUR  LOCATION  MODEL PHOTOGRAPHY  PRICE  \
0                     A10       3         4                  1     38   
1                     B27       2         3                  1     57   
2                      P1       3         1                  1     38   
3                     P34       9         6                  2     48   
4                     P33       9         5                  1     43   

   PRICE 2  PAGE 

In [9]:
try:
    df_raw = pd.read_csv(raw_data_path, sep=';', encoding='latin1') # Example
    print("Successfully loaded with latin1 encoding. Displaying first 5 rows:")
    print(df_raw.head())
    print("\nDataFrame Info:")
    df_raw.info()
except Exception as e:
    print(f"Loading with latin1 encoding failed. Error: {e}")

Successfully loaded with latin1 encoding. Displaying first 5 rows:
   year  month  day  order  country  session ID  page 1 (main category)  \
0  2008      4    1      1       29           1                       1   
1  2008      4    1      2       29           1                       1   
2  2008      4    1      3       29           1                       2   
3  2008      4    1      4       29           1                       2   
4  2008      4    1      5       29           1                       2   

  page 2 (clothing model)  colour  location  model photography  price  \
0                     A13       1         5                  1     28   
1                     A16       1         6                  1     33   
2                      B4      10         2                  1     52   
3                     B17       6         6                  2     38   
4                      B8       4         3                  2     52   

   price 2  page  
0        2     1  
1    

In [11]:
from sklearn.model_selection import train_test_split
# Check if df_raw exists and is a DataFrame
if 'df_raw' in locals() and isinstance(df_raw, pd.DataFrame):
    print(f"Original dataset shape: {df_raw.shape}")

    # Define the test set size (e.g., 20% of the data)
    test_set_size = 0.20

    # Split the data
    # random_state ensures the split is the same every time we run the code (reproducibility)
    df_train, df_test = train_test_split(df_raw, test_size=test_set_size, random_state=42)

    print(f"Train set shape: {df_train.shape}")
    print(f"Test set shape: {df_test.shape}")

else:
    print("Error: df_raw is not loaded correctly. Please fix the data loading steps above.")

Original dataset shape: (165474, 14)
Train set shape: (132379, 14)
Test set shape: (33095, 14)


In [12]:
# Define paths for the new train and test files
train_output_path = os.path.join("data", "train.csv")
test_output_path = os.path.join("data", "test.csv")

# Save the DataFrames to CSV files
# index=False prevents pandas from writing the DataFrame index as a column
if 'df_train' in locals() and 'df_test' in locals():
    try:
        df_train.to_csv(train_output_path, index=False)
        df_test.to_csv(test_output_path, index=False)
        print(f"Successfully saved train.csv to {train_output_path}")
        print(f"Successfully saved test.csv to {test_output_path}")
    except Exception as e:
        print(f"Error saving files: {e}")
else:
    print("Error: Train/Test split did not occur. Cannot save files.")

Successfully saved train.csv to data\train.csv
Successfully saved test.csv to data\test.csv
