In [24]:
import pandas as pd
import os
import csv

# Define the path to your data directory
data_directory = "./data"
file_names = [
    file
    for file in os.listdir(data_directory)
    if all([file.endswith(".csv"), "concatenated_autotrader.csv" not in file])
]

# !EXTRACT

# Import all CSV files as a list of dataframes
dataframes = []
for file_name in file_names:
    file_path = os.path.join(data_directory, file_name)
    try:
        df = pd.read_csv(file_path)
        dataframes.append(df)
        headers_list = list(df.columns)
        first_row_list = list(df.iloc[0])
                # Writing the lists to a CSV file
        with open('lists.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers_list)
            writer.writerow( first_row_list)

    except Exception as e:
        print(f"Error reading {file_name}: {e}")

# Concatenate all dataframes into one
concatenated_df = pd.concat(dataframes, ignore_index=True)

# Optional: Save the concatenated dataframe to a new CSV file
concatenated_df.to_csv(
    os.path.join(data_directory, "concatenated_autotrader.csv"), index=False
)

print("All datasets have been concatenated into one dataframe.")


All datasets have been concatenated into one dataframe.


In [29]:
import pandas as pd
import re 

# Step 1: Load the concatenated CSV file
df = pd.read_csv('./data/concatenated_autotrader.csv')

# !TRANSFORM

# Step 2: Manually mapping current headers to desired headers
# This is an example based on the given first row of data and desired headers.
# You'll need to adjust these based on the actual data in your file.
header_mapping = {
    'sc-kpDqfm 2': 'Make Model',
    'sc-kpDqfm href': 'Title_URL',
    'sc-jEACwC 2': 'Sale_Type',  # Example guess, adjust based on actual data inspection
    'at__sc-1mc7cl3-5': 'Price',
    'sc-jlZhew 2': 'Additional_Comments',
    'sc-jlZhew 3': 'Year',
    'sc-jlZhew 4': 'Car_Type',
    'sc-jlZhew 5': 'Mileage',
    'sc-jlZhew 6': 'Engine_Size',
    'sc-jlZhew 7': 'Engine_Type',
    'sc-jlZhew 8': 'Transmission',
    'sc-jlZhew 9': 'Dealership_Location',
    'sc-jlZhew 10': 'Prior_Owners',
    'sc-jlZhew': 'Information/Detail'
}

# Renaming the columns based on the mapping
df_renamed = df.rename(columns=header_mapping)

# Select only the columns we want to keep, dropping any that aren't listed in the mapping
df_transformed = df_renamed[list(header_mapping.values())]

# Handle potentially non-string (e.g., NaN) values in 'Make Model' before splitting
df_transformed['Make'] = df_transformed['Make Model'].apply(lambda x: x.split(' ')[0] if isinstance(x, str) else '')
df_transformed['Model'] = df_transformed['Make Model'].apply(lambda x: ' '.join(x.split(' ')[1:]) if isinstance(x, str) and len(x.split(' ')) > 1 else '')

df_transformed.drop(columns=['Make Model'], inplace=True)



# Transform the Price column from "£40,750" to an integer, handling missing values
df_transformed['Price'] = df_transformed['Price'].str.replace('£', '').str.replace(',', '')
df_transformed['Price'] = pd.to_numeric(df_transformed['Price'], errors='coerce')

# Use a regular expression to remove any content within parentheses and any non-numeric characters, then convert to numeric
df_transformed['Year'] = df_transformed['Year'].astype(str).apply(lambda x: re.sub(r'\s*\(.*\)', '', x))
df_transformed['Year'] = df_transformed['Year'].str.extract('(\d{4})')  # Extracts only the year (4 digits)
df_transformed['Year'] = pd.to_numeric(df_transformed['Year'], errors='coerce')

df_transformed['Mileage'] = df_transformed['Mileage'].str.replace('miles', '')
df_transformed['Mileage'] = pd.to_numeric(df_transformed['Mileage'], errors='coerce')

df_transformed['Prior_Owners'] = df_transformed['Prior_Owners'].str.replace('owner', '').replace('s', '')
df_transformed['Prior_Owners'] = pd.to_numeric(df_transformed['Prior_Owners'], errors='coerce')


# Reorder columns to match the requested sequence
columns_order = [
    'Make', 'Title_URL', 'Sale_Type', 'Price', 'Model', 'Additional_Comments',
    'Year', 'Car_Type', 'Mileage', 'Engine_Size', 'Engine_Type',
    'Transmission', 'Dealership_Location', 'Prior_Owners', 'Information/Detail'
]
df_transformed = df_transformed[columns_order]

# Step 4: Save the transformed DataFrame to a new CSV file
df_transformed.to_csv('data/transformed_autotrader.csv', index=False)

print("The transformed data has been saved to 'transformed_autotrader.csv'.")

The transformed data has been saved to 'transformed_autotrader.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transformed['Make'] = df_transformed['Make Model'].apply(lambda x: x.split(' ')[0] if isinstance(x, str) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transformed['Model'] = df_transformed['Make Model'].apply(lambda x: ' '.join(x.split(' ')[1:]) if isinstance(x, str) and len(x.split(' ')) > 1 else '')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus