In [10]:
import requests
from tqdm import tqdm
import pandas as pd
import re
import os

# CarQuery


In [41]:
BASE_URL = 'https://www.carqueryapi.com/api/0.3'

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}


def get_makes(year, sold_in_us=1):
    params = {
        'cmd': 'getMakes',
        'year': year,
        'sold_in_us': sold_in_us
    }
    response = requests.get(BASE_URL, params=params, headers = header)
    response.raise_for_status()
    return response.json()['Makes']  # Adjust based on the actual key for makes

def get_models(make, year=None, sold_in_us=1, body=None):
    params = {
        'cmd': 'getModels',
        'make': make,
        'sold_in_us': sold_in_us
    }
    if year:
        params['year'] = year
    if body:
        params['body'] = body

    response = requests.get(BASE_URL, params=params, headers = header)
    response.raise_for_status()
    return response.json()['Models']  # Adjust based on the actual key for models

def get_trims(make, model, year=None, trim=None, body=None):
    params = {
        'cmd': 'getTrims',
        'make': make,
        'model' : model
    }
    if year:
        params['year'] = year
    if trim:
        params['trim'] = trim
    if body:
        params['body'] = body

    response = requests.get(BASE_URL, params=params, headers = header)
    response.raise_for_status()
    return response.json()['Trims']  # Adjust based on the actual key for trims

def get_model(model_id):
    params = {
        'cmd': 'getModel',
        'model': model_id
    }
    response = requests.get(BASE_URL, params=params, headers = header)
    response.raise_for_status()
    return response.json()  # Adjust based on how the API returns data for a single model

def extract_car_info(car_string):
    pattern = re.compile(r'^(.*?)\s(.*?)(?:\s\((\d{4})-?\d*\)|\s(\d{4})|$)')
    match = pattern.search(car_string)
    if match:
        make = match.group(1)
        model = match.group(2)
        year = match.group(3) if match.group(3) else match.group(4)
        return make, model, year
    else:
        return None, None, None
    
def process_chunk(chunk):    
    # Set the columns that you want to keep from the API data
    api_columns = [
        'model_make_id', 'model_name', 'model_trim', 'model_year', 'model_body',
        'model_engine_cyl', 'model_engine_fuel', 'model_drive', 'model_transmission_type',
        'model_seats', 'model_weight_kg', 'model_fuel_cap_l'
    ]

    api_data = []

    # Process each row in the DataFrame
    for index, row in chunk.iterrows():
        name = row['model']
        reg_date = row['reg_date']

        name_split = name.split(' ')
        make = name_split[0]
        model = name_split[1]

        year = pd.to_datetime(reg_date).year if pd.notnull(reg_date) else None

        model_info = get_trims(make=make, model=model, year= year)

        if model_info:
            first_trim = model_info[0]  # Safely get the first trim
            # Select only the desired columns
            selected_trim_info = {key: first_trim.get(key, pd.NA) for key in api_columns}
            print(str(index) + str(selected_trim_info))
            api_data.append(selected_trim_info)
        else:
            print(str(index) + " Nothing found")
            placeholder = {key: pd.NA for key in api_columns}
            api_data.append(placeholder)
    
    result_df = pd.concat([chunk, pd.DataFrame(api_data)], axis = 1)
    
    return result_df





# Loading the data

In [42]:

data_filename = "../Datasets/merged_dataset.csv"
start_row = 2400 
chunk_size = 1000

chunk = pd.read_csv(data_filename, skiprows= range(1,start_row + 1), nrows=chunk_size, header=0)

# print(chunk)
api_df = process_chunk(chunk)

output_filename = '../Datasets/CarQuery.csv'

# Check if the file exists and has content
if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
    header = False  # Don't write header if file already exists
else:
    header = True  # Write header if file doesn't exist

# Append or write the DataFrame to the CSV file
api_df.to_csv(output_filename, mode='a', index=False, header=header)

0{'model_make_id': 'Subaru', 'model_name': 'Forester', 'model_trim': '2.0XT Premium 4dr SUV AWD (2.0L 4cyl Turbo CVT)', 'model_year': '2017', 'model_body': 'Sport Utility Vehicles', 'model_engine_cyl': '4', 'model_engine_fuel': 'Premium Unleaded (Required)', 'model_drive': 'All Wheel Drive', 'model_transmission_type': 'Automatic', 'model_seats': None, 'model_weight_kg': '3624', 'model_fuel_cap_l': '16'}
1 Nothing found
2{'model_make_id': 'BMW', 'model_name': 'X1', 'model_trim': 'sDrive28i 4dr SUV (2.0L 4cyl Turbo 8A)', 'model_year': '2019', 'model_body': 'Sport Utility Vehicles', 'model_engine_cyl': '4', 'model_engine_fuel': 'Premium Unleaded (Required)', 'model_drive': 'Rear Wheel Drive', 'model_transmission_type': 'Automatic', 'model_seats': None, 'model_weight_kg': '3527', 'model_fuel_cap_l': '17'}
3 Nothing found
4 Nothing found
5 Nothing found
6 Nothing found
7 Nothing found
8 Nothing found
9{'model_make_id': 'Toyota', 'model_name': 'Corolla', 'model_trim': 'L 4dr Sedan (1.8L 4cyl

In [13]:
data_filename = "../Dataset/merged_dataset.csv"

df = pd.read_csv(data_filename, nrows= 500, header= None)

column_names = ['name', 'price', 'depreciation', 'mileage', 'eng_cap', 'power', 'reg_date', 'coe_left', 'owners', ' omv', 'arf', 'accessories']
df.columns = column_names
res = []
counter = 0
for index, row in df.iterrows():
    info = extract_car_info(row['new_name'])
    if info == (None, None, None):
        counter += 1
        print(row)
    res.append(info)
print(counter)
print(res)

Name                    name
Price                  price
Depreciation    depreciation
Mileage              mileage
eng_cap              eng_cap
power                  power
reg_date            reg_date
coe_left            coe_left
owners                owners
 omv                     omv
arf                      arf
accessories      accessories
new_name            new_name
Name: 0, dtype: object
Name            BMW 7 Series Mild Hybrid 735i sDrive Pure Exce...
Price                                                      537800
Depreciation                                                50580
Mileage                                                      3900
eng_cap                                                      2998
power                                                         213
reg_date                                              20-Mar-2023
coe_left                                                     8.99
owners                                                          1
 omv  

In [21]:
result_df = pd.concat([df, api_df], axis = 1)

output_filename = "combined_used_cars2.csv"

result_df.to_csv(output_filename, index = False)