## Web Scraping Car Listings from Cars45

This web scraping project focuses on extracting car listings and pricing information from the Cars45 website. The goal is to collect data on a wide range of car listings to facilitate analysis and decision-making.

Website to scrape [https://www.cars45.com/listing?page=1](https://www.cars45.com/listing?page=1)

In [1]:
# import necessary libraries
import csv
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:

# Scraper configuration
BASE_URL = 'https://www.cars45.com/'
FIRST_PAGE = 1         # first listing page to crawl
LAST_PAGE = 216        # last listing page to crawl (inclusive)
REQUEST_TIMEOUT = 30   # seconds before a stalled request is abandoned
PAGE_DELAY = 1         # seconds to pause after each listing page


def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _parse_general_info(info_block):
    """Turn one ``general-info grid`` element into a feature dictionary.

    The dictionary keys are the texts of the ``general-info__value`` spans and
    the dictionary values are the texts of the ``general-info__name``
    paragraphs. The keys therefore end up as the DataFrame column names
    ('Make', 'Model', ...).
    """
    feature_names = info_block.find_all('p', class_='general-info__name')
    feature_values = info_block.find_all('span', class_='general-info__value')
    return {
        value.text.strip(): name.text.strip()
        for name, value in zip(feature_names, feature_values)
    }


def _scrape_listing_page(page_number):
    """Return a list of ``(car_info, price_text)`` pairs for one listing page.

    Each price is paired with the car link at the same position on the page,
    so a car whose detail page cannot be fetched or parsed is dropped together
    with its price. This keeps ``car_list`` and ``prices`` row-aligned.
    Problems are reported with ``print`` and the affected page or car is
    skipped.
    """
    url = f'{BASE_URL}listing?page={page_number}'
    try:
        soup = _fetch_soup(url)
    except requests.RequestException as exc:
        print(f'Page {page_number}: request failed ({exc}); skipping page')
        return []

    # Find the section containing car listings
    section = soup.find('section', class_='cars-grid grid')
    if section is None:
        print(f'Page {page_number}: no listings section found; skipping page')
        return []

    # One price text per listing card; None marks a card without a price tag
    page_prices = []
    for div in section.find_all('div', class_='car-feature__details'):
        amount = div.find('p', class_='car-feature__amount')
        page_prices.append(amount.text.strip() if amount is not None else None)

    # One href per listing card
    car_links = soup.find_all('a', class_='car-feature car-feature--wide-mobile')
    hrefs = [anchor.get('href') for anchor in car_links]

    # Pairing by position is only meaningful when both lists have the same length
    if len(hrefs) != len(page_prices):
        print(
            f'Page {page_number}: found {len(hrefs)} links but '
            f'{len(page_prices)} prices; skipping page'
        )
        return []

    records = []
    for href, price_text in zip(hrefs, page_prices):
        if href is None or price_text is None:
            continue

        # urljoin handles hrefs with or without a leading slash, and absolute URLs
        car_url = urljoin(BASE_URL, href)
        try:
            car_soup = _fetch_soup(car_url)
        except requests.RequestException as exc:
            print(f'Skipping {car_url}: {exc}')
            continue

        # Every general information section becomes one record with the card's price
        for info_block in car_soup.find_all('div', class_='general-info grid'):
            records.append((_parse_general_info(info_block), price_text))

    return records


# Initialize empty lists to store car information
car_list = []  # List to store car details (one dict per car)
prices = []    # List to store car prices (raw text, same order as car_list)

# Loop through the configured listing pages
for i in range(FIRST_PAGE, LAST_PAGE + 1):
    for car_info, car_price in _scrape_listing_page(i):
        car_list.append(car_info)
        prices.append(car_price)

    # Pause between listing pages to avoid sending too many requests in a short time
    time.sleep(PAGE_DELAY)


In [3]:
# Number of car detail records collected by the scraping loop
len(car_list)

3240

In [4]:
# Number of price strings collected; the positional Price assignment below
# relies on this matching len(car_list)
len(prices)

3240

In [5]:
## Load the scraped records (a list of dicts, one per car) into a DataFrame;
## the dictionary keys become the column names
data = car_list
df = pd.DataFrame(data=data)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Make,Model,Year of manufacture,Colour,Condition,Mileage,Engine Size,Registered city,Selling Condition,Bought Condition,Trim,Drivetrain,Seats,Number of Cylinders,Horse Power,Second Condition,Interior Color,Registered Car,Exchange Possible,Key Features
0,Toyota,Camry,2009,Gray,Nigerian Used,110027,3500,Lagos,Registered,Registered,,,,,,,,,,
1,Infiniti,M,2013,Black,Nigerian Used,87256,3700,LAGOS,Registered,Imported,,,,,,,,,,
2,Honda,Civic,2007,White,Nigerian Used,153814,1800,ABUJA,Registered,Imported,,,,,,,,,,
3,Acura,MDX,2008,Silver,Nigerian Used,118730,3700,,Registered,Registered,,,,,,,,,,
4,Toyota,Land Cruiser Prado,2007,Black,Nigerian Used,96601,2700,LAGOS,Registered,Registered,,,,,,,,,,


In [6]:
# Remove currency symbol and commas, then convert to float.
# Going through str() keeps this cell re-runnable: after the first run
# `prices` already holds floats, which have no .replace() method.
prices = [
    float(str(price).replace('₦', '').replace(',', '').strip())
    for price in prices
]

# Create a DataFrame
df2 = pd.DataFrame({'Price': prices})


In [7]:
## add the transformed 'Price' column to the dataset
# Assigning the DataFrame `df2` directly would align on the index and silently
# fill NaN wherever the two frames differ in length, so check the row counts
# explicitly and assign by position instead.
if len(df2) != len(df):
    raise ValueError(
        f'Row count mismatch: {len(df2)} prices for {len(df)} car records'
    )
df['Price'] = df2['Price'].to_numpy()

df.head()

Unnamed: 0,Make,Model,Year of manufacture,Colour,Condition,Mileage,Engine Size,Registered city,Selling Condition,Bought Condition,...,Drivetrain,Seats,Number of Cylinders,Horse Power,Second Condition,Interior Color,Registered Car,Exchange Possible,Key Features,Price
0,Toyota,Camry,2009,Gray,Nigerian Used,110027,3500,Lagos,Registered,Registered,...,,,,,,,,,,3600000.0
1,Infiniti,M,2013,Black,Nigerian Used,87256,3700,LAGOS,Registered,Imported,...,,,,,,,,,,6750000.0
2,Honda,Civic,2007,White,Nigerian Used,153814,1800,ABUJA,Registered,Imported,...,,,,,,,,,,2925000.0
3,Acura,MDX,2008,Silver,Nigerian Used,118730,3700,,Registered,Registered,...,,,,,,,,,,2981250.0
4,Toyota,Land Cruiser Prado,2007,Black,Nigerian Used,96601,2700,LAGOS,Registered,Registered,...,,,,,,,,,,13500000.0


In [8]:
# List every column produced by the scrape, plus the added 'Price' column
df.columns

Index(['Make', 'Model', 'Year of manufacture', 'Colour', 'Condition',
       'Mileage', 'Engine Size', 'Registered city', 'Selling Condition',
       'Bought Condition', 'Trim', 'Drivetrain', 'Seats',
       'Number of Cylinders', 'Horse Power', 'Second Condition',
       'Interior Color', 'Registered Car', 'Exchange Possible', 'Key Features',
       'Price'],
      dtype='object')

In [9]:
# Scraped feature columns that are not kept for the final dataset
# NOTE(review): the head() previews show no values in these columns, so they
# are presumably too sparse to be useful. Confirm before relying on that.
cols_to_drop = ['Trim', 'Seats', 'Drivetrain', 'Number of Cylinders',
       'Horse Power', 'Second Condition', 'Interior Color', 'Registered Car',
       'Exchange Possible', 'Key Features']

# errors='ignore' keeps the cell re-runnable and tolerant of a scrape in which
# one of these optional features never appeared; a plain drop raises KeyError
# for any label that is missing.
df = df.drop(columns=cols_to_drop, errors='ignore')

In [10]:
# Preview the scraped data after removing the unused feature columns
df.head()

Unnamed: 0,Make,Model,Year of manufacture,Colour,Condition,Mileage,Engine Size,Registered city,Selling Condition,Bought Condition,Price
0,Toyota,Camry,2009,Gray,Nigerian Used,110027,3500,Lagos,Registered,Registered,3600000.0
1,Infiniti,M,2013,Black,Nigerian Used,87256,3700,LAGOS,Registered,Imported,6750000.0
2,Honda,Civic,2007,White,Nigerian Used,153814,1800,ABUJA,Registered,Imported,2925000.0
3,Acura,MDX,2008,Silver,Nigerian Used,118730,3700,,Registered,Registered,2981250.0
4,Toyota,Land Cruiser Prado,2007,Black,Nigerian Used,96601,2700,LAGOS,Registered,Registered,13500000.0


### Adding a ready-made dataset to the scraped data to increase its size

In [11]:
# Columns of the CSV that are discarded immediately after loading
columns_to_drop = [
    'car_id', 'fuel type', 'gear type', 'Trim', 'car',
    'Drivetrain', 'Seats', 'Number of Cylinders', 'Horse Power',
]

# Read the ready-made dataset and remove those columns in one chained step
add_df = pd.read_csv('car_prices.csv').drop(columns=columns_to_drop)


In [12]:
# Preview the additional dataset after dropping the unused columns
add_df.head()

Unnamed: 0,price,Make,Model,Year of manufacture,Colour,Condition,Mileage,Engine Size,Selling Condition,Bought Condition,Registered city
0,2812500,Toyota,Corolla,2004,Silver,Nigerian Used,251732.0,1800.0,Registered,Registered,
1,5000000,Acura,MDX,2007,Green,Foreign Used,224589.0,3700.0,Imported,Imported,
2,7592000,Acura,TL,2012,White,Foreign Used,111334.0,3500.0,Imported,Imported,
3,9880000,Land Rover,Range Rover,2012,Gray,Nigerian Used,72682.0,4000.0,Registered,Imported,
4,4160000,Ford,Edge,2013,Gray,Nigerian Used,92118.0,3500.0,Registered,Registered,Lagos


In [14]:
# Reorder the columns of add_df so that 'price' becomes the right-most one.
column_to_move = 'price'

# Take the present column order as a plain Python list
current_columns = list(add_df.columns)

# Pull the column out of its current position and push it onto the end
position = current_columns.index(column_to_move)
current_columns.append(current_columns.pop(position))

# Re-select the columns in the new order
add_df = add_df[current_columns]


In [15]:
# Preview add_df with 'price' moved to the last column
add_df.head()

Unnamed: 0,Make,Model,Year of manufacture,Colour,Condition,Mileage,Engine Size,Selling Condition,Bought Condition,Registered city,price
0,Toyota,Corolla,2004,Silver,Nigerian Used,251732.0,1800.0,Registered,Registered,,2812500
1,Acura,MDX,2007,Green,Foreign Used,224589.0,3700.0,Imported,Imported,,5000000
2,Acura,TL,2012,White,Foreign Used,111334.0,3500.0,Imported,Imported,,7592000
3,Land Rover,Range Rover,2012,Gray,Nigerian Used,72682.0,4000.0,Registered,Imported,,9880000
4,Ford,Edge,2013,Gray,Nigerian Used,92118.0,3500.0,Registered,Registered,Lagos,4160000


In [16]:
## replace the lower-case 'price' column with a 'Price' column so the name
## matches the scraped DataFrame
# assign() returns a new frame instead of writing into add_df in place, which
# avoids the SettingWithCopyWarning pandas can emit when a column is added to
# a frame that was itself produced by column selection.
# The membership check keeps the cell re-runnable once 'price' has been removed.
if 'price' in add_df.columns:
    add_df = add_df.assign(Price=add_df['price']).drop(columns='price')

In [17]:
# Stack the scraped rows on top of the additional rows. ignore_index=True
# renumbers the combined rows from 0, so no separate reset_index is needed.
# NOTE(review): in the displayed result, Mileage and Engine Size print without
# decimals for scraped rows and with decimals for CSV rows. The scraped values
# may still be strings. Consider converting them to numbers before modelling.
new_df = pd.concat([df, add_df], axis=0, ignore_index=True)

In [18]:
# Display the combined dataset (pandas abbreviates the middle rows)
new_df

Unnamed: 0,Make,Model,Year of manufacture,Colour,Condition,Mileage,Engine Size,Registered city,Selling Condition,Bought Condition,Price
0,Toyota,Camry,2009,Gray,Nigerian Used,110027,3500,Lagos,Registered,Registered,3600000.0
1,Infiniti,M,2013,Black,Nigerian Used,87256,3700,LAGOS,Registered,Imported,6750000.0
2,Honda,Civic,2007,White,Nigerian Used,153814,1800,ABUJA,Registered,Imported,2925000.0
3,Acura,MDX,2008,Silver,Nigerian Used,118730,3700,,Registered,Registered,2981250.0
4,Toyota,Land Cruiser Prado,2007,Black,Nigerian Used,96601,2700,LAGOS,Registered,Registered,13500000.0
...,...,...,...,...,...,...,...,...,...,...,...
6957,Toyota,RAV4,2003,Red,Nigerian Used,282769.0,2000.0,Plateau,Registered,Imported,1924000.0
6958,Mercedes-Benz,M Class,2002,Silver,Nigerian Used,182544.0,3200.0,,Registered,Registered,1890000.0
6959,Toyota,Camry,2000,Black,Nigerian Used,234354.0,2200.0,RIVERS,Registered,Registered,1155000.0
6960,Toyota,RAV4,2001,Black,Nigerian Used,345926.0,2000.0,,Registered,Registered,1260000.0


In [19]:
## save new_df to a csv file

#save_file = new_df.to_csv('train.csv')