<a href="https://colab.research.google.com/github/hunterad93/projects/blob/main/carbitrage_page_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def parse_name(name):
    """Extract the make, model and year from a vehicle name string.

    Parameters
    ----------
    name : str or None
        Free-text vehicle name, e.g. ``"2016 Subaru Forester 2.5i"``.

    Returns
    -------
    tuple
        ``(make, model, year)``. ``year`` is the first standalone four-digit
        number in ``name``, returned as a string. ``make`` and ``model`` are
        the first purely alphabetic word and the token that follows it.
        Any part that cannot be found is ``None``; a missing or non-string
        ``name`` yields ``(None, None, None)``.
    """
    # re.search raises TypeError on None, so bail out before matching.
    if not isinstance(name, str):
        return None, None, None

    year_pattern = r'\b\d{4}\b'
    make_model_pattern = r'\b([A-Za-z]+)\s+([A-Za-z0-9-]+)'

    year_match = re.search(year_pattern, name)
    make_model_match = re.search(make_model_pattern, name)

    year = year_match.group(0) if year_match else None
    make, model = make_model_match.groups() if make_model_match else (None, None)

    return make, model, year

def parse_html(soup):
    """Collect vehicle attributes from a parsed listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single listing page.

    Returns
    -------
    dict
        Keys ``'Title Status'``, ``'Paint Color'``, ``'Odometer'``,
        ``'Drive'``, ``'Condition'``, ``'Make'``, ``'Model'`` and ``'Year'``.
        ``'Odometer'`` is an ``int``; every value is ``None`` when the page
        does not provide it (or, for the odometer, when it is not numeric).
    """
    # The vehicle name is the bold text inside the first attribute group.
    name_group = soup.find('p', class_='attrgroup')
    name_tag = name_group.find('b') if name_group is not None else None
    car_name = name_tag.text.strip() if name_tag is not None else None

    # Every "key: value" span in any attribute group becomes a dict entry.
    attributes = {}
    for group in soup.find_all('p', class_='attrgroup'):
        for span in group.find_all('span'):
            # partition splits on the first colon only, so values that
            # themselves contain ':' are kept whole instead of raising.
            key, sep, value = span.text.partition(':')
            if sep:
                attributes[key.strip()] = value.strip()

    # Only parse a name that is actually present.
    make, model, year = parse_name(car_name) if car_name else (None, None, None)

    # The odometer may be absent or formatted with thousands separators.
    odometer = attributes.get('odometer')
    if odometer is not None:
        try:
            odometer = int(odometer.replace(',', ''))
        except ValueError:
            odometer = None

    parsed_data = {
        'Title Status': attributes.get('title status'),
        'Paint Color': attributes.get('paint color'),
        'Odometer': odometer,
        'Drive': attributes.get('drive'),
        'Condition': attributes.get('condition'),
        'Make': make,
        'Model': model,
        'Year': year
    }

    return parsed_data




In [2]:
!pip install beautifulsoup4 requests pandas

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re



# Craigslist listing pages that the loop below fetches and parses, one URL per posting.
url_list = [
    'https://minneapolis.craigslist.org/hnp/cto/d/saint-michael-1998-subaru-forester/7626667598.html',
    'https://minneapolis.craigslist.org/ank/cto/d/champlin-subaru-forester-as-is/7624015627.html',
    'https://minneapolis.craigslist.org/wsh/cto/d/saint-paul-2016-subaru-forester-25i/7621121149.html',
    'https://minneapolis.craigslist.org/ank/cto/d/minneapolis-subaru-forester-2014/7620355994.html',
    'https://siouxcity.craigslist.org/cto/d/spirit-lake-2016-subaru-forester-touring/7629107422.html',
    'https://stcloud.craigslist.org/cto/d/alexandria-2010-subaru-forester-25x/7628737384.html',
    'https://duluth.craigslist.org/cto/d/duluth-2016-subaru-forester/7627821456.html',
    'https://minneapolis.craigslist.org/hnp/cto/d/hopkins-2019-subaru-outback-25i-premium/7629205310.html',
    'https://minneapolis.craigslist.org/hnp/cto/d/eden-prairie-2017-subaru-impreza/7629166784.html',
    'https://minneapolis.craigslist.org/hnp/cto/d/minneapolis-2013-subaru-forester-ll/7620709154.html'
]

# Column order of the final table
columns = ['Make', 'Model', 'Year', 'Miles', 'Price', 'Title', 'Paint', 'Drive', 'Condition', 'Description', 'Latitude', 'Longitude']

# Collect one record per listing and build the DataFrame once at the end;
# concatenating inside the loop re-copies every earlier row on each pass.
records = []

for url in url_list:
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        script_tag = soup.find('script', {'id': 'ld_posting_data'})

        if script_tag is None:
            print(f"Skipping URL {url} - script_tag not found")
            continue

        # The script tag carries the posting's structured data as JSON.
        json_data = json.loads(script_tag.string)

        # Attributes come from the HTML; description, price and location from the JSON.
        parsed_data = parse_html(soup)
        offers = json_data['offers']
        geo = offers['availableAtOrFrom']['geo']

        records.append({
            'Make': parsed_data['Make'],
            'Model': parsed_data['Model'],
            'Year': parsed_data['Year'],
            'Miles': parsed_data['Odometer'],
            'Price': offers['price'],
            'Title': parsed_data['Title Status'],
            'Paint': parsed_data['Paint Color'],
            'Drive': parsed_data['Drive'],
            'Condition': parsed_data['Condition'],
            'Description': json_data['description'],
            'Latitude': geo['latitude'],
            'Longitude': geo['longitude'],
        })

    except (AttributeError, KeyError, TypeError, ValueError, requests.RequestException) as e:
        # Network failure, malformed JSON (a ValueError subclass) or an
        # unexpected page layout: report it and move on to the next listing.
        print(f"Skipping URL {url} - Error: {e}")

# Passing columns fixes the column order and yields an empty, correctly
# labelled frame when every listing was skipped.
df = pd.DataFrame(records, columns=columns)

# view the DataFrame
df



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Skipping URL https://minneapolis.craigslist.org/hnp/cto/d/minneapolis-2013-subaru-forester-ll/7620709154.html - script_tag not found


Unnamed: 0,Make,Model,Year,Miles,Price,Title,Paint,Drive,Condition,Description,Latitude,Longitude
0,subaru,forester,1998,199000,3000.0,clean,,,,"Rust free, new tires, awd Ac works Great. 19...",45.2064,-93.6593
1,Subaru,Forester,2006,130320,6000.0,salvage,silver,4wd,good,Selling AS IS: cosmetic damage as seen in the ...,45.204562,-93.37755
2,subaru,forester,2016,89000,17000.0,clean,blue,4wd,excellent,Rare find. Just arrived from Arizona - no rus...,44.9913,-92.9487
3,subaru,forester,2014,176000,10900.0,clean,brown,4wd,excellent,CLEAN TITLE Clean Inside&amp;Out Highway mi...,45.1697,-93.1889
4,Subaru,Forester,2016,75800,16350.0,salvage,grey,4wd,excellent,"AWD Subaru Forester Touring from Prescott, AZ ...",43.4257,-95.1057
5,Subaru,Forester,2010,32850,13900.0,clean,blue,4wd,excellent,"One Owner 2010 Subaru Forester 2.5X; 34,885 or...",45.8817,-95.382
6,subaru,forester,2016,115000,15650.0,salvage,blue,4wd,excellent,2016 Subaru Forester in great shape. Bought ...,46.7765,-92.1301
7,Subaru,Outback,2019,58464,23499.0,clean,blue,4wd,excellent,Selling my 2019 Subaru Outback 2.5i Premium. H...,44.972667,-93.427391
8,Subaru,Impreza,2017,210000,5995.0,clean,white,4wd,fair,"Hi there, we are selling our well loved 2017 S...",44.8574,-93.4376
