<a href="https://colab.research.google.com/github/hunterad93/projects/blob/main/carbitrage_page_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
def parse_miles(description):
    """Extract an odometer reading from a free-text listing description.

    The description is tokenized with the module-level spaCy pipeline
    ``nlp``. The first token equal to "miles", "mile" or "mileage"
    (case-insensitive) that is directly preceded by a number-like token
    whose text parses as an integer determines the result.

    Args:
        description: Listing description text.

    Returns:
        The mileage as an ``int`` (thousands separators removed), or
        ``None`` if no usable "<number> miles" pair is found.
    """
    doc = nlp(description)
    for token in doc:
        if token.text.lower() not in ("miles", "mile", "mileage"):
            continue
        if token.i == 0:
            # The keyword opens the text, so there is no previous token;
            # Token.nbor(-1) would raise IndexError here.
            continue
        prev_token = token.nbor(-1)
        if not prev_token.like_num:
            continue
        try:
            return int(prev_token.text.replace(',', ''))
        except ValueError:
            # like_num is also true for spelled-out numbers and decimals
            # ("ten", "1.5"), which int() cannot parse. Keep scanning for
            # a later, usable match instead of aborting the caller.
            continue
    return None



def parse_name(name):
    """Split a listing title into make, model and model year.

    The year is the first standalone four-digit number starting with 19
    or 20. It is removed from the title before the make/model search so
    that a year sitting between or before the two words is never
    reported as the model (e.g. "Subaru 2014 Forester").

    Args:
        name: Listing title, e.g. "2016 Subaru Forester Touring".

    Returns:
        A ``(make, model, year)`` tuple of strings. ``year`` is ``None``
        when no plausible year is present; ``make`` and ``model`` are
        both ``None`` when no "<word> <word>" pair is found.
    """
    # Imported locally because the file's import cell does not import re.
    import re

    # Restrict to 19xx/20xx so trim levels such as "1500" are not taken
    # for the model year.
    year_pattern = r'\b(?:19|20)\d{2}\b'
    make_model_pattern = r'\b([A-Za-z]+)\s+([A-Za-z0-9-]+)'

    year_match = re.search(year_pattern, name)
    if year_match:
        year = year_match.group(0)
        # Replace the year with a space so the neighbouring words stay
        # separated for the make/model pattern.
        remainder = name[:year_match.start()] + ' ' + name[year_match.end():]
    else:
        year = None
        remainder = name

    make_model_match = re.search(make_model_pattern, remainder)
    make, model = make_model_match.groups() if make_model_match else (None, None)

    return make, model, year



In [31]:
#!pip install beautifulsoup4 requests pandas spacy
#!python -m spacy download en_core_web_sm

import json
import re  # needed by parse_name, which is defined in an earlier cell

import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup

# spaCy pipeline used by parse_miles to tokenize listing descriptions.
nlp = spacy.load("en_core_web_sm")


# Craigslist listing pages to scrape.
url_list = [
    'https://minneapolis.craigslist.org/hnp/cto/d/saint-michael-1998-subaru-forester/7626667598.html',
    'https://minneapolis.craigslist.org/ank/cto/d/champlin-subaru-forester-as-is/7624015627.html',
    'https://minneapolis.craigslist.org/wsh/cto/d/saint-paul-2016-subaru-forester-25i/7621121149.html',
    'https://minneapolis.craigslist.org/ank/cto/d/minneapolis-subaru-forester-2014/7620355994.html',
    'https://siouxcity.craigslist.org/cto/d/spirit-lake-2016-subaru-forester-touring/7629107422.html',
    'https://stcloud.craigslist.org/cto/d/alexandria-2010-subaru-forester-25x/7628737384.html',
    'https://duluth.craigslist.org/cto/d/duluth-2016-subaru-forester/7627821456.html',
    'https://minneapolis.craigslist.org/hnp/cto/d/hopkins-2019-subaru-outback-25i-premium/7629205310.html',
    'https://minneapolis.craigslist.org/hnp/cto/d/eden-prairie-2017-subaru-impreza/7629166784.html',
    'https://minneapolis.craigslist.org/hnp/cto/d/minneapolis-2013-subaru-forester-ll/7620709154.html'
]

# Column order of the resulting DataFrame.
columns = ['Make', 'Model', 'Year', 'Miles', 'Price', 'Latitude', 'Longitude', 'Name', 'Description']

# Rows are collected in a plain list and turned into a DataFrame once at
# the end; calling pd.concat inside the loop copies the whole frame on
# every iteration.
rows = []

for url in url_list:
    try:
        # A timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        # The listing's structured data is read from the JSON inside this
        # script tag.
        script_tag = soup.find('script', {'id': 'ld_posting_data'})

        if script_tag is None:
            print(f"Skipping URL {url} - script_tag not found")
            continue

        json_data = json.loads(script_tag.string)

        # Extract relevant fields
        name = json_data['name']
        make, model, year = parse_name(name)
        description = json_data['description']
        miles = parse_miles(description)
        offers = json_data['offers']
        geo = offers['availableAtOrFrom']['geo']

        rows.append({
            'Name': name,
            'Make': make,
            'Model': model,
            'Year': year,
            'Miles': miles,
            'Description': description,
            'Price': offers['price'],
            'Latitude': geo['latitude'],
            'Longitude': geo['longitude'],
        })

    except (
        AttributeError,
        IndexError,
        KeyError,
        TypeError,
        ValueError,  # includes json.JSONDecodeError
        requests.RequestException,
    ) as e:
        # A single malformed or unreachable listing must not abort the
        # remaining URLs.
        print(f"Skipping URL {url} - Error: {e}")

df = pd.DataFrame(rows, columns=columns)

# Print the DataFrame
df


Skipping URL https://minneapolis.craigslist.org/hnp/cto/d/minneapolis-2013-subaru-forester-ll/7620709154.html - script_tag not found


Unnamed: 0,Make,Model,Year,Miles,Price,Latitude,Longitude,Name,Description
0,Subaru,Forester,1998,199999.0,3000.0,45.2064,-93.6593,1998 Subaru Forester,"Rust free, new tires, awd Ac works Great. 19..."
1,Subaru,Forester,2006,,6000.0,45.204562,-93.37755,Subaru Forester 2006 - AS IS,Selling AS IS: cosmetic damage as seen in the ...
2,Subaru,Forester,2016,,17000.0,44.9913,-92.9487,2016 Subaru Forester 2.5I Premium,Rare find. Just arrived from Arizona - no rus...
3,Subaru,Forester,2014,,10900.0,45.1697,-93.1889,Subaru Forester 2014,CLEAN TITLE Clean Inside&amp;Out Highway mi...
4,Subaru,Forester,2016,75800.0,16500.0,43.4257,-95.1057,2016 Subaru Forester Touring,"Subaru from Prescott, AZ with 75,800 miles. It..."
5,Subaru,Forester,2010,,13900.0,45.8817,-95.382,"2010 Subaru Forester 2.5X; 34,885 miles","One Owner 2010 Subaru Forester 2.5X; 34,885 or..."
6,Subaru,Forester,2016,,15650.0,46.7765,-92.1301,2016 Subaru Forester,2016 Subaru Forester in great shape. Bought ...
7,Subaru,Outback,2019,,23499.0,44.972667,-93.427391,2019 Subaru Outback 2.5i Premium,Selling my 2019 Subaru Outback 2.5i Premium. H...
8,Subaru,Impreza,2017,210000.0,5995.0,44.8574,-93.4376,2017 Subaru Impreza Hatchback AWD 5 speed manual,"Hi there, we are selling our well loved 2017 S..."
