# Zenrows Scraping API EDA

## Imports

In [4]:
import requests
from pprint import pprint
import polars
import json
import copy
import logging

## Functions

In [8]:
def load_json_data(json_file_path):
    try:
        # Open and read the JSON file
        with open(json_file_path, 'r') as file:
            parsed_json_data = json.load(file)

        return parsed_json_data
    except UnicodeDecodeError as e: 
        logging.error(e)
        raise e
    except Exception as e:
        logging.error(e)
        raise e
    
valid_json_file_path = '../data/network_requests.json'
res = load_json_data(valid_json_file_path)
print(res)

# invalid_json_file_path = '../data/turo.com.har'
# res = load_json_data(invalid_json_file_path)
# print(res)



{'data_type': 'Turo Search Page Results', 'timestamp': '2024-10-13T23:48:54.662Z', 'entries': [{'request_method': 'POST', 'request_url': 'https://turo.com/api/v2/search', 'response_status': 200, 'response_status_text': '', 'response_content': {'banner': {'actionText': None, 'bannerDesign': {'animationLoopCount': None, 'animationURL': None, 'clickableURL': None, 'designVariant': None, 'resizeableIconDarkURL': 'https://resources.turo.com/resources/img/banner/thumbs-up-dark__H6f7aeec8335a018494e506131904a399__.jpg', 'resizeableIconURL': 'https://resources.turo.com/resources/img/banner/thumbs-up__H8e2ff43641228b6c6246c70974174b4a__.jpg', 'withBorder': True}, 'bannerName': 'CHECKOUT_MORE_THAN_25_HR', 'text': 'Donâ€™t stress: you can cancel your trip for free, up to 24 hours before it starts.', 'title': None}, 'dismissibleBanner': None, 'makesFilterCount': None, 'searchId': '9FyJbenr', 'searchLocation': {'appliedRadius': {'unit': 'MILES', 'value': 38.2182017975829}, 'country': 'US', 'isOpera

## Test request

In [42]:

url = 'https://turo.com/api/vehicle/daily_pricing?end=10%2F12%2F2023&start=08%2F01%2F2022&vehicleId=1711107'
apikey = '43adce71ec56b763f7ab9e710da66c81c448013b'
params = {
    'url': url,
    'apikey': apikey,
}
response = requests.get('https://api.zenrows.com/v1/', params=params)
print(response.text)

{"calendarCurrencyHeader":"All prices are in CAD","dailyPricingResponses":[{"custom":false,"date":"2022-08-01","localizedDayOfWeek":"Monday","localizedShortDayOfWeek":"Mon","price":68.00,"priceEditable":true,"priceWithCurrency":{"amount":68.00,"currencyCode":"CAD"},"source":"DEFAULT","wholeDayUnavailable":false},{"custom":false,"date":"2022-08-02","localizedDayOfWeek":"Tuesday","localizedShortDayOfWeek":"Tue","price":68.00,"priceEditable":true,"priceWithCurrency":{"amount":68.00,"currencyCode":"CAD"},"source":"DEFAULT","wholeDayUnavailable":false},{"custom":false,"date":"2022-08-03","localizedDayOfWeek":"Wednesday","localizedShortDayOfWeek":"Wed","price":68.00,"priceEditable":true,"priceWithCurrency":{"amount":68.00,"currencyCode":"CAD"},"source":"DEFAULT","wholeDayUnavailable":false},{"custom":false,"date":"2022-08-04","localizedDayOfWeek":"Thursday","localizedShortDayOfWeek":"Thu","price":68.00,"priceEditable":true,"priceWithCurrency":{"amount":68.00,"currencyCode":"CAD"},"source":"D

In [43]:
with open("../data/daily_pricing.json", "w") as fp:
    json.dump(response.json(),fp)
# pprint(response.json())

In [44]:
daily_prcing_responses = response.json()['dailyPricingResponses']

for idx in range(len(daily_prcing_responses)):
    _date = daily_prcing_responses[idx]['date']
    _price = daily_prcing_responses[idx]['price']
    _wholeDayUnavailable = daily_prcing_responses[idx]['wholeDayUnavailable']
    _source =  daily_prcing_responses[idx]['source']
    
    
    print(f"{idx} - On date {_date}, the price was set by {_source} at {_price} & the car was {'Booked' if _wholeDayUnavailable else 'Available'}")
    

0 - On date 2022-08-01, the price was set by DEFAULT at 68.0 & the car was Available
1 - On date 2022-08-02, the price was set by DEFAULT at 68.0 & the car was Available
2 - On date 2022-08-03, the price was set by DEFAULT at 68.0 & the car was Available
3 - On date 2022-08-04, the price was set by DEFAULT at 68.0 & the car was Available
4 - On date 2022-08-05, the price was set by DEFAULT at 68.0 & the car was Available
5 - On date 2022-08-06, the price was set by DEFAULT at 68.0 & the car was Available
6 - On date 2022-08-07, the price was set by DEFAULT at 68.0 & the car was Available
7 - On date 2022-08-08, the price was set by DEFAULT at 68.0 & the car was Available
8 - On date 2022-08-09, the price was set by DEFAULT at 68.0 & the car was Available
9 - On date 2022-08-10, the price was set by DEFAULT at 68.0 & the car was Available
10 - On date 2022-08-11, the price was set by DEFAULT at 68.0 & the car was Available
11 - On date 2022-08-12, the price was set by DEFAULT at 68.0 & 

## Parse Turo search page results

### Load JSON data from a turo search page scraping

In [37]:
# Specify the path to your JSON file
json_file_path = '../data/network_requests.json'

# Open and read the JSON file
with open(json_file_path, 'r') as file:
    turo_search_data = json.load(file)

In [38]:
turo_vehicle_search_item = turo_search_data['entries'][0]['response_content']['vehicles'][0]
pprint(turo_vehicle_search_item)

{'availability': None,
 'avgDailyPrice': {'amount': 61.2, 'currency': 'USD'},
 'completedTrips': 14,
 'estimatedQuote': None,
 'hostId': 42801905,
 'id': 2706225,
 'images': [{'originalImageUrl': 'https://images.turo.com/media/vehicle/images/Ve-3pJyESBKK7QKKW-O3Og.jpg',
             'resizeableUrlTemplate': 'https://images.turo.com/media/vehicle/images/Ve-3pJyESBKK7QKKW-O3Og.{width}x{height}.jpg'}],
 'isAllStarHost': False,
 'isFavoritedBySearcher': False,
 'isNewListing': False,
 'location': {'city': 'Lackawanna',
              'country': 'US',
              'distance': {'unit': 'MILES', 'value': 32.07219451118512},
              'homeLocation': {'lat': 42.82337016041945,
                               'lng': -78.8265809482411},
              'isDelivery': False,
              'locationId': None,
              'locationSlugs': {'en_AU': 'lackawanna-ny',
                                'en_CA': 'lackawanna-ny',
                                'en_GB': 'lackawanna-ny',
                 

### Import Pydantic model

In [36]:
## imported model from ../etl/model/search_page_models.py

from typing import Any, List, Optional
from pydantic import BaseModel, ValidationError


class AvgDailyPrice(BaseModel):
    amount: float
    currency: str


class Image(BaseModel):
    originalImageUrl: str
    resizeableUrlTemplate: str


class Distance(BaseModel):
    unit: str
    value: float


class HomeLocation(BaseModel):
    lat: float
    lng: float


class LocationSlugs(BaseModel):
    fr_CA: str
    en_GB: str
    en_CA: str
    en_US: str
    en_AU: str
    fr_FR: str


class Location(BaseModel):
    city: str
    country: str
    distance: Distance
    homeLocation: HomeLocation
    isDelivery: bool
    locationId: Optional[Any]  # Adjust type if more information is available
    locationSlugs: LocationSlugs
    state: str


class Tag(BaseModel):
    label: str
    type: str


class VehicleSearchModel(BaseModel):
    availability: Optional[Any]  # Adjust type if more information is available
    avgDailyPrice: AvgDailyPrice
    completedTrips: int
    estimatedQuote: Optional[Any]  # Adjust type if more information is available
    hostId: int
    id: int
    images: List[Image]
    isAllStarHost: bool
    isFavoritedBySearcher: bool
    isNewListing: bool
    location: Location
    make: str
    model: str
    rating: float
    seoCategory: str
    tags: List[Tag]
    type: str
    year: int


### Parse data using pydantic

In [41]:
try:
    parsed_data = VehicleSearchModel(**turo_vehicle_search_item)
    pprint(parsed_data)
except ValidationError as e:
    print(f"Validation error: {e}")

VehicleSearchModel(availability=None, avgDailyPrice=AvgDailyPrice(amount=61.2, currency='USD'), completedTrips=14, estimatedQuote=None, hostId=42801905, id=2706225, images=[Image(originalImageUrl='https://images.turo.com/media/vehicle/images/Ve-3pJyESBKK7QKKW-O3Og.jpg', resizeableUrlTemplate='https://images.turo.com/media/vehicle/images/Ve-3pJyESBKK7QKKW-O3Og.{width}x{height}.jpg')], isAllStarHost=False, isFavoritedBySearcher=False, isNewListing=False, location=Location(city='Lackawanna', country='US', distance=Distance(unit='MILES', value=32.07219451118512), homeLocation=HomeLocation(lat=42.82337016041945, lng=-78.8265809482411), isDelivery=False, locationId=None, locationSlugs=LocationSlugs(fr_CA='lackawanna-ny', en_GB='lackawanna-ny', en_CA='lackawanna-ny', en_US='lackawanna-ny', en_AU='lackawanna-ny', fr_FR='lackawanna-ny'), state='NY'), make='Ford', model='Explorer', rating=4.9167, seoCategory='SUV', tags=[], type='SUV', year=2021)


In [42]:
## mess up the data and see the validation error
turo_vehicle_search_item_invalid = copy.copy(turo_vehicle_search_item)
turo_vehicle_search_item_invalid['avgDailyPrice']='20'
try:
    parsed_data = VehicleSearchModel(**turo_vehicle_search_item_invalid)
    pprint(parsed_data)
except ValidationError as e:
    print(f"Validation error: {e}")

Validation error: 1 validation error for VehicleSearchModel
avgDailyPrice
  Input should be a valid dictionary or instance of AvgDailyPrice [type=model_type, input_value='20', input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/model_type


## Automting extraction flow for daily_pricing

In [12]:
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect('../data/turo.sqlite')

# Create a cursor object using the cursor() method
cursor = conn.cursor()

print("Connected to the database successfully")

Connected to the database successfully


In [16]:
query = """
SELECT v.id,
    v.listing_created_date,
    v.listing_active
FROM vehicles as v
WHERE listing_created_date is NULL;
"""

cursor.execute(query)
results = cursor.fetchall()

# Convert the results to a Polars DataFrame
df = polars.DataFrame(results, schema=["id", "listing_created_date", "listing_active"])

# Print the DataFrame
df.head()


  df = polars.DataFrame(results, schema=["id", "listing_created_date", "listing_active"])


id,listing_created_date,listing_active
i64,null,null
514340,,
597559,,
1109057,,
1120970,,
1159994,,


In [19]:
vehicleId = 514340
url = f'https://turo.com/api/vehicle/detail?vehicleId={vehicleId}'
apikey = '43adce71ec56b763f7ab9e710da66c81c448013b'
params = {
    'url': url,
    'apikey': apikey,
}
response = requests.get('https://api.zenrows.com/v1/', params=params)


{"airportLocations":[],"badges":[{"id":5,"label":"AUX input","value":"aux_input"},{"id":27,"label":"Backup camera","value":"backup_camera"},{"id":17,"label":"Bluetooth","value":"bluetooth"},{"id":4,"label":"GPS","value":"gps"},{"id":23,"label":"Heated seats","value":"heated_seats"},{"id":16,"label":"Long-term car","value":"long_term_rental"},{"id":22,"label":"Snow tires or chains","value":"snow_tires"},{"id":24,"label":"Sunroof","value":"sunroof"},{"id":21,"label":"Toll pass","value":"toll_pass"},{"id":30,"label":"USB charger","value":"usb_charger"},{"id":20,"label":"USB input","value":"usb_input"}],"basicCarDetails":{"averageFuelEconomy":null,"averageFuelEconomyWithLabel":null,"cityFuelEconomy":null,"fuelGrade":null,"fuelType":{"label":"Electric","value":"ELECTRIC"},"fuelTypeAndGradeLabel":"Electric","fuelUnit":"LITERS_PER_100_KILOMETERS","fuelUnitLabel":"L/100km","highwayFuelEconomy":null,"numberOfDoors":2,"numberOfDoorsLabel":"2 doors","numberOfSeats":2,"numberOfSeatsLabel":"2 seats

In [25]:
import json

# Assuming 'response.json()' is the dictionary you want to dump
data = response.json()

# Serialize the dictionary to a JSON formatted string
json_str = json.dumps(data)

# Calculate the size of the JSON string in bytes
size_in_bytes = len(json_str.encode('utf-8'))

print(f"Size of JSON file in bytes: {size_in_bytes}")

Size of JSON file in bytes: 18771


In [35]:
import time
import functools

def execution_time_decorator(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()  # Record start time
        result = func(*args, **kwargs)  # Call the actual function
        end_time = time.time()  # Record end time
        execution_time = end_time - start_time  # Calculate elapsed time
        print(f"Execution time of {func.__name__}: {execution_time:.4f} seconds. Args were {args} and kwargs were {kwargs}")
        return result
    return wrapper


In [36]:
@execution_time_decorator
def get_vehicle_details(vehicleId:int):
    url = f'https://turo.com/api/vehicle/detail?vehicleId={vehicleId}'
    apikey = '43adce71ec56b763f7ab9e710da66c81c448013b'
    params = {
        'url': url,
        'apikey': apikey,
    }
    response = requests.get('https://api.zenrows.com/v1/', params=params)
    return response.json()

In [37]:
vehicleId = 514340
payload = get_vehicle_details(vehicleId=vehicleId)


Execution time of get_vehicle_details: 4.4794 seconds. Args were () and kwargs were {'vehicleId': 514340}


In [38]:
cache = {}

for vehicleId in df['id']:

    cache[vehicleId] = get_vehicle_details(vehicleId=vehicleId)


Execution time of get_vehicle_details: 3.6355 seconds. Args were () and kwargs were {'vehicleId': 514340}
Execution time of get_vehicle_details: 11.2692 seconds. Args were () and kwargs were {'vehicleId': 597559}
Execution time of get_vehicle_details: 5.1392 seconds. Args were () and kwargs were {'vehicleId': 1109057}
Execution time of get_vehicle_details: 3.0787 seconds. Args were () and kwargs were {'vehicleId': 1120970}
Execution time of get_vehicle_details: 3.2698 seconds. Args were () and kwargs were {'vehicleId': 1159994}
Execution time of get_vehicle_details: 8.3845 seconds. Args were () and kwargs were {'vehicleId': 1174770}
Execution time of get_vehicle_details: 4.2375 seconds. Args were () and kwargs were {'vehicleId': 1245097}
Execution time of get_vehicle_details: 12.2806 seconds. Args were () and kwargs were {'vehicleId': 1543514}
Execution time of get_vehicle_details: 1.8394 seconds. Args were () and kwargs were {'vehicleId': 1546798}
Execution time of get_vehicle_details

In [41]:
cache[2808983]

{'airportLocations': [],
 'badges': [{'id': 26, 'label': 'Android Auto', 'value': 'android_auto'},
  {'id': 25, 'label': 'Apple CarPlay', 'value': 'apple_carplay'},
  {'id': 27, 'label': 'Backup camera', 'value': 'backup_camera'},
  {'id': 17, 'label': 'Bluetooth', 'value': 'bluetooth'},
  {'id': 4, 'label': 'GPS', 'value': 'gps'},
  {'id': 23, 'label': 'Heated seats', 'value': 'heated_seats'},
  {'id': 29, 'label': 'Keyless entry', 'value': 'keyless_entry'},
  {'id': 24, 'label': 'Sunroof', 'value': 'sunroof'},
  {'id': 30, 'label': 'USB charger', 'value': 'usb_charger'},
  {'id': 20, 'label': 'USB input', 'value': 'usb_input'}],
 'basicCarDetails': {'averageFuelEconomy': None,
  'averageFuelEconomyWithLabel': None,
  'cityFuelEconomy': None,
  'fuelGrade': {'label': 'Regular', 'value': 'REGULAR'},
  'fuelType': {'label': 'Gas', 'value': 'GASOLINE'},
  'fuelTypeAndGradeLabel': 'Gas (Regular)',
  'fuelUnit': 'LITERS_PER_100_KILOMETERS',
  'fuelUnitLabel': 'L/100km',
  'highwayFuelEcono

In [42]:
with open("../data/vehicle_details.json", "w") as fp:
    json.dump(cache, fp)

## Automting extraction flow for daily_pricing

## DEBUG