In [10]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
import ssl
import pandas as pd
import time
from datetime import datetime, timedelta

# Define the base API endpoint
base_url = 'https://api.cepik.gov.pl/pojazdy'

# Define rate limiting variables
MAX_REQUESTS_PER_SECOND = 20
MAX_REQUESTS_PER_MINUTE = 100

# Track request counts and timestamps
request_count = 0
minute_start_time = datetime.now()
second_start_time = datetime.now()

# Create a session with custom SSL context
class SSLAdapter(requests.adapters.HTTPAdapter):
    def init_poolmanager(self, *args, **kwargs):
        context = ssl.create_default_context()
        context.set_ciphers("DEFAULT:@SECLEVEL=1")
        kwargs['ssl_context'] = context
        return super(SSLAdapter, self).init_poolmanager(*args, **kwargs)

# Create a session
session = requests.Session()
session.mount('https://', SSLAdapter())

# Function to fetch data with retries and handle rate limiting
def fetch_data_with_retries(url, params=None, retries=5):
    global request_count, minute_start_time, second_start_time
    
    # Check if we need to enforce rate limiting
    now = datetime.now()
    if (now - second_start_time).total_seconds() >= 1:
        second_start_time = now
        request_count = 0
    
    if (now - minute_start_time).total_seconds() >= 60:
        minute_start_time = now
        request_count = 0
    
    # Wait if we have reached the rate limits
    while request_count >= MAX_REQUESTS_PER_SECOND:
        time.sleep(0.1)  # Wait for 0.1 second
        now = datetime.now()
        if (now - second_start_time).total_seconds() >= 1:
            second_start_time = now
            request_count = 0

    while request_count >= MAX_REQUESTS_PER_MINUTE:
        time.sleep(0.1)  # Wait for 1 second
        now = datetime.now()
        if (now - minute_start_time).total_seconds() >= 60:
            minute_start_time = now
            request_count = 0

    # Perform the request
    for attempt in range(retries):
        response = session.get(url, params=params, headers={'accept': 'application/json'})
        request_count += 1
        
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            print(f"Failed to retrieve data: {response.status_code}")
            break
    
    return None

# Fetch vehicle IDs
first_page = 1
last_page = 2
vehicles_ids = []

for page in range(first_page, last_page + 1):
    params = {
        'wojewodztwo': '30',
        'data-od': '20190101',
        'data-do': '20191231',
        'page': f'{page}'
    }

    data = fetch_data_with_retries(base_url, params=params)

    if data:
        for item in data['data']:
            id = item['id']
            vehicles_ids.append(id)

# Fetch vehicle data and create DataFrame
df_list = []

for vehicle_id in vehicles_ids:
    url = f'{base_url}/{vehicle_id}'
    data = fetch_data_with_retries(url)

    if data:
        attributes = data['data']['attributes']
        df_list.append(pd.DataFrame([attributes]))
    else:
        print(f"Failed to retrieve data for vehicle ID {vehicle_id}")

# Concatenate all DataFrames in the list into a single DataFrame
if df_list:
    df = pd.concat(df_list, ignore_index=True)
    
else:
    print("No data was retrieved.")


ConnectionError: HTTPSConnectionPool(host='api.cepik.gov.pl', port=443): Max retries exceeded with url: /pojazdy?wojewodztwo=30&data-od=20190101&data-do=20191231&page=1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002204FEE2140>: Failed to resolve 'api.cepik.gov.pl' ([Errno 11001] getaddrinfo failed)"))

In [6]:
df.head(10)

Unnamed: 0,marka,kategoria-pojazdu,typ,model,wariant,wersja,rodzaj-pojazdu,podrodzaj-pojazdu,przeznaczenie-pojazdu,pochodzenie-pojazdu,...,data-wprowadzenia-danych,rejestracja-wojewodztwo,rejestracja-gmina,rejestracja-powiat,wlasciciel-wojewodztwo,wlasciciel-powiat,wlasciciel-gmina,wlasciciel-wojewodztwo-kod,wojewodztwo-kod,poziom-emisji-co2-paliwo-alternatywne-1
0,BMW,0,---,X1,---,---,SAMOCHÓD OSOBOWY,KOMBI,---,UŻYW. IMPORT INDYW,...,,WIELKOPOLSKIE,KĘPNO,KĘPIŃSKI,,,,,30,
1,WFM,11111111,---,M 06,---,---,MOTOCYKL,OSOBOWY,---,PONOWNA REJESTRACJA,...,,WIELKOPOLSKIE,SZAMOTUŁY,SZAMOTULSKI,,,,,30,
2,VOLVO,0,---,FH,---,---,CIĄGNIK SAMOCHODOWY,SIODŁOWY,---,UŻYW. ZAKUPIONY W KRAJU,...,,WIELKOPOLSKIE,POZNAŃ-STARE MIASTO,POZNAŃ,,,,,30,
3,HYUNDAI,11111111,---,I10,---,---,SAMOCHÓD OSOBOWY,WIELOZADANIOWY,---,UŻYW. IMPORT INDYW,...,,WIELKOPOLSKIE,ZŁOTÓW,ZŁOTOWSKI,,,,,30,
4,CADILLAC,13,01SL,CT6,AAAB,ABBAAAAAA,SAMOCHÓD OSOBOWY,KARETA (SEDAN),---,UŻYW. ZAKUPIONY W KRAJU,...,,WIELKOPOLSKIE,POZNAŃ-JEŻYCE,POZNAŃ,,,,,30,
5,OPEL,11111111,---,CORSA-C,---,---,SAMOCHÓD OSOBOWY,HATCHBACK,---,UŻYW. IMPORT INDYW,...,,WIELKOPOLSKIE,GRODZISK WIELKOPOLSKI,GRODZISKI,,,,,30,
6,MERCEDES-BENZ,11111111,---,E 200,---,---,SAMOCHÓD OSOBOWY,KARETA (SEDAN),---,UŻYW. ZAKUPIONY W KRAJU,...,,WIELKOPOLSKIE,SŁUPCA,SŁUPECKI,,,,,30,
7,OPEL,11111111,---,AGILA 1.2 KAT,---,---,SAMOCHÓD OSOBOWY,HATCHBACK,---,UŻYW. IMPORT INDYW,...,,WIELKOPOLSKIE,xxxxxxxxx,xxxxxxxxx,,,,,30,
8,MAN,11111111,---,TGM,---,---,SAMOCHÓD CIĘŻAROWY,FURGON/PODEST,UNIWERSALNY,UŻYW. ZAKUPIONY W KRAJU,...,,WIELKOPOLSKIE,POZNAŃ-JEŻYCE,POZNAŃ,,,,,30,
9,VOLKSWAGEN,14,5N,TIGUAN,ACDFHAX1,AD7AD7DL003N5R5NVR27CP1CSB,SAMOCHÓD OSOBOWY,KOMBI,---,NOWY ZAKUPIONY W KRAJU,...,,WIELKOPOLSKIE,KONIN,KONIN,,,,,30,


In [8]:
df.shape

(200, 68)