In [None]:
import os
import re
import time

import pandas as pd
from sodapy import Socrata

#### Extract data from the Chicago Data Portal and write it to Parquet files

In [None]:
# Download the dataset page by page and store each page as its own parquet file.
#
# The cell is resumable: pages already present in `parquet_dir` are skipped, so
# re-running it after an interruption continues from the next missing offset.
# Transient failures are retried with exponential backoff; after
# `max_consecutive_failures` failures in a row without saving a page, the last
# error is re-raised instead of retrying forever.
base_query = "SELECT * ORDER BY start_time DESC"

limit = 100000

# Create a directory to store the parquet files if it does not exist
parquet_dir = 'parquet_files'
os.makedirs(parquet_dir, exist_ok=True)

# Only files named exactly data_<offset>.parquet count as completed pages;
# anything else in the folder (temp files, unrelated parquet files) is ignored.
page_file_pattern = re.compile(r'data_(\d+)\.parquet')


def extract_offset(filename):
    """Return the row offset encoded in a ``data_<offset>.parquet`` name, or 0 if the name does not match."""
    match = page_file_pattern.fullmatch(filename)
    return int(match.group(1)) if match else 0


max_consecutive_failures = 5
consecutive_failures = 0

while True:
    client = None
    try:
        client = Socrata("data.cityofchicago.org", None)

        # Resume after the highest page already on disk (recomputed on every
        # retry so pages saved before a failure are not downloaded again).
        existing_files = [f for f in os.listdir(parquet_dir) if page_file_pattern.fullmatch(f)]
        offsets = [extract_offset(f) for f in existing_files]
        offset = max(offsets) + limit if offsets else 0

        while True:
            query = f"{base_query} LIMIT {limit} OFFSET {offset}"

            results = client.get("fg6s-gzvg", query=query)
            if not results:
                # An empty page means the offset has moved past the last row.
                break

            print(f'Retrieved {offset} to {offset + limit} rows')

            # Convert the results to a DataFrame
            results_df = pd.DataFrame.from_records(results)

            # Write to a temporary name first and rename afterwards, so an
            # interrupted write never leaves a truncated file that the resume
            # logic above would mistake for a completed page.
            file_name = f"{parquet_dir}/data_{offset}.parquet"
            tmp_name = f"{file_name}.tmp"
            results_df.to_parquet(tmp_name, index=False)
            os.replace(tmp_name, file_name)

            # Progress was made, so earlier failures no longer count.
            consecutive_failures = 0
            offset += limit

        print("Data retrieval and saving complete.")
        break  # finished: leave the retry loop instead of starting over

    except Exception as e:
        print(f"An error occurred: {e}")
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            raise
        # Exponential backoff (capped at 60 s) before the next attempt.
        time.sleep(min(60, 2 ** consecutive_failures))

    finally:
        # Release the underlying HTTP session whether the attempt succeeded or not.
        if client is not None:
            client.close()
            

#### Filter to 2017 onward and take a random sample

In [None]:
import pandas as pd
from sodapy import Socrata
import os
import re

# Load every downloaded page, keep rows whose start_time is 2017 or later,
# and draw a reproducible random sample from them.
parquet_dir = 'parquet_files'

# Sorted so the concatenated row order (and therefore the seeded sample below)
# does not depend on the arbitrary order returned by os.listdir.
parquet_files = sorted(f for f in os.listdir(parquet_dir) if f.endswith('.parquet'))
if not parquet_files:
    raise FileNotFoundError(
        f"No parquet files found in '{parquet_dir}'; run the extraction cell first."
    )

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own 0-based index.
all_data = pd.concat(
    [pd.read_parquet(os.path.join(parquet_dir, file)) for file in parquet_files],
    ignore_index=True,
)

# NOTE(review): this compares start_time against the string '2017', which
# presumably relies on the column holding ISO-8601 text — confirm the dtype.
filtered_data = all_data[all_data['start_time'] >= '2017']

sample_size = 100000
random_seed = 42  # fixed so re-running the notebook yields the same sample

# Cap the sample at the number of available rows; DataFrame.sample raises
# when asked for more rows than exist (without replacement).
random_sample = filtered_data.sample(
    n=min(sample_size, len(filtered_data)),
    random_state=random_seed,
)


#### Write the random sample to CSV

In [None]:
# Persist the sampled rows as a CSV file; the DataFrame index is not written.
random_sample.to_csv(
    path_or_buf='rs_100000_2017_to_present.csv',
    index=False,
)