# How to work with large files

In [2]:
import os
# get the current working directory
cwd = os.getcwd()
# print(cwd)
# get a list of all entries (files and directories) in the specified folder
folder_path = cwd
folder_path = '.'
contents = os.listdir(folder_path)
# filter for only files (optional, if you only want files and not subdirectories)
files_only = [item for item in contents if os.path.isfile(os.path.join(folder_path, item))]
folder_only = [item for item in contents if os.path.isdir(os.path.join(folder_path, item))]
print("all contents (files and directories):")
for item in contents:
  print(item)
print("\nonly files:")
for file_name in files_only:
  print(file_name)
print("\nonly directories:")
for dir_name in folder_only:
  print(dir_name)

all contents (files and directories):


.euporie.jupyter-terminal.md.swp


discordo.md


tmp-env


euporie.jupyter-terminal.md


lue.ebook-reader.md


README.md





only files:


.euporie.jupyter-terminal.md.swp


discordo.md


euporie.jupyter-terminal.md


lue.ebook-reader.md


README.md





only directories:


tmp-env


## Using iterators for line-by-line reading

In [5]:
with open('README.md', 'r') as file:
  for line in file:
    print(line)

# README











## Reading in Chunks

In [6]:
def read_file_in_chunks(file_path, chunk_size=1024):
  with open(file_path, 'rb') as file:
    while True:
      chunk = file.read(chunk_size)
      if not chunk:
          break
      print(chunk)
read_file_in_chunks('README.md', chunk_size=64)

b'# README\n\n'


## Buffered File Reading

In [7]:
with open('README.md', 'rb', buffering=10*1024*1024) as file:
  for line in file:
    print(line)

b'# README\n'


b'\n'


## Memory-Mapped Files (mmap)

In [12]:
import mmap
with open('README.md', 'r') as file:
  with mmap.mmap(file.fileno(), length=0, access=mmap.ACCESS_READ) as mm:
    for line in mm:
      print(line.decode('utf-8'))

#


 


R


E


A


D


M


E














## Using Generators

In [14]:
def generate_lines(file_path):
  with open(file_path, 'r') as file:
    for line in file:
      yield line

for line in generate_lines('README.md'):
  print(line)

# README











## Processing Batches of Lines

In [15]:
def read_batches(file_path, batch_size=5):
  with open(file_path, 'r') as file:
    batch = []
    for line in file:
      batch.append(line.strip())
      if len(batch) == batch_size:
        yield batch
        batch = []
    if batch:
      yield batch

# example usage:
for batch in read_batches('README.md', batch_size=5):
  print(batch)  # Replace with your processing logic

['# README', '']


## Stream Processing

If data arrives continuously (e.g., logs or APIs), use stream processing.

In [1]:
# pip3 install requests
import requests

def stream_data(url):
  with requests.get(url, stream=True) as response:
    for line in response.iter_lines():
      print(line)
stream_data('https://raw.githubusercontent.com/igorlima/unapologetic-thoughts/407a240d0d22022670d7af33bc546257d1ac6e85/snippets/cli-and-tui/README.md')
stream_data('https://raw.githubusercontent.com/igorlima/unapologetic-thoughts/refs/heads/master/snippets/cli-and-tui/README.md')

b'# README'


b''


b'# README'


b''
