# Working with files in Python

https://realpython.com/working-with-files-in-python/

* `with open(...)` statement allows open files and close automatically

In [1]:
help(open)

Help on function open in module _io:

open(
    file,
    mode='r',
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
    closefd=True,
    opener=None
)
    Open file and return a stream.  Raise OSError upon failure.

    file is either a text or byte string giving the name (and the path
    if the file isn't in the current working directory) of the file to
    be opened or an integer file descriptor of the file to be
    wrapped. (If a file descriptor is given, it is closed when the
    returned I/O object is closed, unless closefd is set to False.)

    mode is an optional string that specifies the mode in which the file
    is opened. It defaults to 'r' which means open for reading in text
    mode.  Other common values are 'w' for writing (truncating the file if
    it already exists), 'x' for creating and writing to a new file, and
    'a' for appending (which on some Unix systems, means that all writes
    append to the end of the file regardless of the cur

    ========= ===============================================================
    Character Meaning
    --------- ---------------------------------------------------------------
    'r'       open for reading (default)
    'w'       open for writing, truncating the file first
    'x'       create a new file and open it for writing
    'a'       open for writing, appending to the end of the file if it exists
    'b'       binary mode
    't'       text mode (default)
    '+'       open a disk file for updating (reading and writing)
    ========= ===============================================================

In [2]:
with open("../datasets/data.txt") as file:
    data = file.read()

In [3]:
data.split()

['-lfmwmkl2mg',
 'ergmlñgmgmwñemglwerg',
 'werñlgwegmlñwerm',
 'werfpwlemglwmer',
 '}wer',
 'gwerfwer',
 'g}r}r}wer',
 'gwerfwergwer',
 'gwerfwewerfwerf',
 'wergwerdsvsddafwaefqwfgwer',
 'gwerfwerrgwerfwe',
 'fvdsds']

## Directory listing

* `os` module ...

In [4]:
import os

In [5]:
entries =os.listdir("../")  # list all files and subdirectories of the given path
entries, type(entries)

(['.git',
  '.gitignore',
  'certifications',
  'datasets',
  'images',
  'notebooks',
  'notes',
  'practice',
  'projects',
  'readme.md',
  'requirements.txt',
  'scripts'],
 list)

* `os.scandir` is an alternative to `os.listdir`

In [6]:
entries = os.scandir("../")  # returns an iterator pointing to the entries in the directory
entries, type(entries)

(<nt.ScandirIterator at 0x1dc01171d70>, nt.ScandirIterator)

In [7]:
# os.scandir suports the context manager protocol
with os.scandir("../") as entries:  
    for entry in entries: 
        print(entry.name)

.git
.gitignore
certifications
datasets
images
notebooks
notes
practice
projects
readme.md
requirements.txt
scripts


* Using `pathlib`

In [8]:
from pathlib import Path

In [9]:
entries = Path("../")
entries, type(entries)

(WindowsPath('..'), pathlib._local.WindowsPath)

In [10]:
for entry in entries.iterdir(): 
    print(entry, entry.name, type(entry))

..\.git .git <class 'pathlib._local.WindowsPath'>
..\.gitignore .gitignore <class 'pathlib._local.WindowsPath'>
..\certifications certifications <class 'pathlib._local.WindowsPath'>
..\datasets datasets <class 'pathlib._local.WindowsPath'>
..\images images <class 'pathlib._local.WindowsPath'>
..\notebooks notebooks <class 'pathlib._local.WindowsPath'>
..\notes notes <class 'pathlib._local.WindowsPath'>
..\practice practice <class 'pathlib._local.WindowsPath'>
..\projects projects <class 'pathlib._local.WindowsPath'>
..\readme.md readme.md <class 'pathlib._local.WindowsPath'>
..\requirements.txt requirements.txt <class 'pathlib._local.WindowsPath'>
..\scripts scripts <class 'pathlib._local.WindowsPath'>


* `pathlib.Path(...).iterdir()` doesn't support the context manager protocol

| Function                 | Description                                                                     |
|--------------------------|---------------------------------------------------------------------------------|
| `os.listdir()`           | Returns a list of all files and folders in a directory                         |
| `os.scandir()`           | Returns an iterator of all the objects in a directory including file attribute information |
| `pathlib.Path.iterdir()` | Returns an iterator of all the objects in a directory including file attribute information |


### Listing all **files** in a directory

In [11]:
basepath = "../"

* Using `os.listdir()`

In [12]:
for entry in os.listdir(basepath): 
    if os.path.isfile(os.path.join(basepath, entry)):
        # checks if the current basepath+entry es a file 
        print(entry)

.gitignore
readme.md
requirements.txt


* Using `os.scandir()`

In [13]:
with os.scandir(basepath) as entries: 
    for entry in entries: 
        if entry.is_file(): 
            print(entry.name)

.gitignore
readme.md
requirements.txt


* Using `pathlib.Path()`

In [14]:
basepath = Path("../")
files_in_basepath = basepath.iterdir()

In [15]:
for item in files_in_basepath: 
    if item.is_file(): 
        print(item.name)

.gitignore
readme.md
requirements.txt


## Listing subdirectories

* Using `os.listdir()` and `os.path()`

In [16]:
basepath = "../"

In [17]:
for entry in os.listdir(basepath):
    if os.path.isdir(os.path.join(basepath, entry)):
        print(entry)

.git
certifications
datasets
images
notebooks
notes
practice
projects
scripts


In [18]:
with os.scandir(basepath) as entries: 
    for entry in entries: 
        if entry.is_dir(): print(entry.name)

.git
certifications
datasets
images
notebooks
notes
practice
projects
scripts


* Using `pathlib.Path().scandir()`

In [19]:
basepath = Path("../")
entries = basepath.iterdir()

for entry in entries: 
    if entry.is_dir(): print(entry.name)

.git
certifications
datasets
images
notebooks
notes
practice
projects
scripts


## Getting file atributes

* We can get information of a file, such as size and modified times easy. 

In [20]:
with os.scandir("../") as dir_contents: 
    for entry in dir_contents: 
        info = entry.stat()
        #print(type(info))  # returns a os.stat_result
        print(entry.name, info.st_mtime)  # print the time of the files was last modified in seconds

.git 1757364826.4669695
.gitignore 1757452584.921516
certifications 1757265203.999761
datasets 1757446959.244487
images 1757369331.2478101
notebooks 1757452378.0220563
notes 1757357356.1709347
practice 1757364894.3521965
projects 1755886436.210866
readme.md 1755886491.9523795
requirements.txt 1755886751.089005
scripts 1757449113.376423


* Using `pathlib.Path().iterdir()`

In [21]:
current_dir = Path("../")
for path in current_dir.iterdir():
    info = path.stat()
    print(path.name, info.st_mtime)

.git 1757364826.4669695
.gitignore 1757452584.921516
certifications 1757265203.999761
datasets 1757446959.244487
images 1757369331.2478101
notebooks 1757452378.0220563
notes 1757357356.1709347
practice 1757364894.3521965
projects 1755886436.210866
readme.md 1755886491.9523795
requirements.txt 1755886751.089005
scripts 1757449113.376423


## Making directories

|Function | Description|
|---------|-------------|
|os.mkdir() | Creates a single subdirectory|
|pathlib.Path.mkdir() | Creates single or multiple directories|
|os.makedirs() | Creates multiple directories, including intermediate directories|

* Creating a single dir

In [24]:
#os.mkdir("example_dir")

In [25]:
"example_dir" in os.listdir(".")

True

In [26]:
#os.mkdir("example_dir")  # raises an FileExistingError

In [28]:
p = Path("second_example_dir")
p.mkdir(exist_ok=True)

In [29]:
p.name in os.listdir(".")

True

In [30]:
# p.mkdir()  # raises a FileExistingError

# We can catch the error as follows:
try: 
    p.mkdir()
except FileExistsError as exc: 
    print(exc)


[WinError 183] No se puede crear un archivo que ya existe: 'second_example_dir'


In [31]:
# Alternativelly: 
p = Path("dir")
p.mkdir(exist_ok=True)
p.name in os.listdir()

True

In [32]:
p.mkdir(exist_ok=True)

In [33]:
p = Path("some_directory")
for item in p.iterdir(): 
    print(item.name)

admin.py
data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt
sub_dir
tests.py


## Filename Pattern Matching

### String methods

* There are multiple string methods, the most useful when finding a string are `.endswith()`, `.startswith()`

In [34]:
for f_name in os.listdir("some_directory"): 
    if f_name.endswith('.txt'): 
        print(f_name)

data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt


In [35]:
for f_name in os.listdir("some_directory"): 
    if f_name.endswith('.py'): 
        print(f_name)

admin.py
tests.py


In [36]:
for f_name in os.listdir("some_directory"): 
    if f_name.startswith('data'): print(f_name)

data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt


### Using `fnmatch`

* `fnmatch` has advenced methods for pattern matching
* `fnmatch.fnmatch()` supports regex wildcards

In [37]:
import fnmatch

In [38]:
for file_name in os.listdir("some_directory"): 
    if fnmatch.fnmatch(file_name, "*.txt"): 
        print(file_name)

data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt


### Advanced pattern matching

In [39]:
# printing only backup data
for file_name in os.listdir("some_directory/"):
    if fnmatch.fnmatch(file_name, "data_*_backup.txt"): 
        print(file_name)

data_01_backup.txt
data_02_backup.txt
data_03_backup.txt


### Using glob

* `glob` python module allow us to work with glob UNIX patterns

In [40]:
import glob

In [41]:
notebooks = glob.glob("*.ipynb")
type(notebooks), notebooks 

(list,
 ['01-importing_data.ipynb',
  '02-importing_data.ipynb',
  'cleaning_data.ipynb',
  'data_manipulation_with_pandas.ipynb',
  'downloading_data.ipynb',
  'intro_seaborn.ipynb',
  'working_with_files.ipynb',
  'working_with_pd-dates.ipynb',
  'zip_files.ipynb'])

* `glob.glob()` returns a `list` with all paths matching the pattern

In [42]:
glob.glob("some_directory/*.py")

['some_directory\\admin.py', 'some_directory\\tests.py']

In [43]:
for name in glob.glob("*[0-9]*.txt", root_dir="some_directory"): 
    print(name)

data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt


* Searching patterns on subdirectories recursively

In [44]:
for name in glob.iglob("**/*.py", recursive=True, root_dir="some_directory"): 
    print(name)

admin.py
tests.py
sub_dir\file1.py
sub_dir\file2.py


* `iglob()` returns an iterator instad of a list given by `glob()`

#### Using `pathlib.Path().glob`

In [45]:
p = Path("./some_directory/")

In [46]:
for item in p.iterdir(): print(item.name)

admin.py
data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt
sub_dir
tests.py


* `pathlib.Path().glob()` returns a generator that points to all files in the curr dir

In [47]:
for name in p.glob("*.p*"):
    print(name)

some_directory\admin.py
some_directory\tests.py


## Traversing directories and processing files

* A common programming task is walking a directory tree and processing files in the tree. 
* We use `os.walk()`

In [48]:
for dirpath, dirnames, files in os.walk("./some_directory/"):
    print(f'Found dir: {dirpath}')
    for file_name in files: 
        print(file_name)

Found dir: ./some_directory/
admin.py
data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt
tests.py
Found dir: ./some_directory/sub_dir
file1.py
file2.py


* `os.walk()` returns three values on each iteration of the loop:

    1. The name of the current folder

    2. A list of folders in the current folder

    3. A list of files in the current folder

## Temporary files and directories

* we can create temp files used when the program is running

In [49]:
from tempfile import TemporaryFile, NamedTemporaryFile, TemporaryDirectory

In [50]:
fp = TemporaryFile('w+t')

fp.write('Hello universe!\n')
fp.write('This is a new line.\n')
fp.seek(0)
data = fp.read()
fp.close()

In [51]:
print(data, type(data))

Hello universe!
This is a new line.
 <class 'str'>


* `tempfile` objects are contexts

In [52]:
with TemporaryFile('w+t') as fp:
    fp.write('Hello universe!\n')
    fp.write('holaholahloa')
    fp.seek(0)
    print(fp.name)
    print(fp.read())
# File is now closed and removed

C:\Users\isaul\AppData\Local\Temp\tmp0r9hjkjg
Hello universe!
holaholahloa


In [53]:
with NamedTemporaryFile(mode='w', delete=True) as tf: 
    tf.write('Test message\n')
    tf.write('Test message\n')
    tf.write('Test message\n')
    tf.write('Test message\n')

    temp_file_name = tf.name
    print(f"Temporary file created at: {temp_file_name}")
    # with open(temp_file_name, 'r') as f_read: 
    #     content = f_read.read()
    #     print('Content of the temp file:\n', content)


Temporary file created at: C:\Users\isaul\AppData\Local\Temp\tmp780wdni7


* Temp dirs

In [54]:
with TemporaryDirectory() as tmpdir: 
    print('created temp dir: ', tmpdir)
    print(os.path.exists(tmpdir))

created temp dir:  C:\Users\isaul\AppData\Local\Temp\tmp5i14pmg5
True


## Deleting files and directories

### Deleting files

In [58]:
for item in p.iterdir(): 
    print(item.name)

admin.py
data.txt
data_01.txt
data_01_backup.txt
data_02.txt
data_02_backup.txt
data_03.txt
data_03_backup.txt
sub_dir
tests.py


In [60]:
data_file = p.joinpath("data.txt")
print(data_file)

some_directory\data.txt


In [61]:
os.remove(data_file)

In [62]:
data_file in p.iterdir()

False

* File doesn't exist anymore

In [63]:
try: 
    os.remove(data_file)
except Exception as e: 
    print(type(e), e)

<class 'FileNotFoundError'> [WinError 2] El sistema no puede encontrar el archivo especificado: 'some_directory\\data.txt'


* what happens if the file is a directory?

In [64]:
dir_path = p.joinpath("sub_dir")
print(dir_path)

some_directory\sub_dir


In [65]:
try: 
    os.remove(dir_path)
except Exception as e: 
    print(type(e), e)

<class 'PermissionError'> [WinError 5] Acceso denegado: 'some_directory\\sub_dir'


In [68]:
os.path.isfile(p.joinpath("admin.py")), os.path.isfile(dir_path)

(True, False)

### Deleting directories

* we can use: 
    * `os.rmdir()`
    * `pathlib.Path().rmdir()`
    * `shutil.rmtree()`

In [71]:
p

WindowsPath('some_directory')

In [79]:
empty_dir = p.joinpath("empty")
empty_dir.mkdir(exist_ok=True)

In [81]:
empty_dir in p.iterdir()

True

In [82]:
empty_dir.rmdir()

In [83]:
empty_dir in p.iterdir()

False

In [75]:
trash_dir = Path("some_directory/trash_dir")
trash_dir.mkdir(exist_ok=True)

In [76]:
trash_dir in p.iterdir()

True

In [77]:
for x in trash_dir.iterdir(): print(x)

some_directory\trash_dir\a.txt
some_directory\trash_dir\b.txt


* Non-empty dirs cannot be deleted

In [78]:
try: 
    os.rmdir(trash_dir)
except Exception as e: 
    print(type(e), e)

<class 'OSError'> [WinError 145] El directorio no está vacío: 'some_directory\\trash_dir'


### Deleting entire directory trees

In [84]:
import shutil

In [85]:
trash_dir

WindowsPath('some_directory/trash_dir')

In [86]:
try: 
    shutil.rmtree(trash_dir)
except Exception as e: 
    print(type(e), e)

In [87]:
trash_dir in p.iterdir()

False

|Function | Description|
|---------|-------------|
|os.remove() | Deletes a file and does not delete directories|
|os.unlink()|Is identical to os.remove() and deletes a single file|
|pathlib.Path.unlink()|Deletes a file and cannot delete directories|
|os.rmdir()|Deletes an empty directory|
|pathlib.Path.rmdir()|Deletes an empty directory|
|shutil.rmtree()|Deletes entire directory tree and can be used to delete non-empty directories|

## Copying 

### Files

In [88]:
src = "dir/datitos.txt"
dst = "some_directory/subdir"

shutil.copy(src, dst)

'some_directory/subdir'

### Dirs

In [96]:
shutil.copytree("backup/", "some_directory/", dirs_exist_ok=True)

'some_directory/'

In [97]:
shutil.rmtree("backup/")

## Archiving - ZIP Files

In [98]:
import zipfile

In [100]:
zip_path = Path("some_directory/ziped.zip")
zip_path, zip_path.is_file()

(WindowsPath('some_directory/ziped.zip'), True)

In [110]:
with zipfile.ZipFile(zip_path, 'r') as zipobj: 
    print(zipobj.namelist())
    bar_info = zipobj.getinfo("sub_dir/bar.py")
    print(f"File Name: {bar_info.filename}")
    print(f"Uncompressed Size: {bar_info.file_size} bytes")
    print(f"Compressed Size: {bar_info.compress_size} bytes")
    print(f"Compression Method: {bar_info.compress_type}")


['sub_dir/', 'sub_dir/bar.py', 'sub_dir/foo.py', 'file1.py', 'file2.py', 'file3.py']
File Name: sub_dir/bar.py
Uncompressed Size: 9 bytes
Compressed Size: 9 bytes
Compression Method: 0


In [112]:
with zipfile.ZipFile(zip_path) as my_zip: 
    for member in my_zip.infolist(): 
        print(f'Member: {member.filename}, Size: {member.file_size}')

Member: sub_dir/, Size: 0
Member: sub_dir/bar.py, Size: 9
Member: sub_dir/foo.py, Size: 9
Member: file1.py, Size: 11
Member: file2.py, Size: 11
Member: file3.py, Size: 11


### Extracting ZIP Archives

In [113]:
data_zip = zipfile.ZipFile(zip_path, 'r')

In [116]:
data_zip.extract('file1.py', path="some_directory/unziped/")

'some_directory\\unziped\\file1.py'

In [117]:
os.remove("some_directory/unziped/file1.py")

In [118]:
data_zip.extractall(path="some_directory/unziped/")

In [119]:
data_zip.close()

* Sometimes zipfiles are protected, we need to specify the password when extracting

```Python
with zipfile.ZipFile('secret.zip', 'r') as pwd_zip:
    # Extract from a password protected archive
    pwd_zip.extractall(path='extract_dir', pwd='Quish3@o')
```

## Creating new ZIP files

In [121]:
lst = [x for x in p.iterdir()]
lst;

In [124]:
with zipfile.ZipFile("../datasets/new_zip.zip", 'w') as new_zip: 
    for name in lst: 
        new_zip.write(name)