## Files and IO


In [1]:
# Reading and writing text data
# Read the whole file into a single string. The encoding is given explicitly
# so the result does not depend on the platform's default text encoding.
with open('assets/search_text.txt', 'r', encoding='utf-8') as file:
    data = file.read()

# Only the read needs the open handle; the file is already closed here.
print(data)

first line
second line
third line
fourth line
fifth line
python is in line #6
other line



In [12]:
# Reading line by line:
# readlines() returns a list with one string per line, each keeping its '\n'.
# Bytes that are not valid UTF-8 are replaced instead of raising an error.
with open('assets/search_text.txt', mode='r', encoding='utf-8', errors='replace') as file:
    data = file.readlines()

print(data)
# Number the lines starting at 1; end='' avoids doubling the newline that
# every line already carries.
for line_number, line in enumerate(data, start=1):
    print(f"{line_number}: {line}", end='')

['first line\n', 'second line\n', 'third line\n', 'fourth line\n', 'fifth line\n', 'python is in line #6\n', 'other line\n']
1: first line
2: second line
3: third line
4: fourth line
5: fifth line
6: python is in line #6
7: other line


In [13]:
# Write the print output into a file:
# print() sends its text to whatever writable file object is passed as `file`.
with open('assets/new_file', mode='w') as file:
    print('New line printed inside the file', file=file)

In [14]:
# Change default separator and end of line in print:
# Baseline first: by default the values are joined with a single space
# and the output ends with a newline.
print('ACME', 50, 91.5)

ACME 50 91.5


In [15]:
# `sep` replaces the space placed between the printed values
print('ACME', 50, 91.5,  sep = ';')

ACME;50;91.5


In [16]:
# `end` replaces the newline appended after the last value
print('ACME', 50, 91.5, end = '!!!')

ACME 50 91.5!!!

In [17]:
# Performing IO operations on strings:
# Useful, for example, when code expects file-like objects (documents)
# instead of plain strings; wrapping the text in a StringIO makes that conversion.

import io

In [23]:
# In-memory text buffer that can be written to like a text file
s = io.StringIO()
# write() returns the number of characters written
s.write('Hello World\n')

12

In [24]:
# print() can target the buffer too; it appends its usual trailing newline
print('This is a test', file = s)

In [26]:
# Return everything written to the buffer so far as one string
s.getvalue()

'Hello World\nThis is a test\n'

In [36]:
# Reading and writing compressed files:
# gzip compression:

import gzip

# The file is opened in binary mode, so the payload must be bytes, not str
with gzip.open('assets/file.gz', 'wb') as f:
    f.write(b'test line')


# bz2 compression works the same way; only the module name changes (bz2 instead of gzip)



In [37]:
# Read the raw bytes back, then decode them to text once the file is closed
with gzip.open('assets/file.gz', 'rb') as f:
    data = f.read()

data_decoded = data.decode('utf-8')
print(data_decoded)

test line


In [1]:
# Manipulating Pathnames
# Get last component of a path:

import os
# Example relative path reused by the following cells
path = 'my/path/to/file.txt'

# basename() keeps only what follows the last path separator
print(os.path.basename(path))

file.txt


In [2]:
# Get directory name
# dirname() returns everything before the last path separator
os.path.dirname(path)

'my/path/to'

In [4]:
# Join path components:
other_path = 'home/nacho'

# join() inserts the platform's separator between the components (a backslash
# on Windows) and leaves separators already inside each component untouched
print(os.path.join(other_path, path))

home/nacho\my/path/to/file.txt


In [5]:
# Expand a leading '~' into the current user's home directory
new_path = '~/my/new/path/to/file.txt'
expanded_new_path = os.path.expanduser(new_path)
print(expanded_new_path)

C:\Users\nacho/my/new/path/to/file.txt


In [6]:
# Split file extension:
# splitext() returns (root, extension); the extension keeps its leading dot
path_with_no_extension, extension = os.path.splitext(expanded_new_path)

In [7]:
# Show both parts returned by splitext()
print(path_with_no_extension, extension)

C:\Users\nacho/my/new/path/to/file .txt


In [8]:
# Check for the existence of a file/directory
# isfile() is True only when the path exists and is a regular file

print(os.path.isfile(os.path.join('assets', 'search_text.txt')))

True


In [9]:
# isdir() is True only when the path exists and is a directory
print(os.path.isdir('assets'))

True


In [11]:
# exists() is True for any existing path, whether it is a file or a directory
print(os.path.exists(os.path.join('assets', 'search_text.txt')))

True


In [12]:
# Check whether a path is a symbolic link.
# os.path functions do not expand '~' on their own, and '\A' / '\p' inside a
# normal string literal are invalid escape sequences, so the path is built
# from its components and expanded explicitly.
link_candidate = os.path.expanduser(os.path.join('~', 'Anaconda3', 'python3'))
print(os.path.islink(link_candidate))

# If it is a link, realpath() resolves it to the target path;
# otherwise the canonical form of the same path is returned.
print(os.path.realpath(link_candidate))

False


In [15]:
# Get all the files and sub-directories existing in a directory.
# The directory is listed once and the result reused for both filters.
# listdir() returns names in arbitrary order, so they are sorted to make the
# output deterministic.
names = sorted(os.listdir('assets'))
subdirs = [p for p in names if os.path.isdir(os.path.join('assets', p))]
files = [p for p in names if os.path.isfile(os.path.join('assets', p))]

print(subdirs, files)

[] ['file.gz', 'new_file', 'search_text.txt']


In [40]:
# getsize() returns the size, in bytes, of the file at the given path:
print(f"The size in bytes is: {os.path.getsize('assets/search_text.txt')} B")

The size in bytes is: 96 B


In [19]:
# Best way to use wildcards for file patterns:
# glob.glob() returns the list of paths matching the pattern
import glob
glob.glob('assets/*.txt')

['assets\\search_text.txt']

In [26]:
# Working with temporary files: the best option is to use the tempfile module
from tempfile import TemporaryFile, TemporaryDirectory

# This code creates a temporary file; it is deleted when the context exits
with TemporaryFile(mode='w+') as file:
    file.write('This is the temporary file content')
    # Rewind before reading: read() starts from the current position, which
    # is the end of the file right after the write
    file.seek(0)
    data = file.read()


In [29]:
# It is possible to use a NamedTemporaryFile with the argument delete = False.
# In that case the file is not really temporary: it is not removed automatically.
# The only thing to keep in mind is that it is saved in the temporary folder of
# the current OS. To check where that is:

import tempfile
from tempfile import NamedTemporaryFile

# Directory the tempfile module uses for temporary files on this machine
tempfile.gettempdir()

'C:\\Users\\nacho\\AppData\\Local\\Temp'

In [33]:
# To create a persistent "temporary" file in the OS temporary folder:

with NamedTemporaryFile(
    mode='w',
    encoding='utf-8',
    prefix='nachos_temp',
    suffix='.txt',
    delete=False,
) as file:
    # Remember the path so the file can be reopened after the context closes
    file_name = file.name
    file.write('linea numero 1\n')
    file.write('linea numero 2')

In [34]:
# Let's check that the file was really created and kept.
# It was written as UTF-8, so it is read back with the same encoding instead
# of relying on the platform default.

if os.path.isfile(file_name):
    with open(file_name, 'r', encoding='utf-8') as file:
        print(file.read())


linea numero 1
linea numero 2


In [5]:
# Serializing Python objects: the best way is to use pickle or joblib.
# The difference is that pickle needs a file opened (e.g. in a context manager)
# in order to write, whereas joblib can write directly to a path.

import pickle
import joblib

In [6]:
# joblib.dump takes the target path directly and returns the list of
# file names it wrote
my_list_to_export = [1,2,34,5]
joblib.dump(my_list_to_export, 'assets/my_exported_list.pkl')

['assets/my_exported_list.pkl']

In [7]:
# Load the object back from disk.
# NOTE: joblib.load is pickle-based, so only load files from trusted sources.
my_imported_list = joblib.load('assets/my_exported_list.pkl')

In [8]:
# Check that the round trip preserved the content
print(my_list_to_export == my_imported_list)

True


In [15]:
# Almost any Python object can be serialized this way. However, for large
# objects such as big NumPy arrays or DataFrames, a text format such as
# JSON/CSV may be a better option; pandas is a convenient handler for that.

import numpy as np
import pandas as pd
# Large 2-D array of zeros used as sample data for the CSV round trip
zero_arrays = np.zeros((21123, 234))

In [30]:
# index = False leaves the row index out of the CSV, so only the data columns are written
pd.DataFrame(zero_arrays).to_csv('assets/zero_arrays.csv', index = False)

In [31]:
# Read the CSV back and convert it to a NumPy array.
# to_numpy() is the accessor pandas recommends over the older `.values`.
zerro_arrays_imported = pd.read_csv('assets/zero_arrays.csv').to_numpy()

In [32]:
# Shape of the original array
zero_arrays.shape

(21123, 234)

In [33]:
# Shape of the array read back from the CSV
zerro_arrays_imported.shape

(21123, 234)

In [34]:
# array_equal() is True only when both arrays have the same shape and elements
np.array_equal(zero_arrays, zerro_arrays_imported)

True