# Del 04: Delo s tekstovnimi datotekami

### Osnove branja in pisanja za tekstovne datoteke

#### Line Endings

    Pug\r\n
    Jack Russell Terrier\r\n
    English Springer Spaniel\r\n
    German Shepherd\r\n
    Staffordshire Bull Terrier\r\n
    Cavalier King Charles Spaniel\r\n
    Golden Retriever\r\n
    West Highland White Terrier\r\n
    Boxer\r\n
    Border Terrier\r\n

    Pug\r
    \n
    Jack Russell Terrier\r
    \n
    English Springer Spaniel\r
    \n
    German Shepherd\r
    \n
    Staffordshire Bull Terrier\r
    \n
    Cavalier King Charles Spaniel\r
    \n
    Golden Retriever\r
    \n
    West Highland White Terrier\r
    \n
    Boxer\r
    \n
    Border Terrier\r
    \n

#### Character Encodings

### Opening and Closing a File in Python

In [None]:
file = open('data/test.txt')

In [None]:
file.close()

In [None]:
reader = open('data/test.txt')
try:
    # Further file processing goes here
    pass
finally:
    reader.close()

In [None]:
with open('data/test.txt') as reader:
    # Further file processing goes here
    pass

In [None]:
with open('data/test.txt', 'r') as reader:
    # Further file processing goes here
    pass

<table class="table table-hover">
<thead>
<tr>
<th>Character</th>
<th>Meaning</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>'r'</code></td>
<td>Open for reading (default)</td>
</tr>
<tr>
<td><code>'w'</code></td>
<td>Open for writing, truncating (overwriting) the file first</td>
</tr>
<tr>
<td><code>'rb'</code> or <code>'wb'</code></td>
<td>Open in binary mode (read/write using byte data)</td>
</tr>
</tbody>
</table>

[open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None)](https://docs.python.org/3/library/functions.html#open)

<table class="docutils align-default" id="index-5">
<colgroup>
<col style="width: 13%">
<col style="width: 88%">
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Character</p></th>
<th class="head"><p>Meaning</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">'r'</span></code></p></td>
<td><p>open for reading (default)</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">'w'</span></code></p></td>
<td><p>open for writing, truncating the file first</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">'x'</span></code></p></td>
<td><p>open for exclusive creation, failing if the file already exists</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">'a'</span></code></p></td>
<td><p>open for writing, appending to the end of the file if it exists</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">'b'</span></code></p></td>
<td><p>binary mode</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">'t'</span></code></p></td>
<td><p>text mode (default)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">'+'</span></code></p></td>
<td><p>open for updating (reading and writing)</p></td>
</tr>
</tbody>
</table>

#### File path Python

In [None]:
from pathlib import Path


def get_absolute_file_path(relative_path: str) -> str:
    """Build a path to ``relative_path`` anchored at this file's directory.

    Args:
        relative_path: Path expressed relative to the directory that
            contains this source file (taken from ``__file__``).

    Returns:
        The combined path as a string.
    """
    base_dir = Path(__file__).parent
    return str(base_dir / relative_path)


local_path = "data/example.txt"
file_path = get_absolute_file_path(local_path)

with open(file_path, "r") as f:
    print(f.read())


#### Text File Types

    open('abc.txt')

    open('abc.txt', 'r')

    open('abc.txt', 'w')

In [None]:
with open('data/test.txt') as file:
    print(type(file))

#### More about encodings

In [None]:
import sys
sys.getdefaultencoding()

In [None]:
with open('data/latin_encoding.txt', 'rt', encoding='latin-1') as f:
    print(f.read())

#### More open() function arguments

In [None]:
# Read with disabled newline translation
with open('data/test.txt', 'rt', newline='') as f:
    pass

In [None]:
#with open('data/ascii_read.txt', 'wt', encoding='utf-16') as f:
#    print(f.write('Test sfsfr refef'))

In [None]:
with open('data/ascii_read.txt', 'rt', encoding='ascii') as f:
    print(f.read())

In [None]:
# Replace bad chars with Unicode U+fffd replacement char
with open('data/ascii_read.txt', 'rt', encoding='ascii', errors='replace') as f:
    print(f.read())

In [None]:
# Ignore bad chars entirely
with open('data/ascii_read.txt', 'rt', encoding='ascii', errors='ignore') as f:
    print(f.read())

### Reading and Writing Opened Files

<div class="table-responsive">
<table class="table table-hover">
<thead>
<tr>
<th>Method</th>
<th>What It Does</th>
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://docs.python.org/3.7/library/io.html#io.RawIOBase.read"><code>.read(size=-1)</code></a></td>
<td>This reads from the file based on the number of <code>size</code> bytes. If no argument is passed or <code>None</code> or <code>-1</code> is passed, then the entire file is read.</td>
</tr>
<tr>
<td><a href="https://docs.python.org/3.7/library/io.html#io.IOBase.readline"><code>.readline(size=-1)</code></a></td>
<td>This reads at most <code>size</code> number of characters from the line. This continues to the end of the line and then wraps back around. If no argument is passed or <code>None</code> or <code>-1</code> is passed, then the entire line (or rest of the line) is read.</td>
</tr>
<tr>
<td><a href="https://docs.python.org/3.7/library/io.html#io.IOBase.readlines"><code>.readlines()</code></a></td>
<td>This reads the remaining lines from the file object and returns them as a list.</td>
</tr>
</tbody>
</table>
</div>

In [None]:
with open('data/test.txt', 'r') as reader:
    # Read & print the entire file
    print(reader.read())

In [None]:
with open('data/test.txt', 'r') as reader:
    # Read & print the first 5 characters of the line 5 times
    print(reader.readline(5))
    # Notice that line is greater than the 5 chars and continues
    # down the line, reading 5 chars each time until the end of the
    # line and then "wraps" around
    print(reader.readline(5))
    print(reader.readline(5))
    print(reader.readline(5))
    print(reader.readline(5))

In [None]:
with open('data/test.txt', 'r') as f:
    data = f.readlines()  # Returns a list object
    print(data)

In [None]:
with open('data/test.txt', 'r') as f:
    data = list(f)
    print(data)

#### Iterating Over Each Line in the File

    print(line.rstrip())

    print(line, end='')

<hr>

In [None]:
with open('data/test.txt', 'r') as reader:
    # Print the file one line at a time
    while True:
        line = reader.readline()
        # readline() returns an empty string once the end of file is reached
        if line == '':
            break
        print(line, end='')

In [None]:
with open('data/test.txt', 'r') as reader:
    for line in reader.readlines():
        print(line, end='')

In [None]:
with open('data/test.txt', 'r') as reader:
    for line in reader:
        print(line, end='')

#### Writing lines

<div class="table-responsive">
<table class="table table-hover">
<thead>
<tr>
<th>Method</th>
<th>What It Does</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>.write(string)</code></td>
<td>This writes the string to the file.</td>
</tr>
<tr>
<td><code>.writelines(seq)</code></td>
<td>This writes the sequence to the file. No line endings are appended to each sequence item. It’s up to you to add the appropriate line ending(s).</td>
</tr>
</tbody>
</table>
</div>

In [None]:
with open('data/test.txt', 'r') as f:
    # Note: readlines doesn't trim the line endings
    lines = f.readlines()

with open('data/test_reversed.txt', 'w') as f:
    # Alternatively you could use
    # f.writelines(reversed(lines))

    # Write the dog breeds to the file in reversed order
    for line in reversed(lines):
        f.write(line)

In [None]:
with open('data/test_reversed.txt', 'r') as f:
    # Note: readlines doesn't trim the line endings
    lines = f.readlines()

print(lines)

#### Change file cursor position

In [None]:
f = open('data/test.txt', 'r')

In [None]:
f.read(8)

In [None]:
f.read(30)

In [None]:
f.read(150)

In [None]:
f.read()

In [None]:
f.tell()    # get the current file position

In [None]:
f.seek(0)   # bring file cursor to initial position

In [None]:
f.read(10)

In [None]:
f.tell() 

In [None]:
print(f.read())
# This handle was opened with a bare open() rather than a with-block,
# so it has to be closed explicitly once the last read is done
f.close()

#### Vaja: Pretvorba vsebine datoteke

In [None]:
def lower2upper(input_str: str) -> str:
    """Return ``input_str`` converted to upper case via ``str.upper``."""
    return input_str.upper()

def converter(source_file: str, dest_file: str):
    """Write an upper-cased copy of ``source_file`` to ``dest_file``.

    Args:
        source_file: Path of the text file to read.
        dest_file: Path of the file to write; it is opened in ``'w'`` mode,
            so existing content is overwritten.
    """
    # Read everything first so the source is closed before writing starts
    with open(source_file, 'r') as src:
        original_text = src.read()

    converted_text = lower2upper(original_text)

    with open(dest_file, 'w') as dst:
        dst.write(converted_text)
        
if __name__ == "__main__":
    # Convert the content of the source file to upper case and
    # write the result to the destination file
    s_file = 'data/test.txt'
    d_file = 'data/test_converted.txt'
    converter(s_file, d_file)

In [None]:
def converter(file_path: str):
    """Write an upper-cased copy of a text file next to the original.

    The copy is named after the source with ``_upper`` inserted before the
    extension, e.g. ``data/test.txt`` -> ``data/test_upper.txt``. A path
    without an extension simply gets ``_upper`` appended.

    Args:
        file_path: Path of the text file to read.
    """
    # Local import keeps this cell runnable on its own
    import os

    # splitext only splits off the final extension, so dots elsewhere in the
    # path ("./data/test.txt", "notes.v2.txt") and extension-less names are
    # handled; unpacking str.split(".") raised ValueError for all of those.
    src_path, suffix = os.path.splitext(file_path)
    dest_path = f"{src_path}_upper{suffix}"

    with open(file_path, "r") as reader:
        content = reader.read()

    with open(dest_path, "w") as writer:
        writer.write(content.upper())


if __name__ == "__main__":
    converter("data/test.txt")

#### Vaja: Find the longest words

In [None]:
def longest_word(filename):
    """Return every word of maximal length found in a text file.

    Words are the whitespace-separated tokens of the file content.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A list with all words whose length equals the longest length found,
        in order of appearance (repeated words appear repeatedly). The list
        is empty when the file contains no words.
    """
    with open(filename, 'r') as infile:
        words = infile.read().split()
    # default=0 keeps an empty or whitespace-only file from raising
    # ValueError; no word has length 0, so the result is then []
    max_len = max(map(len, words), default=0)
    return [word for word in words if len(word) == max_len]

print(longest_word('data/search_file.txt'))

### Triki in nasveti pri branju in pisanju datotek

#### Writing to a File That Doesn’t Already Exist

In [None]:
with open('data/exists.txt', 'wt') as f:
    f.write('Hello\n')

In [None]:
with open('data/exists.txt', 'xt') as f:
    f.write('Hello\n')

In [None]:
import os

if not os.path.exists('data/exists.txt'):
    with open('data/exists.txt', 'wt') as f:
        f.write('Hello\n')
else:
    print('File already exists!')

#### Appending to a File

In [None]:
with open('data/exists.txt', 'a') as a_writer:
    a_writer.write('\nBeagle\n')

In [None]:
with open('data/exists.txt', 'r') as reader:
    print(reader.read())

#### Working With Two Files at the Same Time

In [None]:
d_path = 'data/test.txt'
d_r_path = 'data/test_reversed.txt'
with open(d_path, 'r') as reader, open(d_r_path, 'w') as writer:
    dog_breeds = reader.readlines()
    writer.writelines(reversed(dog_breeds))

#### Search for a string in text files

In [None]:
with open('data/search_file.txt') as f:
    if 'Python' in f.read():
        print("true")
        

In [None]:
# Find every occurrence of the search word and collect the start index
# of each match
import re

with open('data/search_file.txt') as f:
    content = f.read()
    find_results = [m.start() for m in re.finditer('Python', content)]
    print(find_results)

#### Counts words in a text file

In [None]:
from collections import Counter

with open('data/search_file.txt', 'r') as f:
    text = f.read()

# Normalise the tokens: lower-case them, skip URLs, and keep only the
# alphanumeric characters of each word (drops punctuation such as . , ( ))
lowered = (word.lower() for word in text.split())
stripped = (
    ''.join(c for c in word if c.isalnum())
    for word in lowered
    if not word.startswith('http')
)
# A token made up solely of punctuation (e.g. "-") collapses to '' above;
# drop those so the empty string is not counted as a word
text_splited = [word for word in stripped if word]

wordcount = Counter(text_splited)
print(wordcount.most_common(10))

#### Replace String in File

In [None]:
# Copy the source file line by line into a new file, replacing 'INFO'
# with 'ERROR-MESSAGE' on the way
d_path = 'data/test.txt'
d_r_path = 'data/test_edited.txt'
with open(d_path, 'r') as reader, open(d_r_path, 'w') as writer:
    for line in reader:
        # Replacement logic and any conditions go here
        writer.write(line.replace('INFO', 'ERROR-MESSAGE'))

#### Counting lines in a file

In [None]:
def file_len(fname):
    """Return the number of lines in the text file ``fname``.

    Args:
        fname: Path of the file to count.

    Returns:
        The line count; 0 for an empty file. A final line without a
        trailing newline still counts as a line.
    """
    with open(fname) as f:
        # Counting the iterations directly avoids the off-by-one of
        # "last enumerate index + 1", which reported 1 for an empty file
        return sum(1 for _ in f)

In [None]:
print(file_len('data/test.txt'))

#### Keeping the Last N Items

In [None]:
from collections import deque

def search(lines, pattern, history=5):
    """Yield each line containing ``pattern`` together with the lines before it.

    Args:
        lines: Iterable of lines to scan.
        pattern: Value tested with ``in`` against every line.
        history: Maximum number of preceding lines to remember.

    Yields:
        ``(line, previous_lines)`` pairs. ``previous_lines`` is the same
        deque object on every yield and keeps changing as the scan goes on,
        so copy it if it must be kept.
    """
    recent = deque(maxlen=history)
    for current in lines:
        if pattern in current:
            yield current, recent
        # Remember the line only after yielding so a match never shows up
        # in its own history
        recent.append(current)

In [None]:
# Example use on a file
if __name__ == '__main__':
    with open('data/weblog.csv', 'r') as f:
        for line, prevlines in search(f, '/bootstrap-3.3.7/js/bootstrap.min.js HTTP/1.1,304', 5):
            for pline in prevlines:
                print(pline, end='')
            print(line, end='')
            print('-'*20)

In [None]:
q = deque(maxlen=3)

In [None]:
q.append(1)

In [None]:
q.append(2)

In [None]:
q.append(3)

In [None]:
q

In [None]:
q.append(4)

In [None]:
q

In [None]:
q.append(5)

In [None]:
q

In [None]:
q = deque()
q.append(1)
q.append(2)
q.append(3)
q

In [None]:
q.appendleft(4)
q

In [None]:
q.pop()
q

In [None]:
q.popleft()
q

#### Skipping the First Part of a file

In [None]:
with open('data/userdb.txt') as f:
    for line in f:
        print(line, end='')

In [None]:
# Could be given as an exercise to try on their own -> a longer,
# manual approach follows further below
from itertools import dropwhile

with open('data/userdb.txt') as f:
    for line in dropwhile(lambda line: line.startswith('#'), f):
        print(line, end='')

In [None]:
from itertools import islice

with open('data/userdb.txt') as f:
    for line in islice(f, 7, None):
        print(line, end='')

In [None]:
with open('data/userdb.txt') as f:
    # Skip over initial comments; '' (end of file) does not start with '#',
    # so it also ends this loop
    line = next(f, '')
    while line.startswith('#'):
        line = next(f, '')

    # Process remaining lines until the file is exhausted
    while line:
        # Replace with useful processing
        print(line, end='')
        line = next(f, None)

In [None]:
# Use this when every line that starts with '#' should be skipped,
# wherever it appears in the file
with open('data/userdb.txt') as f:
    lines = (line for line in f if not line.startswith('#'))
    for line in lines:
        print(line, end='')

### Reading Multiple Files

    import fileinput

    for line in fileinput.input():
        process(line)

In [None]:
import fileinput
#with fileinput.input(files=('data/multiple_files/file1.txt', 'data/multiple_files/file2.txt')) as f:
with fileinput.input(['data/multiple_files/file1.txt', 'data/multiple_files/file2.txt']) as f:
    for line in f:
        print(line, end='')

In [None]:
!mkdir data/multiple_files
# naridmo 3 file v tej mapi in notri damo cat

    cat file1 file2 file3

In [None]:
# cat.py
import fileinput

with fileinput.input() as stream:
    for text_line in stream:
        # Print a header whenever the first line of a file is reached
        starts_new_file = stream.isfirstline()
        if starts_new_file:
            print(f'\n--- Reading {stream.filename()} ---')
        print(' -> ' + text_line, end='')
    print()