# 09. Getting Data

## Imports

In [1]:
import csv

import requests
from bs4 import BeautifulSoup

## STDIN and STDOUT Objects

In [2]:
# Shell pipeline: the file is streamed through two helper scripts via stdin/stdout.
# Presumably egrep.py keeps the lines matching the regex "[0-9]" and
# line_count.py counts what it receives — TODO confirm against ../support
!cat ../data/file.txt | python3 ../support/egrep.py "[0-9]" | python3 ../support/line_count.py

2


In [3]:
# Shell pipeline: the file is fed to most_common_words.py on stdin.
# Presumably the argument is the number of top words to print, each with its
# count — TODO confirm against ../support/most_common_words.py
!cat ../data/file.txt | python3 ../support/most_common_words.py 1

6	is


## Working with Files

#### Text files

In [4]:
# Mode "w" opens the file for writing and truncates any existing content

file_for_writing = open(file="../data/temp.txt", mode="w")

In [5]:
# Reading is the default mode, so passing "r" explicitly is optional:
# both calls below open the file the same way

file_for_reading1 = open(file="../data/temp.txt")
file_for_reading2 = open(file="../data/temp.txt", mode="r")

In [6]:
# Mode "a" opens the file for appending: existing content is kept and
# writes go to the end of the file

file_for_append = open(file="../data/temp.txt", mode="a")

In [7]:
# Handles opened manually must be closed explicitly; closing flushes any
# buffered writes to disk. They are closed in reverse order of opening.

for open_file in (
    file_for_append,
    file_for_reading2,
    file_for_reading1,
    file_for_writing,
):
    open_file.close()

In [8]:
# The `with` statement closes the file automatically when the block exits,
# even if reading raises an exception

with open(file="../data/temp.txt", mode="r") as temp:
    data = temp.read()

#### Delimited files

In [9]:
# Read the delimited file row by row with csv.reader; every field comes back
# as a string and is accessed by position.
# newline="" hands line-ending handling to the csv module, as its
# documentation requires (otherwise quoted fields containing newlines break).
with open("../data/file.csv", newline="") as file:
    reader = csv.reader(file, delimiter=",")
    for row in reader:
        date, company, price = row[0], row[1], row[2]

# Printed after the loop on purpose: only the last row of the file is shown
print(date, company, price)

27.08.2024 magnit 5128.0


In [10]:
# Same file read through csv.DictReader: the first row supplies the keys, so
# fields are accessed by column name instead of position.
# newline="" hands line-ending handling to the csv module, as its
# documentation requires.
with open("../data/file.csv", newline="") as file:
    reader = csv.DictReader(file, delimiter=",")
    for row in reader:
        # Assign to `date` (not `data`): the print below must show the value read
        # here rather than a leftover from an earlier cell
        date = row["date"]
        company = row["company"]
        price = row["price"]

# Printed after the loop on purpose: only the last row of the file is shown
print(date, company, price)

27.08.2024 magnit 5128.0


In [11]:
prices = {"key": "value"}

# Write one "company,price" row per dictionary item.
# newline="" is required when writing with the csv module: the writer emits
# "\r\n" itself, and without it platforms that translate "\n" (Windows) end up
# with "\r\r\n", i.e. a blank line after every row.
with open("../data/temp.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter=",")
    for company, price in prices.items():
        writer.writerow([company, price])

## Getting Data from the Internet

In [12]:
url = "https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html"

# requests has no default timeout, so an unresponsive server would hang the
# cell forever; 10 seconds is ample for a small static page.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page as data
response.raise_for_status()

html = response.text
soup = BeautifulSoup(html, "html5lib")

In [13]:
# The first <p> tag in the document (attribute access is shorthand for find)

first_paragraph = soup.p

In [14]:
# Text content of the first <p> tag

first_paragraph_text = soup.find("p").get_text()

In [15]:
# Whitespace-separated words from the text of the first <p> tag

first_paragraph_words = soup.find("p").get_text().split()

In [16]:
# Value of the `id` attribute of the first <p> tag (None when it has no id)

first_paragraph_id = soup.find("p").get("id")

In [17]:
# Every <p> tag in the document (calling the soup is shorthand for find_all)

all_paragraphs = soup("p")

In [18]:
# Only the <p> tags that carry a non-empty `id` attribute

paragraphs_with_ids = [
    paragraph for paragraph in soup.find_all("p") if paragraph.get("id")
]

In [19]:
# Three equivalent ways to collect the <p> tags whose class list contains "important"

# 1. explicit attribute filter
important_paragraphs = soup.find_all("p", {"class": "important"})
# 2. a bare string as the second argument is matched against the CSS class
important_paragraphs = soup.find_all("p", "important")
# 3. manual filter over every <p> tag
important_paragraphs = [
    paragraph
    for paragraph in soup.find_all("p")
    if "important" in paragraph.get("class", [])
]

In [20]:
# Every <span> located inside a <div>; a span nested in several <div> tags
# appears once for each enclosing <div>

spans_inside_divs = [
    inner_span
    for outer_div in soup.find_all("div")
    for inner_span in outer_div.find_all("span")
]