## File I/O

In [1]:
import pandas as pd
import json
import requests
import lxml # html.parser가 파이썬에 이미 내장되어있으나, 성능이 좋지 않아 lxml 사용
from bs4 import BeautifulSoup
from random import randint

In [2]:
f = open("./sample_data/users.txt", 'r')

In [3]:
f

<_io.TextIOWrapper name='./sample_data/users.txt' mode='r' encoding='UTF-8'>

In [4]:
f.readline()  # read a line

''

In [5]:
f = open("./sample_data/README.md", "r")

In [6]:
f

<_io.TextIOWrapper name='./sample_data/README.md' mode='r' encoding='UTF-8'>

In [7]:
f.readline()

'This directory includes a few sample datasets to get you started.\n'

In [8]:
f.readlines() # read lines and return as list

['\n',
 '*   `california_housing_data*.csv` is California housing data from the 1990 US\n',
 '    Census; more information is available at:\n',
 '    https://developers.google.com/machine-learning/crash-course/california-housing-data-description\n',
 '\n',
 '*   `mnist_*.csv` is a small sample of the\n',
 '    [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is\n',
 '    described at: http://yann.lecun.com/exdb/mnist/\n',
 '\n',
 '*   `anscombe.json` contains a copy of\n',
 "    [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet); it\n",
 '    was originally described in\n',
 '\n',
 "    Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American\n",
 '    Statistician. 27 (1): 17-21. JSTOR 2682899.\n',
 '\n',
 '    and our copy was prepared by the\n',
 '    [vega_datasets library](https://github.com/altair-viz/vega_datasets/blob/4f67bdaad10f45e3549984e17e1b3088c731503d/vega_datasets/_data/anscombe.json).\n']

In [9]:
f.close()

In [10]:
f = open('./hello.txt', 'w')

In [11]:
f.write('file I/O class\n')

15

In [12]:
f.write('contents')
f.write('- How to start')
f.write('- Installation')
f.write('-Features')

9

In [13]:
f.close()

In [14]:
f = open('./hello.txt', 'r')
f.readlines()

['file I/O class\n', 'contents- How to start- Installation-Features']

In [15]:
f = open('./hello.txt', 'a')
for i in range(1, 10 + 1):
  f.write(f'{i}')

In [16]:
f.close()

In [17]:
f = open('./hello.txt', 'a')
for i in range(11, 20 + 1):
  f.write(f'{i}\n')

f.close()

In [18]:
f = open('./hello.txt', 'r')
f.readlines()

['file I/O class\n',
 'contents- How to start- Installation-Features1234567891011\n',
 '12\n',
 '13\n',
 '14\n',
 '15\n',
 '16\n',
 '17\n',
 '18\n',
 '19\n',
 '20\n']

In [19]:
f = open('./hello.txt', 'a')
for i in range(21, 30 + 1):
  f.write(f'{i}\n')

f.close()

In [20]:
f = open('./hello.txt', 'r')
text_lines = f.readlines()
for line in text_lines:
  print(line)
f.close()

file I/O class

contents- How to start- Installation-Features1234567891011

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30



In [21]:
with open('./hello.txt', 'r') as f:
  result = []
  while True:
    line = f.readline()
    if not line: break
    result.append(line)
result

['file I/O class\n',
 'contents- How to start- Installation-Features1234567891011\n',
 '12\n',
 '13\n',
 '14\n',
 '15\n',
 '16\n',
 '17\n',
 '18\n',
 '19\n',
 '20\n',
 '21\n',
 '22\n',
 '23\n',
 '24\n',
 '25\n',
 '26\n',
 '27\n',
 '28\n',
 '29\n',
 '30\n']

In [22]:
with open('./learn-korean.txt', 'w', encoding='CP949') as f:
  f.write('안녕하세요\n')

In [23]:
with open('./learn-korean.txt', 'r', encoding='CP949') as f:
  res = f.readlines()

In [24]:
res

['안녕하세요\n']

In [25]:
comma_spread = 'John Doe, 26, CA, Student'

In [26]:
print(comma_spread.split(','))

['John Doe', ' 26', ' CA', ' Student']


In [27]:
df = pd.read_csv('./sample_data/california_housing_test.csv')
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
...,...,...,...,...,...,...,...,...,...
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790,225000.0
2996,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906,237200.0
2997,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0
2998,-117.12,34.10,40.0,96.0,14.0,46.0,14.0,3.2708,162500.0


In [28]:
df[['longitude', 'latitude']]

Unnamed: 0,longitude,latitude
0,-122.05,37.37
1,-118.30,34.26
2,-117.81,33.78
3,-118.36,33.82
4,-119.67,36.33
...,...,...
2995,-119.86,34.42
2996,-118.14,34.06
2997,-119.70,36.30
2998,-117.12,34.10


In [29]:
# dict -> DataFrame -> csv

users = [{
        'name': 'Jane',
        'age': 15,
        'locale': 'CA'
    },
    {
        'name': 'Jessie',
        'age': 19,
        'locale': 'NY'
    },
    {
        'name': 'Ava',
        'age': 24,
        'locale': 'CA'
    },
]

In [30]:
df = df.from_dict(users)

In [31]:
df.to_csv('./users.csv')

In [32]:
with open('./users.csv') as f:
  for line in f.readlines():
    print(line)

,name,age,locale

0,Jane,15,CA

1,Jessie,19,NY

2,Ava,24,CA



In [33]:
with open('users.json', 'w') as f:
  user_dict = {}
  user_dict.setdefault('users', []).append({
      'name': 'John Doe',
      'age': 40,
      'locale': 'LA, USA',
  })
  json.dump(user_dict, f) # when you make json, append mode is not recommended

In [34]:
with open('users.json') as f:
  result = f.readlines()
  print(result)

['{"users": [{"name": "John Doe", "age": 40, "locale": "LA, USA"}]}']


In [35]:
type(result[0])

str

In [36]:
with open('users.json') as f:
  from_json = json.load(f)

In [37]:
from_json['users'][0]['name']

'John Doe'

In [38]:
response = requests.get('https://www.rottentomatoes.com/')
response.status_code

200

In [39]:
response.text

'<!DOCTYPE html>\n<html lang="en" dir="ltr" xmlns="http://www.w3.org/1999/xhtml" prefix="fb: http://www.facebook.com/2008/fbml og: http://opengraphprotocol.org/schema/">\n    <head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">\n        \n        <script src="https://cdn.cookielaw.org/consent/7e979733-6841-4fce-9182-515fac69187f/otSDKStub.js"\n            type="text/javascript"\n            charset="UTF-8"\n            data-domain-script="7e979733-6841-4fce-9182-515fac69187f"\n            integrity="sha384-WEHwEli88wqOiQd913F1utFZiwisa8XhCkbjLnbKEpFa/WbFcPKeGg7h4fdsv0Z/"\n            crossorigin="anonymous">\n        </script>\n        <script type="text/javascript">\n        function OptanonWrapper() { }\n        </script>\n        \n        \n        <script src="https://cdn.cookielaw.org/opt-out/otCCPAiab.js"\n            type="text/javascript"\n            charset="UTF-8"\n            ccpa-opt-out-ids="dummy"\n            ccpa-opt-out-geo=

In [40]:
soup = BeautifulSoup(response.text, 'lxml')

In [41]:
uls = soup.find_all('ul', attrs={'slot': 'list-items'})

In [42]:
len(uls)

3

In [43]:
streaming_chart = uls[0]

In [44]:
lis = streaming_chart.find_all('li')

In [45]:
len(lis)

10

In [46]:
lis[0]

<li>
<a href="/m/the_menu">
<span class="dynamic-text-list__item-title clamp clamp-1">The Menu</span>
</a>
<a class="dynamic-text-list__tomatometer-group" href="/m/the_menu">
<span class="icon icon--tiny icon__certified-fresh" slot="tomatometer-icon"></span>
<span class="b--medium " slot="tomatometer-value">
                                
                                    89%
                                
                            </span>
</a>
</li>

In [47]:
uri = 'https://www.rottentomatoes.com' + lis[0].find('a')['href']
uri

'https://www.rottentomatoes.com/m/the_menu'

In [48]:
spans = lis[0].find_all('span')
movie_name = spans[0].text

In [49]:
movie_score = int(spans[2].text.strip('\n %'))

In [50]:
result = []

for li in lis:
  uri = 'https://www.rottentomatoes.com' + li.find('a')['href']
  spans = li.find_all('span')
  movie_name = spans[0].text
  movie_score = spans[2].text.strip('\n %')
  print(uri, movie_name, movie_score)
  result.append({
      'uri': uri, 
      'movie_name' : movie_name, 
      'movie_score': movie_score,
      })

https://www.rottentomatoes.com/m/the_menu The Menu 89
https://www.rottentomatoes.com/m/puss_in_boots_the_last_wish Puss in Boots: The Last Wish 96
https://www.rottentomatoes.com/m/the_pale_blue_eye The Pale Blue Eye 64
https://www.rottentomatoes.com/m/the_banshees_of_inisherin The Banshees of Inisherin 97
https://www.rottentomatoes.com/m/the_fabelmans The Fabelmans 92
https://www.rottentomatoes.com/m/everything_everywhere_all_at_once Everything Everywhere All at Once 95
https://www.rottentomatoes.com/m/glass_onion_a_knives_out_mystery Glass Onion: A Knives Out Mystery 92
https://www.rottentomatoes.com/m/mars_one Mars One 97
https://www.rottentomatoes.com/m/the_hatchet_wielding_hitchhiker The Hatchet Wielding Hitchhiker - -
https://www.rottentomatoes.com/m/devotion_2022 Devotion 81


In [51]:
result

[{'uri': 'https://www.rottentomatoes.com/m/the_menu',
  'movie_name': 'The Menu',
  'movie_score': '89'},
 {'uri': 'https://www.rottentomatoes.com/m/puss_in_boots_the_last_wish',
  'movie_name': 'Puss in Boots: The Last Wish',
  'movie_score': '96'},
 {'uri': 'https://www.rottentomatoes.com/m/the_pale_blue_eye',
  'movie_name': 'The Pale Blue Eye',
  'movie_score': '64'},
 {'uri': 'https://www.rottentomatoes.com/m/the_banshees_of_inisherin',
  'movie_name': 'The Banshees of Inisherin',
  'movie_score': '97'},
 {'uri': 'https://www.rottentomatoes.com/m/the_fabelmans',
  'movie_name': 'The Fabelmans',
  'movie_score': '92'},
 {'uri': 'https://www.rottentomatoes.com/m/everything_everywhere_all_at_once',
  'movie_name': 'Everything Everywhere All at Once',
  'movie_score': '95'},
 {'uri': 'https://www.rottentomatoes.com/m/glass_onion_a_knives_out_mystery',
  'movie_name': 'Glass Onion: A Knives Out Mystery',
  'movie_score': '92'},
 {'uri': 'https://www.rottentomatoes.com/m/mars_one',
  'm

In [52]:
base_uri = 'https://jsonplaceholder.typicode.com/'

users_uri = base_uri + 'users'
users_uri

'https://jsonplaceholder.typicode.com/users'

In [53]:
response = requests.get(users_uri)

In [54]:
response.status_code

200

In [55]:
response.text

'[\n  {\n    "id": 1,\n    "name": "Leanne Graham",\n    "username": "Bret",\n    "email": "Sincere@april.biz",\n    "address": {\n      "street": "Kulas Light",\n      "suite": "Apt. 556",\n      "city": "Gwenborough",\n      "zipcode": "92998-3874",\n      "geo": {\n        "lat": "-37.3159",\n        "lng": "81.1496"\n      }\n    },\n    "phone": "1-770-736-8031 x56442",\n    "website": "hildegard.org",\n    "company": {\n      "name": "Romaguera-Crona",\n      "catchPhrase": "Multi-layered client-server neural-net",\n      "bs": "harness real-time e-markets"\n    }\n  },\n  {\n    "id": 2,\n    "name": "Ervin Howell",\n    "username": "Antonette",\n    "email": "Shanna@melissa.tv",\n    "address": {\n      "street": "Victor Plains",\n      "suite": "Suite 879",\n      "city": "Wisokyburgh",\n      "zipcode": "90566-7771",\n      "geo": {\n        "lat": "-43.9509",\n        "lng": "-34.4618"\n      }\n    },\n    "phone": "010-692-6593 x09125",\n    "website": "anastasia.net",\n  

In [56]:
result = response.json()

In [57]:
result

[{'id': 1,
  'name': 'Leanne Graham',
  'username': 'Bret',
  'email': 'Sincere@april.biz',
  'address': {'street': 'Kulas Light',
   'suite': 'Apt. 556',
   'city': 'Gwenborough',
   'zipcode': '92998-3874',
   'geo': {'lat': '-37.3159', 'lng': '81.1496'}},
  'phone': '1-770-736-8031 x56442',
  'website': 'hildegard.org',
  'company': {'name': 'Romaguera-Crona',
   'catchPhrase': 'Multi-layered client-server neural-net',
   'bs': 'harness real-time e-markets'}},
 {'id': 2,
  'name': 'Ervin Howell',
  'username': 'Antonette',
  'email': 'Shanna@melissa.tv',
  'address': {'street': 'Victor Plains',
   'suite': 'Suite 879',
   'city': 'Wisokyburgh',
   'zipcode': '90566-7771',
   'geo': {'lat': '-43.9509', 'lng': '-34.4618'}},
  'phone': '010-692-6593 x09125',
  'website': 'anastasia.net',
  'company': {'name': 'Deckow-Crist',
   'catchPhrase': 'Proactive didactic contingency',
   'bs': 'synergize scalable supply-chains'}},
 {'id': 3,
  'name': 'Clementine Bauch',
  'username': 'Samantha

In [58]:
result[0].keys()

dict_keys(['id', 'name', 'username', 'email', 'address', 'phone', 'website', 'company'])

In [59]:
response = requests.get('https://finance.yahoo.com/quote/GC%3DF?p=GC%3DF')
response.status_code

200

In [60]:
soup = BeautifulSoup(response.text, 'lxml')

In [61]:
uls = soup.find_all('div', attrs={'id': 'quoteNewsStream-0-Stream'})

In [62]:
uls

[<div class="tdv2-applet-stream Bdc(#e2e2e6) Pos(r) Z(1)" id="quoteNewsStream-0-Stream" style="max-width:900px"><ul class="My(0) P(0) Wow(bw) Ov(h)"><li class="js-stream-content Pos(r)"><div class="Py(14px) Pos(r)" data-test-locator="mega"><div class="Cf"><div class="Fl(start) Pos(r) Mt(2px) W(26.5%) Maw(220px)"><div class="H(0) Ov(h) Bdrs(2px)" style="padding-bottom:88%"><img alt="" class=" W(100%) Trsdu(0s)! Bdrs(2px)" src="https://s.yimg.com/uu/api/res/1.2/clny9EnGhKbu.baAdIq6aA--~B/Zmk9c3RyaW07aD0xOTM7cT04MDt3PTIyMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/coindesk_75/bfe3376f92bb834557a7e66d5ed8bf0a" srcset="https://s.yimg.com/uu/api/res/1.2/clny9EnGhKbu.baAdIq6aA--~B/Zmk9c3RyaW07aD0xOTM7cT04MDt3PTIyMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/coindesk_75/bfe3376f92bb834557a7e66d5ed8bf0a 1x,https://s.yimg.com/uu/api/res/1.2/qYPllZGVSvxqTEyQW5k79A--~B/Zmk9c3RyaW07aD0zODY7cT04MDt3PTQ0MDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/coindesk_75/bfe3376f92bb834557a7e

In [63]:
len(uls)

1

In [64]:
article_board = uls[0]

In [65]:
lis = article_board.find_all('li')

In [66]:
len(lis)

24

In [67]:
article = lis[0]

In [68]:
article = article.find('a')

In [69]:
title = article.text
title

"First Mover Asia: What's Next for Bitcoin After Biggest One-Day Price Pop in 2 Months?"

In [70]:
base_uri = 'https://finance.yahoo.com'
uri = base_uri + article['href']

In [71]:
result = []

for li in lis:
  article = li.find('a')
  title = article.text
  uri = base_uri + article['href']
  result.append({
      'title': title,
      'uri': uri,
  })

result

[{'title': "First Mover Asia: What's Next for Bitcoin After Biggest One-Day Price Pop in 2 Months?",
  'uri': 'https://finance.yahoo.com/news/first-mover-asia-whats-next-030145094.html'},
 {'title': "Fed officials see 'eye-popping' inflation data in rearview after December slowdownYahoo Finance",
  'uri': 'https://finance.yahoo.com/news/fed-officials-see-eye-popping-inflation-data-in-rearview-after-december-slowdown-193316112.html'},
 {'title': 'December CPI: Inflation slows for 3rd month, falls 0.1% over last monthYahoo Finance Video',
  'uri': 'https://finance.yahoo.com/video/december-cpi-inflation-slows-3rd-142815795.html'},
 {'title': 'Gold set for weekly gain on smaller U.S. rate-hike bets',
  'uri': 'https://finance.yahoo.com/news/gold-set-weekly-gain-smaller-025545186.html'},
 {'title': 'Gold hovers near $1,900/oz after U.S. inflation data cements Fed slowdown betsReuters',
  'uri': 'https://finance.yahoo.com/news/gold-flat-traders-await-u-025942664.html'},
 {'title': 'GLOBAL MA

## Practice

In [72]:
# request rottentomatoes main page
response = requests.get('https://www.rottentomatoes.com/')
response.status_code # for test
response.text # for test

# Start parsing with BeautifulSoup
soup = BeautifulSoup(response.text, 'lxml')
# find streaming chart <ul> using find_all
uls = soup.find_all('ul', attrs={'slot':'list-items'})
# select first one of uls
streaming_chart = uls[0]
# find all li in streaming_chart
lis = streaming_chart.find_all('li')
len(lis) # for check

# Create empty list
result = []
# for all li in lis
for li in lis:
  # find and store data
  uri = 'https://www.rottentomatos.com' + li.find('a')['href']
  spans = li.find_all('span')
  movie_name = spans[0].text
  movie_score = spans[2].text.strip('\n %')
  #print(uri, movie_name, movie_score) # for test
  result.append({
      'uri': uri,
      'movie_name': movie_name,
      'movie_score': movie_score,
      })

In [73]:
df = df.from_dict(result)
df.to_csv('./rottentomato.csv')

In [74]:
result = pd.read_csv('./rottentomato.csv')
print(result)

   Unnamed: 0                                                uri  \
0           0           https://www.rottentomatos.com/m/the_menu   
1           1  https://www.rottentomatos.com/m/puss_in_boots_...   
2           2  https://www.rottentomatos.com/m/the_pale_blue_eye   
3           3  https://www.rottentomatos.com/m/the_banshees_o...   
4           4      https://www.rottentomatos.com/m/the_fabelmans   
5           5  https://www.rottentomatos.com/m/everything_eve...   
6           6  https://www.rottentomatos.com/m/glass_onion_a_...   
7           7           https://www.rottentomatos.com/m/mars_one   
8           8  https://www.rottentomatos.com/m/the_hatchet_wi...   
9           9      https://www.rottentomatos.com/m/devotion_2022   

                          movie_name movie_score  
0                           The Menu          89  
1       Puss in Boots: The Last Wish          96  
2                  The Pale Blue Eye          64  
3          The Banshees of Inisherin          9

In [75]:
json_result = result.transpose().to_dict()
json_result

{0: {'Unnamed: 0': 0,
  'uri': 'https://www.rottentomatos.com/m/the_menu',
  'movie_name': 'The Menu',
  'movie_score': '89'},
 1: {'Unnamed: 0': 1,
  'uri': 'https://www.rottentomatos.com/m/puss_in_boots_the_last_wish',
  'movie_name': 'Puss in Boots: The Last Wish',
  'movie_score': '96'},
 2: {'Unnamed: 0': 2,
  'uri': 'https://www.rottentomatos.com/m/the_pale_blue_eye',
  'movie_name': 'The Pale Blue Eye',
  'movie_score': '64'},
 3: {'Unnamed: 0': 3,
  'uri': 'https://www.rottentomatos.com/m/the_banshees_of_inisherin',
  'movie_name': 'The Banshees of Inisherin',
  'movie_score': '97'},
 4: {'Unnamed: 0': 4,
  'uri': 'https://www.rottentomatos.com/m/the_fabelmans',
  'movie_name': 'The Fabelmans',
  'movie_score': '92'},
 5: {'Unnamed: 0': 5,
  'uri': 'https://www.rottentomatos.com/m/everything_everywhere_all_at_once',
  'movie_name': 'Everything Everywhere All at Once',
  'movie_score': '95'},
 6: {'Unnamed: 0': 6,
  'uri': 'https://www.rottentomatos.com/m/glass_onion_a_knives_ou

In [76]:
with open('./rottentomato.json', 'w') as f:
  json.dump(json_result, f)

## Error Handling

In [77]:
try:
  number = int(input("숫자를 입력하세요: "))
except ValueError as e:
  print("You should input a number.")
  print(e)

숫자를 입력하세요: hello
You should input a number.
invalid literal for int() with base 10: 'hello'


## Practice

In [78]:
rand_list = [(randint(1, 1000), randint(1, 1000)) for _ in range(100)]

In [79]:
rand_mul_list = [(a, b, a * b)for a, b in rand_list]

In [80]:
df = pd.DataFrame(rand_mul_list)

In [81]:
df.to_csv('./tuple_mul_result.csv')