### DataFrame Load & Save

In [3]:
import numpy as np
import pandas as pd

#####  read_csv

In [6]:
# Load the user records from the CSV file, then print the column labels
# followed by the frame itself (one print call, newline-separated).
df = pd.read_csv('data/users.csv')
print(df.columns, df, sep='\n')

Index(['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address'], dtype='object')
       id first_name  last_name                             email   gender  \
0       1     Norbie   Wrassell              nwrassell0@bbc.co.uk     Male   
1       2     Tybalt   Covendon          tcovendon1@indiegogo.com     Male   
2       3       Elyn       Volk                  evolk2@umich.edu   Female   
3       4       Todd   Carriage              tcarriage3@youku.com     Male   
4       5      Pryce   Brookzie               pbrookzie4@dmoz.org  Agender   
..    ...        ...        ...                               ...      ...   
995   996     Allsun   McTeague         amcteaguern@moonfruit.com   Female   
996   997       Viva     Epinoy                vepinoyro@cnbc.com   Female   
997   998      Heall    Mallett              hmallettrp@google.it     Male   
998   999     Conroy  Scandrett          cscandrettrq@cornell.edu     Male   
999  1000     Romain  Bellfield  rbellfieldrr@acquir

In [8]:
# The file extension does not matter: as long as the content is CSV data,
# read_csv can load it.
df = pd.read_csv(filepath_or_buffer='data/users.txt')
print(df)

       id first_name  last_name                             email   gender  \
0       1     Norbie   Wrassell              nwrassell0@bbc.co.uk     Male   
1       2     Tybalt   Covendon          tcovendon1@indiegogo.com     Male   
2       3       Elyn       Volk                  evolk2@umich.edu   Female   
3       4       Todd   Carriage              tcarriage3@youku.com     Male   
4       5      Pryce   Brookzie               pbrookzie4@dmoz.org  Agender   
..    ...        ...        ...                               ...      ...   
995   996     Allsun   McTeague         amcteaguern@moonfruit.com   Female   
996   997       Viva     Epinoy                vepinoyro@cnbc.com   Female   
997   998      Heall    Mallett              hmallettrp@google.it     Male   
998   999     Conroy  Scandrett          cscandrettrq@cornell.edu     Male   
999  1000     Romain  Bellfield  rbellfieldrr@acquirethisname.com     Male   

          ip_address  
0    201.234.222.219  
1      90.113.205

In [10]:
# Even when the delimiter is a character other than a comma, read_csv can
# handle the file if the delimiter is passed through the sep argument.
# NOTE(review): the output recorded below this cell reports
# "[1000 rows x 1 columns]" with the \t markers still inside the values, so
# the fields were not split in that run. Presumably the file stores a literal
# backslash-t instead of a real tab character, or the output is stale —
# confirm against data/users.tsv.
df = pd.read_csv('data/users.tsv', sep="\t")
print(df)

    id\tfirst_name\tlast_name\temail\tgender\tip_address
0    1\tNorbie\tWrassell\tnwrassell0@bbc.co.uk\tMal...  
1    2\tTybalt\tCovendon\ttcovendon1@indiegogo.com\...  
2    3\tElyn\tVolk\tevolk2@umich.edu\tFemale\t254.1...  
3    4\tTodd\tCarriage\ttcarriage3@youku.com\tMale\...  
4    5\tPryce\tBrookzie\tpbrookzie4@dmoz.org\tAgend...  
..                                                 ...  
995  996\tAllsun\tMcTeague\tamcteaguern@moonfruit.c...  
996  997\tViva\tEpinoy\tvepinoyro@cnbc.com\tFemale\...  
997  998\tHeall\tMallett\thmallettrp@google.it\tMal...  
998  999\tConroy\tScandrett\tcscandrettrq@cornell.e...  
999  1000\tRomain\tBellfield\trbellfieldrr@acquiret...  

[1000 rows x 1 columns]


In [13]:
# This file has no column-label row on its first line, so the labels are
# supplied through names; header=None keeps the first data row from being
# used as the header.
df = pd.read_csv(
    'data/users_headless.csv',
    names=[
        'id',
        'first_name',
        'last_name',
        'email',
        'gender',
        'ip_address',
    ],
    header=None,
)
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Norbie,Wrassell,nwrassell0@bbc.co.uk,Male,201.234.222.219
1,2,Tybalt,Covendon,tcovendon1@indiegogo.com,Male,90.113.205.35
2,3,Elyn,Volk,evolk2@umich.edu,Female,254.131.69.38
3,4,Todd,Carriage,tcarriage3@youku.com,Male,242.31.116.66
4,5,Pryce,Brookzie,pbrookzie4@dmoz.org,Agender,212.63.28.128
...,...,...,...,...,...,...
995,996,Allsun,McTeague,amcteaguern@moonfruit.com,Female,134.34.119.166
996,997,Viva,Epinoy,vepinoyro@cnbc.com,Female,93.2.144.250
997,998,Heall,Mallett,hmallettrp@google.it,Male,254.24.191.245
998,999,Conroy,Scandrett,cscandrettrq@cornell.edu,Male,150.254.219.221


In [14]:
# Summarise the frame: index range, per-column non-null counts and dtypes,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          1000 non-null   int64 
 1   first_name  1000 non-null   object
 2   last_name   1000 non-null   object
 3   email       1000 non-null   object
 4   gender      1000 non-null   object
 5   ip_address  1000 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


---

##### to_csv

In [15]:
# Save with the default settings spelled out: both the index and the header
# row are written (index=True, header=True are the defaults).
df.to_csv('data/users2.csv', index=True, header=True)

In [16]:
# Save again to the same path, this time leaving the index column out;
# the header row is still written.
df.to_csv(path_or_buf='data/users2.csv', index=False)

In [17]:
# Save once more to the same path with neither the header row nor the index
# column, i.e. data rows only.
df.to_csv(path_or_buf='data/users2.csv', header=False, index=False)

---

##### HTML 문서 내 table Load

- 정적 웹 페이지 내 table 태그 부분만 추출해 DataFrame 객체로 변환 가능
- parsing 처리를 위한 lxml 패키지 필요

In [None]:
# Uncomment and run once if lxml is not installed in the kernel's environment
# (the %pip magic targets the running kernel):
# %pip install lxml

In [19]:
# Parse the <table> elements of the Korean Wikipedia page given by the URL;
# the result is indexable, and its length is the number of tables found.
tables = pd.read_html(
    io='https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%98%81%ED%99%94_%ED%9D%A5%ED%96%89_%EA%B8%B0%EB%A1%9D'
)
len(tables)

7

In [20]:
# Take the first table parsed from the page as the working DataFrame and
# display it.
movie_df = tables[0]
movie_df

Unnamed: 0,순위,제목,감독,한국내 배급사,개봉일,관객수,기타
0,1,《명량》,김한민,CJ E&M,2014-07-30,17616141,영화진흥위원회 발권통계 기준
1,2,《극한직업》,이병헌,CJ엔터테인먼트,2019-01-23,16266480,영화진흥위원회 발권통계 기준
2,3,《신과함께: 죄와 벌》,김용화,롯데엔터테인먼트,2017-12-20,14414658,영화진흥위원회 발권통계 기준
3,4,《국제시장》,윤제균,CJ E&M,2014-12-17,14265222,영화진흥위원회 발권통계 기준
4,5,《어벤져스: 엔드게임》,루소 형제,월트디즈니컴퍼니코리아,2019-04-24,13977602,영화진흥위원회 발권통계 기준
...,...,...,...,...,...,...,...
62,63,《베를린》,류승완,CJ E&M,2013-01-30,7166688,영화진흥위원회 발권통계 기준
63,64,《마스터》,조의석,CJ E&M,2016-12-21,7150586,영화진흥위원회 발권통계 기준
64,65,《터널》,김성훈,쇼박스,2016-08-10,7120780,영화진흥위원회 발권통계 기준
65,66,《어벤져스》,조스 휘던,월트디즈니컴퍼니코리아,2012-04-26,7087971,영화진흥위원회 발권통계 기준


In [21]:
# Inspect the column dtypes and non-null counts of the scraped table.
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   순위       67 non-null     int64 
 1   제목       67 non-null     object
 2   감독       67 non-null     object
 3   한국내 배급사  67 non-null     object
 4   개봉일      67 non-null     object
 5   관객수      67 non-null     int64 
 6   기타       67 non-null     object
dtypes: int64(2), object(5)
memory usage: 3.8+ KB


In [22]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
movie_df.describe()

Unnamed: 0,순위,관객수
count,67.0,67.0
mean,34.0,10108980.0
std,19.485037,2598352.0
min,1.0,7051660.0
25%,17.5,7667830.0
50%,34.0,9427588.0
75%,50.5,12290420.0
max,67.0,17616140.0


In [25]:
# Keep only the rows whose '감독' (director) column equals '봉준호'.
movie_df.loc[movie_df['감독'].eq('봉준호')]

Unnamed: 0,순위,제목,감독,한국내 배급사,개봉일,관객수,기타
9,10,《괴물》,봉준호,쇼박스,2006-07-27,13019740,영화진흥위원회 공식통계 기준
30,31,《기생충》,봉준호,CJ엔터테인먼트,2019-05-30,10313735,영화진흥위원회 발권통계 기준
34,35,《설국열차》,봉준호,CJ E&M,2013-08-01,9354547,영화진흥위원회 발권통계 기준


---

##### [실습] titanic 승객 데이터 활용