# DataFrame 생성하기

#### DataFrame 생성하기
 - 데이터 구조를 표현하는 pandas의 기본 객체 중 하나
 - Series가 1차원 배열이라면 DataFrame은 2차원 배열
 - 2차원 배열이기 때문에 인덱스가 row와 column으로 구성됨
   + index 는 각 개별 데이터(row)를 의미
   + column 은 개별 속성을 의미
 - 데이터 분석을 위해 다른 데이터 소스(database, 외부 파일)를 읽어 들여서 DataFrame을 생성하는 것이 일반적
 - Data Analysis, Machine Learning에서 Data 변형을 위해 가장 많이 사용

In [1]:
import pandas as pd

#### dictionary로부터 생성하기
 - dictionary의 key는 column으로 정의됨

In [3]:
# Use scalar values when every row of a column should hold the same value:
# each scalar is repeated for all labels in the given index.
data = dict(a=100, b=200, c=300)
pd.DataFrame(data, index=list('xyz'))

Unnamed: 0,a,b,c
x,100,200,300
y,100,200,300
z,100,200,300


In [4]:
# Use a list per column when the rows of a column should hold different values.
data = {
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [10, 11, 12],
}
pd.DataFrame(data, index=list(range(3)))

Unnamed: 0,a,b,c
0,1,4,10
1,2,5,11
2,3,6,12


#### Series로부터 생성하기
 - 각 Series는 하나의 row가 되고, Series의 인덱스는 column으로 정의됨

In [5]:
# Three Series sharing the same labels; each one becomes a row and the
# labels become the columns of the resulting DataFrame.
a = pd.Series([100, 200, 300], index=list('abc'))
b = pd.Series([101, 201, 301], index=list('abc'))
c = pd.Series([110, 210, 310], index=list('abc'))

# Set the row index explicitly to 100, 101, 102.
pd.DataFrame([a, b, c], index=[100, 101, 102])

Unnamed: 0,a,b,c
100,100,200,300
101,101,201,301
102,110,210,310


In [6]:
# The Series only partly share labels: 'a' and 'b' are common, while
# 'd', 'k' and 'c' each appear in a single Series.
a = pd.Series([100, 200, 300], index=list('abd'))
b = pd.Series([101, 201, 301], index=list('abk'))
c = pd.Series([110, 210, 310], index=list('abc'))

# Every label found in any Series becomes a column; where a Series has no
# value for a label, the cell is shown as NaN.
pd.DataFrame([a, b, c], index=[100, 101, 102])

Unnamed: 0,a,b,d,k,c
100,100.0,200.0,300.0,,
101,101.0,201.0,,301.0,
102,110.0,210.0,,,310.0


#### csv 파일을 읽어 들여 DataFrame 생성하기
 - 데이터 분석을 위해 DataFrame을 생성하는 가장 일반적인 방법
 - 데이터 소스로부터 추출된 csv(comma separated values) 파일로부터 생성
 - read_csv 함수 사용
   + sep - 각 데이터 값을 구별하기 위한 구분자(separator) 설정 
   + header - 파일에 header 행이 없을 경우 None 설정 (header 행이 있는 파일에 None을 주면 그 행도 데이터로 읽힘)
   + index_col - index로 사용할 column 설정
   + usecols - 실제로 DataFrame에 로딩 할 columns만 설정

In [2]:
# Load the 2017 marathon results with read_csv's default settings and
# preview the first five rows.
marathon_2017 = pd.read_csv(
    filepath_or_buffer="../marathon_data/marathon_results_2017.csv",
)
marathon_2017.head(n=5)

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,25K,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division
0,0,11,"Kirui, Geoffrey",24,M,Keringet,,KEN,,,...,1:16:59,1:33:01,1:48:19,2:02:53,0:04:57,-,2:09:37,1,1,1
1,1,17,"Rupp, Galen",30,M,Portland,OR,USA,,,...,1:16:59,1:33:01,1:48:19,2:03:14,0:04:58,-,2:09:58,2,2,2
2,2,23,"Osako, Suguru",25,M,Machida-City,,JPN,,,...,1:17:00,1:33:01,1:48:31,2:03:38,0:04:59,-,2:10:28,3,3,3
3,3,21,"Biwott, Shadrack",32,M,Mammoth Lakes,CA,USA,,,...,1:17:00,1:33:01,1:48:58,2:04:35,0:05:03,-,2:12:08,4,4,4
4,4,9,"Chebet, Wilson",31,M,Marakwet,,KEN,,,...,1:16:59,1:33:01,1:48:41,2:05:00,0:05:04,-,2:12:35,5,5,5


In [3]:
# sep is the field delimiter. header=None tells read_csv the file has no
# header row, so the columns are numbered and the file's own header line is
# loaded as the first data row.
marathon_2017 = pd.read_csv(
    "../marathon_data/marathon_results_2017.csv",
    sep=',',
    header=None,
)
marathon_2017.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,,Bib,Name,Age,M/F,City,State,Country,Citizen,,...,25K,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division
1,0.0,11,"Kirui, Geoffrey",24,M,Keringet,,KEN,,,...,1:16:59,1:33:01,1:48:19,2:02:53,0:04:57,-,2:09:37,1,1,1
2,1.0,17,"Rupp, Galen",30,M,Portland,OR,USA,,,...,1:16:59,1:33:01,1:48:19,2:03:14,0:04:58,-,2:09:58,2,2,2
3,2.0,23,"Osako, Suguru",25,M,Machida-City,,JPN,,,...,1:17:00,1:33:01,1:48:31,2:03:38,0:04:59,-,2:10:28,3,3,3
4,3.0,21,"Biwott, Shadrack",32,M,Mammoth Lakes,CA,USA,,,...,1:17:00,1:33:01,1:48:58,2:04:35,0:05:03,-,2:12:08,4,4,4


In [4]:
# Load only the age, split-time and finish-time columns, and use 'Age' as
# the row index instead of keeping it as a regular column.
marathon_2017 = pd.read_csv(
    "../marathon_data/marathon_results_2017.csv",
    index_col='Age',
    usecols=[
        'Age',
        '5K', '10K', '15K', '20K', 'Half',
        '25K', '30K', '35K', '40K',
        'Official Time',
    ],
)
marathon_2017

Unnamed: 0_level_0,5K,10K,15K,20K,Half,25K,30K,35K,40K,Official Time
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
24,0:15:25,0:30:28,0:45:44,1:01:15,1:04:35,1:16:59,1:33:01,1:48:19,2:02:53,2:09:37
30,0:15:24,0:30:27,0:45:44,1:01:15,1:04:35,1:16:59,1:33:01,1:48:19,2:03:14,2:09:58
25,0:15:25,0:30:29,0:45:44,1:01:16,1:04:36,1:17:00,1:33:01,1:48:31,2:03:38,2:10:28
32,0:15:25,0:30:29,0:45:44,1:01:19,1:04:45,1:17:00,1:33:01,1:48:58,2:04:35,2:12:08
31,0:15:25,0:30:28,0:45:44,1:01:15,1:04:35,1:16:59,1:33:01,1:48:41,2:05:00,2:12:35
...,...,...,...,...,...,...,...,...,...,...
61,0:46:44,1:35:41,2:23:35,3:12:44,3:23:31,4:12:06,5:03:08,5:55:18,6:46:57,7:09:39
25,0:32:03,1:05:33,1:52:17,2:49:41,3:00:26,3:50:19,4:50:01,5:53:48,6:54:21,7:16:59
57,0:53:11,1:43:36,2:32:36,-,3:36:24,4:15:21,5:06:37,6:00:33,6:54:38,7:19:37
64,0:40:34,1:27:19,2:17:17,3:11:40,3:22:30,4:06:10,5:07:09,6:06:07,6:56:08,7:20:44
