# < 판다스 입문 >
---

In [1]:
import pandas as pd

# ▶ dict 이용해 판다스 시리즈 만들기

In [5]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data, as the displayed output shows.
student1 = pd.Series({'Korean':100, "English":80, "Math":90})
student1

Korean     100
English     80
Math        90
dtype: int64

# ▶ Normalization 
- 100점 기준 : 0 ~ 100점을 *0 ~ 1 사이의 값* 으로 변경  


### ```Normalize``` : 원소값 / 최대값(행or열) -> 0~1 사이 값으로 매핑

In [6]:
# Divide every element by 100 (the int64 scores come out as float64 values)
percentage = student1 / 100   # no for-loop needed: / 100 is applied to each element
percentage

Korean     1.0
English    0.8
Math       0.9
dtype: float64

# ▶ Series vs Series 연산

In [14]:
# Two students' scores for the same three subjects; student2 lists the
# subjects in a different order than student1.
student1 = pd.Series({'Korean':100, "English":80, "Math":90})
student2 = pd.Series({'Math':75, "Korean":82, "English":97})

# Show both Series, separated by one empty line.
print(student1, student2, sep="\n\n")

Korean     100
English     80
Math        90
dtype: int64

Math       75
Korean     82
English    97
dtype: int64


In [16]:
# Try the basic arithmetic operations on the two students' per-subject scores.
# Values are matched up by subject label, not by position: student2 lists the
# subjects in a different order, yet English comes out as 80 + 97 = 177.
add = student1 + student2
type(add), add

(pandas.core.series.Series,
 English    177
 Korean     182
 Math       165
 dtype: int64)

In [17]:
# Element-wise subtraction (student1 minus student2), matched by subject label.
sub = student1 - student2
type(sub), sub

(pandas.core.series.Series,
 English   -17
 Korean     18
 Math       15
 dtype: int64)

In [18]:
# Element-wise multiplication, matched by subject label.
mul = student1 * student2
type(mul), mul

(pandas.core.series.Series,
 English    7760
 Korean     8200
 Math       6750
 dtype: int64)

In [19]:
# Element-wise division (student1 divided by student2); the result is float64.
div = student1 / student2
type(div), div

(pandas.core.series.Series,
 English    0.824742
 Korean     1.219512
 Math       1.200000
 dtype: float64)

In [22]:
# Present the results of the four operations as a DataFrame (Series -> DataFrame)
result = pd.DataFrame([add, sub, mul, div],   # each Series is added as one row
                      index=['add', 'sub', 'mul', 'div'])
result

Unnamed: 0,English,Korean,Math
add,177.0,182.0,165.0
sub,-17.0,18.0,15.0
mul,7760.0,8200.0,6750.0
div,0.824742,1.219512,1.2


# ▶ NaN 생성 및 fill_value 옵션 사용

In [23]:
# student1 has all three subjects, but student2 has no 'English' score here.
student1 = pd.Series({'Korean':100, "English":80, "Math":90})
student2 = pd.Series({'Math':75, "Korean":82})

In [24]:
# Add with fill_value=0: the 'English' label missing from student2 is treated
# as 0, so English is computed as 80 + 0 and shows up as 80.0 in the output.
sr_add = student1.add(student2, fill_value=0)
sr_add

English     80.0
Korean     182.0
Math       165.0
dtype: float64

In [25]:
# Same fill_value=0 treatment for subtraction and multiplication.
sr_sub = student1.sub(student2, fill_value=0)
sr_mul = student1.mul(student2, fill_value=0)
# Dividing by the filled-in 0 makes the English quotient inf
# (visible in the 'div' row of the result table below).
sr_div = student1.div(student2, fill_value=0)

In [26]:
# Present the fill_value results as a DataFrame (Series -> DataFrame)
result = pd.DataFrame([sr_add, sr_sub, sr_mul, sr_div],   # each Series is added as one row
                      index=['add', 'sub', 'mul', 'div'])
result

Unnamed: 0,English,Korean,Math
add,80.0,182.0,165.0
sub,80.0,18.0,15.0
mul,0.0,8200.0,6750.0
div,inf,1.219512,1.2


---
# ▶ 데이터 입출력
## ▷ header, index_col 옵션

In [30]:
import os
# Show the current working directory; the relative dataset paths used in the
# following cells are resolved against it.
os.getcwd()

'D:\\Kamie\\mon_ML'

In [32]:
# Location of the sample CSV file, given relative to the working directory.
file_path = '../dataset/학습데이터처리_Data/read_csv_sample.csv'

In [40]:
# Read with default options: the file's first line (c0..c3) is used as the
# column names and the rows get a 0-based integer index.
df1 = pd.read_csv(file_path)
df1

Unnamed: 0,c0,c1,c2,c3
0,0,1,4,7
1,1,2,5,8
2,2,3,6,9


In [37]:
# header=None: no line is treated as a header, so the c0..c3 line becomes the
# first data row and the columns are numbered 0..3.
df2 = pd.read_csv(file_path, header=None)
df2

Unnamed: 0,0,1,2,3
0,c0,c1,c2,c3
1,0,1,4,7
2,1,2,5,8
3,2,3,6,9


In [38]:
# index_col=None: no column is used as the row index; the output matches the
# default read_csv call shown earlier.
df3 = pd.read_csv(file_path, index_col=None)
df3

Unnamed: 0,c0,c1,c2,c3
0,0,1,4,7
1,1,2,5,8
2,2,3,6,9


In [39]:
# index_col='c0': the c0 column becomes the row index, leaving c1..c3 as the
# data columns.
df4 = pd.read_csv(file_path, index_col='c0')
df4

Unnamed: 0_level_0,c1,c2,c3
c0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,4,7
1,2,5,8
2,3,6,9


## ▷ excel -> DataFrame

In [43]:
# Load the Excel workbook into a DataFrame using the openpyxl engine,
# then preview the first five rows.
df_excel = pd.read_excel("../dataset/학습데이터처리_Data/남북한발전전력량.xlsx",
                        engine='openpyxl')
df_excel.head()

Unnamed: 0,전력량 (억㎾h),발전 전력별,1990,1991,1992,1993,1994,1995,1996,1997,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,남한,합계,1077,1186,1310,1444,1650,1847,2055,2244,...,4031,4224,4336,4747,4969,5096,5171,5220,5281,5404
1,,수력,64,51,49,60,41,55,52,54,...,50,56,56,65,78,77,84,78,58,66
2,,화력,484,573,696,803,1022,1122,1264,1420,...,2551,2658,2802,3196,3343,3430,3581,3427,3402,3523
3,,원자력,529,563,565,581,587,670,739,771,...,1429,1510,1478,1486,1547,1503,1388,1564,1648,1620
4,,신재생,-,-,-,-,-,-,-,-,...,-,-,-,-,-,86,118,151,173,195


In [44]:
# Load the JSON sample into a DataFrame; in the output the library names
# (pandas, NumPy, matplotlib) appear as the row labels.
df_json = pd.read_json("../dataset/학습데이터처리_Data/read_json_sample.json")
df_json

Unnamed: 0,name,year,developer,opensource
pandas,,2008,Wes Mckinneye,True
NumPy,,2006,Travis Oliphant,True
matplotlib,,2003,John D. Hunter,True


## ▷ url, html -> DataFrame

In [46]:
# Parse the HTML file; the result is a list holding one DataFrame per table
# found in the document (two in this sample).
df_tables = pd.read_html("../dataset/학습데이터처리_Data/sample.html")
df_tables

[   Unnamed: 0  c0  c1  c2  c3
 0           0   0   1   4   7
 1           1   1   2   5   8
 2           2   2   3   6   9,
          name  year        developer  opensource
 0       NumPy  2006  Travis Oliphant        True
 1  matplotlib  2003   John D. Hunter        True
 2      pandas  2008    Wes Mckinneye        True]

In [47]:
# Check how many tables were found in the HTML that was read
len(df_tables)

2

In [52]:
# Walk through every table parsed from the HTML file and print it under a
# numbered banner. enumerate() yields the position and the DataFrame together,
# so df_tables does not have to be indexed by hand.
for i, table in enumerate(df_tables):
    print(f"======== tables {i} ========")
    print(table)
    print()

   Unnamed: 0  c0  c1  c2  c3
0           0   0   1   4   7
1           1   1   2   5   8
2           2   2   3   6   9

         name  year        developer  opensource
0       NumPy  2006  Travis Oliphant        True
1  matplotlib  2003   John D. Hunter        True
2      pandas  2008    Wes Mckinneye        True



# ▶ set_index()로 특정 컬럼을 index로 설정
## ▷ inplace=True로 원본 DataFrame 변경

In [62]:
# Column-oriented sample data: each key is a column name and each list holds
# that column's values (three rows per column).
data1 = {
    'name' : ['Kim', 'Lee', 'Park'],
    'algo' : ['A', 'A+', 'B'],
    'python' : ['A+', 'A', 'B+']
}

# A second, purely numeric sample with the same layout.
data2 = {
    'c0' : [1, 2, 3],
    'c1' : [4, 5, 6],
    'c2' : [1, 4, 7]
}

In [63]:
# Build a DataFrame from data1; the rows get the default 0..2 integer index.
df1 = pd.DataFrame(data1)
df1

Unnamed: 0,name,algo,python
0,Kim,A,A+
1,Lee,A+,A
2,Park,B,B+


In [64]:
# Make the 'name' column the row index. inplace=True changes df1 itself
# rather than producing a new DataFrame.
df1.set_index('name', inplace=True)
df1

Unnamed: 0_level_0,algo,python
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kim,A,A+
Lee,A+,A
Park,B,B+


In [65]:
# Build a DataFrame from data2; the rows get the default 0..2 integer index.
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,c0,c1,c2
0,1,4,1
1,2,5,4
2,3,6,7


In [66]:
# Make the 'c0' column the row index, modifying df2 in place.
df2.set_index('c0', inplace=True)
df2

Unnamed: 0_level_0,c1,c2
c0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,1
2,5,4
3,6,7


# ▶ DataFrame을 엑셀 sheet로 생성하기

In [67]:
# Open an Excel writer that targets an .xlsx file in the current directory.
writer = pd.ExcelWriter("./220405_df_excelwriter.xlsx")

In [68]:
# Write each DataFrame to its own worksheet through the shared writer.
df1.to_excel(writer, sheet_name="sheet1")
df2.to_excel(writer, sheet_name="sheet2")

In [69]:
# Flush both sheets to disk and release the file handle. ExcelWriter.save()
# was deprecated in pandas 1.5 and removed in pandas 2.0; close() is the
# supported way to finish writing and also works on older versions.
writer.close()

# ▶ describe() : 데이터 살펴보기
- 옵션
    - include='all'
    - include=['object']

In [80]:
# Display the DataFrame that was loaded from the JSON sample earlier.
df_json

Unnamed: 0,name,year,developer,opensource
pandas,,2008,Wes Mckinneye,True
NumPy,,2006,Travis Oliphant,True
matplotlib,,2003,John D. Hunter,True


In [79]:
# Summarize every column, numeric or not (include='all'); statistics that do
# not apply to a column's type are left empty in the output below.
df_json.describe(include='all')

Unnamed: 0,name,year,developer,opensource
count,3.0,3.0,3,3
unique,1.0,,3,1
top,,,Wes Mckinneye,True
freq,3.0,,1,3
mean,,2005.666667,,
std,,2.516611,,
min,,2003.0,,
25%,,2004.5,,
50%,,2006.0,,
75%,,2007.0,,


In [82]:
df_json.count()
# Counts only the valid (non-missing) values
# Returns the number of entries in each column as a Series

name          3
year          3
developer     3
opensource    3
dtype: int64

In [86]:
df_json['name'].value_counts()
# Counts how many times each distinct value occurs in the 'name' column; the result is a Series
# Option: dropna=True (the pandas default) leaves NaN out of the counts

# With dropna=False, all NaN entries are grouped together and counted as a single NaN entry