# Python 주요 패키지 소개

## numpy

In [None]:
import numpy as np

### 1차원 벡터 생성하기

In [None]:
ar1 = np.array([1, 2, 3])
ar1

In [None]:
type(ar1)

In [None]:
dir(ar1)

In [None]:
ar1.ndim

In [None]:
ar1.shape

In [None]:
np.arange(11)

In [None]:
np.arange(1, 11)

In [None]:
np.arange(1, 21, 2)

### 벡터 원소의 형 변환

In [None]:
a = [1, 2, 3.0]

In [None]:
ar1 = np.array(a)
ar1

In [None]:
ar1.astype(int)

In [None]:
ar1.astype(str)

### 벡터의 인덱싱

In [None]:
ar1 = np.arange(1, 12, 2)
ar1

In [None]:
ar1[1]

In [None]:
ar1[-1]

### 벡터의 슬라이싱

In [None]:
ar1[:2]

In [None]:
ar1[2:]

In [None]:
ar1[:]

In [None]:
ar1[[5, 0, 3]]

### 벡터의 원소 정렬하기

In [None]:
import random

# Seed the generator so the sampled list is reproducible across runs.
random.seed(1)
# Draw 10 integers from 1..10 with replacement.
nums = random.choices(range(1, 11), k=10)
nums

In [None]:
ar1 = np.array(nums)

In [None]:
np.sort(ar1)

In [None]:
ar1.sort()
ar1

### 벡터의 덧셈, 뺄셈 연산

In [None]:
a = np.array([1, 1])
b = np.array([-1, 1])

In [None]:
a + b

In [None]:
b + a

In [None]:
a + 1

In [None]:
a - b

In [None]:
c = np.array([2, 2])

In [None]:
(a + b) + c

In [None]:
a + (b + c)

In [None]:
[1, 1] + [-1, 1]

### 벡터의 스칼라배 연산

In [None]:
a * 0

In [None]:
b * 2

In [None]:
a * b

In [None]:
a / b

In [None]:
(a + b) * 2

In [None]:
a * 2 + b * 2

In [None]:
a * (2 + 3)

In [None]:
a * 2 + a * 3

### 벡터의 내적

In [None]:
sum(a*b)

In [None]:
sum(a*a)**(1/2)

In [None]:
def cosineSim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The similarity is the dot product divided by the product of the two
    Euclidean norms: 1.0 for vectors pointing the same way, 0.0 for
    orthogonal vectors, -1.0 for opposite vectors.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers with the same length as ``a``.

    Returns:
        The cosine similarity as a NumPy float. If either vector has zero
        norm the division is 0/0 and the result is ``nan``.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # The previous version returned 1 - similarity, which is the cosine
    # *distance*, not the similarity the name promises.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [None]:
cosineSim(a, b)

### 2차원 행렬 생성하기

In [None]:
ar2 = np.array([[1, 2, 3], [4, 5, 6]])
ar2

In [None]:
type(ar2)

In [None]:
ar2.ndim

In [None]:
ar2.shape

In [None]:
np.zeros([2, 3])

In [None]:
np.zeros([2, 3], dtype = 'int')

In [None]:
np.ones([3, 2], dtype = 'str')

In [None]:
np.eye(3)

In [None]:
np.eye(3, dtype = 'int')

### 행렬의 인덱싱

In [None]:
ar1 = np.arange(1, 12, 2)

In [None]:
ar2 = ar1.reshape(2, 3)
ar2

In [None]:
ar2[0, 0]

In [None]:
ar2[0, 1]

In [None]:
ar2[-1, -1]

### 행렬의 슬라이싱

In [None]:
ar2[0:1, 0:1]

In [None]:
ar2[:, 1:3]

In [None]:
ar2[:, :]

In [None]:
ar2[[1, 0], :]

In [None]:
ar2[:, [2, 0, 1]]

### ndarray 객체의 차원 변환

In [None]:
ar1 = np.arange(12)
ar1

In [None]:
ar1.reshape(2, 6)

In [None]:
ar1.reshape(3, 4)

In [None]:
ar1.reshape(3, 5)  # Raises ValueError: the 12 elements of ar1 cannot fill a 3 x 5 array.

In [None]:
ar2 = ar1.reshape(4, 3)
ar2

In [None]:
ar2 = ar1.reshape(4, 3, order = 'F')
ar2

In [None]:
ar2.reshape(6, -1)

In [None]:
ar2.reshape(-1, 2)

### 행렬의 덧셈, 뺄셈 연산

In [None]:
A = np.array([[1, 2], [3, 4]])
A

In [None]:
B = np.array([[1, 3], [2, 4]])
B

In [None]:
A + B

In [None]:
B + A

In [None]:
A + 1

In [None]:
A - B

In [None]:
C = np.ones([2, 2])

In [None]:
(A + B) + C

In [None]:
A + (B + C)

### 행렬의 스칼라배 연산

In [None]:
A * 0

In [None]:
B * 2

In [None]:
A / 2

In [None]:
B / 3

In [None]:
A * 2 + B * 2

In [None]:
(A + B) * 2

In [None]:
A * (2 + 3)

In [None]:
A * 2 + A * 3

### 행렬의 곱셈 연산

In [None]:
np.matmul(A, B)

In [None]:
np.matmul(np.matmul(A, B), A)

In [None]:
# Wrap A and B as np.matrix objects so that `*` in the following cells means
# matrix multiplication rather than element-wise multiplication.
# NOTE(review): NumPy's documentation no longer recommends np.matrix; on plain
# ndarrays `A @ B` (or np.matmul, as used above) gives the same product.
C = np.matrix(A)
D = np.matrix(B)

In [None]:
C * D

In [None]:
C * D * C

### 행렬식 실습

In [None]:
import numpy.linalg as lin

In [None]:
lin.det(A)

In [None]:
lin.det(B)

In [None]:
M = np.array([[1, 2], [2, 4]])

In [None]:
lin.det(M)

### 역행렬 실습

In [None]:
AI = lin.inv(A)
type(AI)

In [None]:
np.matmul(A, AI)

In [None]:
np.matmul(AI, A)

In [None]:
CI = lin.inv(C)
type(CI)

In [None]:
C * CI

## pandas

In [None]:
import pandas as pd

### 1차원 시리즈 객체 생성하기

In [None]:
sr = pd.Series([1, 3, 5, 7, 9])
sr

In [None]:
type(sr)

In [None]:
sr.shape

In [None]:
sr.values

In [None]:
sr.index

In [None]:
sr.index = ['a', 'b', 'c', 'd', 'e']
sr

In [None]:
sr = pd.Series(data = [1, 3, 5, 7, 9], 
               index = ['a', 'b', 'c', 'd', 'e'])
sr

In [None]:
sr = pd.Series({'a':1, 'b':3, 'c':5, 'd':7, 'e':9})
sr

### 시리즈 객체의 인덱싱과 슬라이싱

In [None]:
# Positional access: use .iloc explicitly. Plain sr[0] on a string-labelled
# Series relies on a positional fallback that recent pandas has deprecated.
sr.iloc[0]

In [None]:
# A list of positions returns a Series (not a scalar); .iloc makes the
# positional intent explicit instead of relying on the deprecated fallback.
sr.iloc[[0]]

In [None]:
# Select the 2nd and 3rd elements by position; .iloc avoids the deprecated
# integer-key fallback of [] on a string-labelled Series.
sr.iloc[[1, 2]]

In [None]:
sr[1:3]

In [None]:
sr['a']

In [None]:
sr[['b', 'c']]

In [None]:
sr[sr >= 3]

In [None]:
sr = pd.Series(data = [1, 3, 5, 7, 9], index = range(1, 6))
sr

In [None]:
# The index is now the integers 1..5, so [] looks up labels rather than
# positions; 0 is not one of the labels, so this raises KeyError.
sr[[0]]

In [None]:
sr[[1]]

### 시리즈 객체의 원소 변경

In [None]:
sr + 1

In [None]:
sr * 2

In [None]:
# Strings cannot be stored in an integer Series, so switch to object dtype
# explicitly before assigning them; relying on an implicit upcast during item
# assignment is deprecated in recent pandas.
sr = sr.astype(object)
sr[sr >= 5] = ['가', '나', '다']
sr

In [None]:
type(sr[1])

In [None]:
type(sr[3])

In [None]:
sr[sr == '다'] = np.nan
sr

In [None]:
sr.fillna(0)

In [None]:
sr.dropna()

In [None]:
nr = pd.Series('라', index = ['f'])

In [None]:
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# add the rows of nr to the end of sr.
sr = pd.concat([sr, nr])
sr

### 2차원 데이터프레임 객체 생성하기

In [None]:
df = pd.DataFrame(data = [[85, 79, 92], [57, 76, 69], [98, 89, 74]], 
                  index = [1, 2, 3], 
                  columns = ['A반', 'B반', 'C반'])
df

In [None]:
df = pd.DataFrame(data = {'A반':[85, 79, 92], 
                          'B반':[57, 76, 69], 
                          'C반':[98, 89, 74]},
                  index = [1, 2, 3])
df

In [None]:
df = pd.DataFrame(data = [{'A반': 85, 'B반': 57, 'C반': 98},
                          {'A반': 79, 'B반': 76, 'C반': 89},
                          {'A반': 92, 'B반': 69, 'C반': 74}],
                  index = [1, 2, 3])
df

### 외부 파일을 데이터프레임으로 불러오기

In [None]:
df = pd.read_csv('https://bit.ly/2019_KBO_hitters')
df

In [None]:
type(df)

In [None]:
df.head(5)

In [None]:
df.tail(5)

### [참고] xlsx 파일 입출력

In [None]:
df.to_excel(excel_writer = '2019_KBO.xlsx', 
            sheet_name = '2019_KBO', 
            index = None)

In [None]:
df1 = pd.read_excel('2019_KBO.xlsx')
df1

### [참고] csv, txt 파일 입출력

In [None]:
df.to_csv('2019_KBO.csv', index = None)

In [None]:
df2 = pd.read_csv('2019_KBO.csv')
df2

In [None]:
df.to_csv('2019_KBO.txt', sep = '\t', index = None) 

In [None]:
df3 = pd.read_csv('2019_KBO.txt', sep = '\t')
df3

### 데이터프레임 객체 다루기

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.index

In [None]:
df.dtypes

### 데이터프레임 다루기 : 인덱스로 일부 확인

In [None]:
df.iloc[0]

In [None]:
df.iloc[[0]]

In [None]:
df.iloc[0:10]

In [None]:
df.iloc[0:10, : ]

In [None]:
df.iloc[0:10, 0:12]

In [None]:
df.iloc[[1, 10], [0, 6, 3]]

In [None]:
# Renumber rows starting at 1. Derive the upper bound from the frame itself
# rather than hard-coding the row count, so the cell keeps working if the
# downloaded file changes size.
df.index = range(1, len(df) + 1)
df

In [None]:
df.loc[1]

In [None]:
df.loc[[1]]

In [None]:
df.loc[1:10]

In [None]:
df.loc[1:10, '선수명':'도루']

In [None]:
df.loc[[1, 10], ['선수명', '홈런', '타석']]

### 데이터프레임 묶기 : 행 기준으로 합치기

In [None]:
df1 = df.loc[1:5, '선수명':'도루']
df1

In [None]:
df2 = df.loc[6:10, '선수명':'도루']
df2

In [None]:
df3 = pd.concat([df1, df2], axis = 0)
df3

### 데이터프레임 묶기 : 열 기준으로 합치기

In [None]:
df1 = df.loc[1:10, '선수명':'안타']
df1

In [None]:
df2 = df.loc[1:10, '홈런':'도루']
df2

In [None]:
df3 = pd.concat([df1, df2], axis = 1)
df3

### 데이터프레임 병합(merge)

In [None]:
df1 = df.loc[1:22:3, '선수명':'경기']
df1

In [None]:
df2 = df.loc[1:15:2, ['선수명', '타수', '안타']]
df2

In [None]:
pd.merge(df1, df2)

In [None]:
pd.merge(df1, df2, how = 'outer')

In [None]:
pd.merge(df1, df2, how = 'left')

In [None]:
pd.merge(df1, df2, how = 'right')

### 데이터프레임 객체 전처리 : 일부 컬럼명 변경

In [None]:
df1.columns = ['이름', '팀명', '경기']
df1

In [None]:
df1 = df1.rename(columns = {'이름':'선수이름'})
df1

In [None]:
pd.merge(df1, df2, left_on = '선수이름', right_on = '선수명')

### 데이터프레임 객체 전처리 : 일부 컬럼만 선택

In [None]:
df['팀명']

In [None]:
df.loc[:, '팀명']

In [None]:
df[['팀명']]

In [None]:
df.loc[:, ['팀명']]

In [None]:
df[['선수명', '팀명']]

In [None]:
df.loc[:, ['선수명', '팀명']]

### 데이터프레임 객체 전처리 : 일부 컬럼만 삭제

In [None]:
# Return a copy without every row after the first 100 (df itself is not
# modified). Taking the labels from df.index avoids hard-coding the last row
# label.
df.drop(df.index[100:])

In [None]:
df.drop(['BABIP'], axis = 1)

In [None]:
df

In [None]:
df.drop(['BABIP'], axis = 1, inplace = True)

In [None]:
df

In [None]:
df.drop(['wOBA', 'WAR'], axis = 1, inplace = True)

### 데이터프레임 객체 전처리 : 컬럼별 타입 변환

In [None]:
df['팀명'] = pd.Categorical(df['팀명'])

In [None]:
df['팀명']

In [None]:
df['팀명'].value_counts()

In [None]:
df = df.astype({'경기':str, '타석':float})
df.dtypes

In [None]:
df = df.astype({'경기':int, '타석':int})
df.dtypes

In [None]:
# Convert every column from '경기' through '도루' to str. Use astype and rebind
# df: assigning the converted frame back through df.loc[:, ...] = ... writes
# values into the existing columns and does not reliably change their dtypes
# on recent pandas.
df = df.astype({col: str for col in df.loc[:, '경기':'도루'].columns})
df.dtypes

In [None]:
# Convert every column from '경기' through '도루' to float. astype + rebind
# guarantees the dtype change; writing through df.loc[:, ...] = ... may keep
# the columns' previous dtype on recent pandas.
df = df.astype({col: float for col in df.loc[:, '경기':'도루'].columns})
df.dtypes

In [None]:
# Convert every column from '경기' through '도루' to int. astype + rebind
# guarantees the dtype change; writing through df.loc[:, ...] = ... may keep
# the columns' previous dtype on recent pandas.
df = df.astype({col: int for col in df.loc[:, '경기':'도루'].columns})
df.dtypes

### 데이터프레임 객체 전처리 : 결측값 제거

In [None]:
df.isna()

In [None]:
df.isna().sum()

In [None]:
df = df.dropna()

In [None]:
df.info()

### 데이터프레임 객체 전처리 : 컬럼 간 연산

In [None]:
df['타수*'] = df['타석'] - df['타수']
df

In [None]:
df['타율*'] = np.round(df['안타'] / df['타수'], 3)
df

In [None]:
df['OPS*'] = df['출루율'] + df['장타율']
df

In [None]:
df = df.loc[ : , '선수명':'OPS']
df

### 데이터프레임 객체 전처리 : 조건을 만족하는 행 선택

In [None]:
# Keep only rows whose plate appearances ('타석') reach 144 * 3.1 (= 446.4).
# NOTE(review): presumably the qualifying threshold for a 144-game season
# (3.1 plate appearances per game) — confirm.
df = df[df['타석'] >= 144 * 3.1]
df

In [None]:
df.shape

In [None]:
df.reset_index()

In [None]:
df = df.reset_index(drop = True)
df

In [None]:
# Renumber the filtered rows starting at 1. Use len(df) instead of a
# hard-coded count so the cell does not fail with a length mismatch when the
# filter keeps a different number of rows.
df.index = range(1, len(df) + 1)
df

In [None]:
df.sort_values(by = '안타', ascending = False).head(5)

In [None]:
df.sort_values(by = ['홈런', '삼진'], ascending = False)

In [None]:
df.sort_values(by = ['홈런', '삼진'], ascending = [False, True])

In [None]:
df[(df['홈런'] >= 20) & (df['OPS'] >= 0.9)]

In [None]:
df[(df['홈런'] >= 20) & (df['도루'] >= 20)]

## 기술통계

In [None]:
# Column means. df still holds text/categorical columns ('선수명', '팀명');
# pandas 2.x raises TypeError on them unless numeric_only=True is passed.
df.mean(numeric_only=True)

In [None]:
df['타율'].mean()

In [None]:
from scipy import stats

In [None]:
stats.trim_mean(df['타율'], 0.1)

In [None]:
stats.trim_mean(df.loc[:, '경기':'OPS'], 0.1)

In [None]:
# Column medians. df still holds text/categorical columns ('선수명', '팀명');
# pandas 2.x raises TypeError on them unless numeric_only=True is passed.
df.median(numeric_only=True)

In [None]:
df['타율'].median()

In [None]:
df['타율'].min()

In [None]:
df['타율'].max()

In [None]:
df['타율'].max() - df['타율'].min()

In [None]:
df['타율'].quantile(0.90)

In [None]:
df['타율'].quantile([0.10, 0.90])

In [None]:
df['타율'].quantile(np.arange(0, 1.1, 0.1))

In [None]:
df['타율'].quantile(np.arange(0, 1.1, 0.25))

In [None]:
df['타율'].quantile(0.25)

In [None]:
df['타율'].quantile(0.75)

In [None]:
df['타율'].quantile(0.75) - df['타율'].quantile(0.25)

In [None]:
df['타율'].var()

In [None]:
df['타율'].std()

In [None]:
df['타율'].skew()

In [None]:
df['타율'].kurtosis()

In [None]:
# Covariance matrix of the numeric columns only. df also holds text and
# categorical columns, which pandas 2.x refuses to convert and raises on.
df.select_dtypes(include='number').cov()

In [None]:
df['안타'].cov(df['홈런'])

In [None]:
# Correlation matrix of the numeric columns only. df also holds text and
# categorical columns, which pandas 2.x refuses to convert and raises on.
df.select_dtypes(include='number').corr()

In [None]:
df['안타'].corr(df['홈런'])

### [참고] 데이터 표준화

In [None]:
scaled = stats.zscore(df['안타'])

In [None]:
scaled.mean()

In [None]:
scaled.std()

### 도수분포표

In [None]:
df['타율'].describe()

In [None]:
bins = np.arange(0.20, 0.40, 0.02)
bins

In [None]:
freq, bins = np.histogram(df['타율'], bins)
freq

In [None]:
freq / len(df['타율'])

### 가상의 키(신장) 데이터 생성

In [None]:
np.random.seed(seed = 1234)

In [None]:
heights = np.random.normal(loc = 172.4, scale = 5.7, size = 10000)

In [None]:
type(heights)

In [None]:
heights = pd.Series(heights)

## dfply

In [None]:
# !pip install dfply

In [None]:
# Import only the dfply names this notebook uses; a wildcard import would
# also pull in many unrelated names that could shadow existing ones.
from dfply import X, filter_by, group_by, mutate, select, summarize

### 특정 조건을 만족하는 행과 일부 컬럼 선택

In [None]:
df1 = pd.read_excel(io = '2019_KBO.xlsx')

In [None]:
df2 = (df1 >> 
       filter_by(X.타석 >= 1) >> 
       select(X.팀명, X.타수, X.안타, X.홈런))
df2.shape

### 집계함수로 팀타율 데이터 생성

In [None]:
teamStat1 = (df2 >>
             group_by(X.팀명) >>
             summarize(팀타수 = X.타수.sum(),
                       팀안타 = X.안타.sum()) >>
             mutate(팀타율 = np.round(X.팀안타 / X.팀타수, 3)))

In [None]:
teamStat1.sort_values(by = '팀타율', ascending = False, inplace = True)
teamStat1

### [참고] pandas 함수로 팀타율 데이터 생성

In [None]:
# Team batting average = team hits / team at-bats, rounded to 3 decimals.
# Select the column before aggregating so only that column is summed,
# instead of summing every column and then picking one.
teamHits = df2.groupby('팀명')['안타'].sum()
teamAbs = df2.groupby('팀명')['타수'].sum()
teamHitAvg = np.round(teamHits / teamAbs, 3).sort_values(ascending=False)
teamHitAvg

### 집계함수로 팀별 인당 평균 홈런 데이터 생성

In [None]:
teamStat2 = (df2 >>
             group_by(X.팀명) >>
             summarize(팀홈런 = X.홈런.sum(),
                       팀인원 = X.홈런.count()) >>
             mutate(인당홈런 = np.round(X.팀홈런 / X.팀인원, 2)))

In [None]:
teamStat2.sort_values(by = '인당홈런', ascending = False, inplace = True)
teamStat2

### [참고] pandas 함수로 팀별 인당 평균 홈런 데이터 생성

In [None]:
# Home runs per player for each team, rounded to 2 decimals.
# Select the column before aggregating so only '홈런' is summed/counted,
# instead of aggregating every column and then picking one.
teamHrs = df2.groupby('팀명')['홈런'].sum()
teamCnt = df2.groupby('팀명')['홈런'].count()
teamHrAvg = np.round(teamHrs / teamCnt, 2).sort_values(ascending=False)
teamHrAvg

## matplotlib

### 히스토그램

In [None]:
import matplotlib.pyplot as plt
from matplotlib import font_manager

In [None]:
font_manager.findSystemFonts(fontext = 'ttf')

In [None]:
import platform

# Pick a Korean-capable font for the current OS so Hangul titles and labels
# render; 'AppleGothic' ships only with macOS.
korean_fonts = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
# NOTE(review): 'NanumGothic' is assumed to be installed on other systems
# (e.g. Linux) — confirm, or install it first.
plt.rc('font', family=korean_fonts.get(platform.system(), 'NanumGothic'))

In [None]:
plt.figure(figsize = (6, 4), dpi = 100)
plt.hist(df['타율'], bins, color = 'gold', ec = 'gray', density = True)
plt.title('타율 히스토그램', fontsize = 14)
plt.xlabel('타율', fontsize = 12)
plt.show()

In [None]:
import seaborn as sns

# Density-normalised histogram of batting average ('타율').
plt.figure(figsize=(6, 4), dpi=100)
plt.hist(df['타율'], bins, color='gold', ec='gray', density=True)
plt.title('타율 히스토그램', fontsize=14)
plt.xlabel('타율', fontsize=12)

# Overlay a kernel density estimate (KDE) curve on the histogram.
# sns.distplot is deprecated and scheduled for removal; sns.kdeplot is the
# replacement seaborn recommends for drawing only the KDE curve.
sns.kdeplot(x=df['타율'], color='red', linewidth=2)

plt.show()

### 상자수염그림

In [None]:
# Box-and-whisker plot of batting average ('타율'): blue box outline, red
# dashed whiskers, outliers drawn as blue circles.
plt.figure(dpi=100, figsize=(6, 4))
plt.boxplot(
    x=df['타율'],
    sym='bo',
    boxprops={'c': 'blue', 'lw': 2},
    whiskerprops={'c': 'red', 'ls': '--'},
)
plt.title(label='타율 상자수염그림', fontsize=14)
plt.xticks(ticks=[1], labels=['타율'])
plt.show()

### 산점도

In [None]:
# Scatter of '출루율' against '장타율', with thin dashed blue reference lines
# drawn at the mean of each column.
plt.figure(dpi=100, figsize=(6, 4))
plt.scatter(x=df['출루율'], y=df['장타율'], c='gray')
plt.title(label='출루율 vs 장타율 관계', fontsize=14)
plt.axvline(x=df['출루율'].mean(), color='b', linestyle='--', linewidth=1)
plt.axhline(y=df['장타율'].mean(), color='b', linestyle='--', linewidth=1)
plt.show()

In [None]:
plt.figure(figsize = (6, 4), dpi = 100)
plt.scatter(df['출루율'], df['장타율'], c = 'gray')
plt.title('출루율 vs 장타율 관계', fontsize = 14)
plt.axvline(df['출루율'].mean(), c = 'b', ls = '--', lw = 1)
plt.axhline(df['장타율'].mean(), c = 'b', ls = '--', lw = 1)

my = df[df['팀명'] == 'LG']
plt.scatter(my['출루율'], my['장타율'], c = 'red')

for x, y, label in zip(my['출루율'], my['장타율'], my['선수명']):
    plt.text(x + 0.002, y, label, 
             ha = 'left', va = 'center', c = 'darkblue')

plt.show()

### 세로 막대그래프

In [None]:
plt.figure(figsize = (6, 4), dpi = 100)
plt.bar(teamStat1['팀명'], teamStat1['팀타율'], color = 'orange')
plt.title('팀타율 막대그래프')
plt.xlabel('팀명')
plt.ylabel('팀타율')
plt.show()

In [None]:
# Vertical bar chart of team batting average with a value label on each bar.
plt.figure(figsize=(6, 4), dpi=100)
plt.bar(teamStat1['팀명'], teamStat1['팀타율'], color='orange')
plt.title('팀타율 막대그래프')
plt.xlabel('팀명')
plt.ylabel('팀타율')

# Leave headroom above the bars for the value labels.
plt.ylim(0, 0.35)
for x, y in zip(teamStat1['팀명'], teamStat1['팀타율']):
    # plt.text expects a string; format explicitly so every label shows three
    # decimals (e.g. 0.280), matching the rounding used for '팀타율'.
    plt.text(x, y + 0.01, f'{y:.3f}',
             ha='center', va='bottom', c='k')

plt.show()

### 가로 막대그래프

In [None]:
plt.figure(figsize = (6, 4), dpi = 100)
plt.barh(teamStat2['팀명'], teamStat2['인당홈런'], color = 'skyblue')
plt.title('팀별 인당홈런 막대그래프')
plt.xlabel('인당홈런')
plt.ylabel('팀명')
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Horizontal bar chart of home runs per player by team, with value labels.
plt.figure(figsize=(6, 4), dpi=100)
plt.barh(teamStat2['팀명'], teamStat2['인당홈런'], color='skyblue')
plt.title('팀별 인당홈런 막대그래프')
plt.xlabel('인당홈런')
plt.ylabel('팀명')
# Flip the y axis so the first row of teamStat2 is drawn at the top.
plt.gca().invert_yaxis()

# Leave room to the right of the bars for the value labels.
plt.xlim(0, 6)
for x, y in zip(teamStat2['인당홈런'], teamStat2['팀명']):
    # plt.text expects a string; format explicitly so every label shows two
    # decimals, matching the rounding used for '인당홈런'.
    plt.text(x + 0.1, y, f'{x:.2f}',
             ha='left', va='center', c='k')

plt.show()

## End of Document