# DataFrame

In [1]:
import warnings
from IPython.display import display, HTML

In [2]:
### HTML display 조정
display(HTML("<style>.container {width :98% !important;}</style>"))

### Warning 제거
warnings.filterwarnings('ignore')

## Package Load

In [3]:
import numpy as np
import pandas as pd

### (1) Package Reload

In [4]:
import importlib

In [5]:
importlib.reload(np)

<module 'numpy' from '/usr/local/lib/python3.7/site-packages/numpy/__init__.py'>

### (2) Find Package List

In [6]:
### Package modules
def check_packages(mod):
    for subpackage in dir(mod):
        subpackage_module = getattr(mod, subpackage)
        if hasattr(subpackage_module, '__all__'):
            print(f"{mod.__name__}.{subpackage}의 하위 패키지")
            for module in [x for x in dir(subpackage_module) \
                           if '__' not in x and x[0] != '_']:
                print(f"  {module}")
        print("-------------------")

In [7]:
from sklearn import *
import sklearn
import statsmodels.api
import statsmodels.stats
import scipy.stats
# import statsmodels.formula.api # R 포맷으로 적용하는 파이썬 패키지
# dir(statsmodels.formula.api)

# scipy 필요에 따라 아래것들 확인 필요
# 대표적으로는 scipy.stats
# scipy.integrate
# scipy.interpolate
# scipy.linalg
# scipy.misc
# scipy.ndimage
# scipy.odr
# scipy.optimize

# check_packages(scipy.stats)
# check_pacages(sklearn)

In [8]:
### Help로 사용법 확인
# help(pd.concat)

## Basic Pandas

### (1) Create DataFrame

#### List, NDArray

In [9]:
col_name1=['col1']
list1 = [1, 2, 3]
array1 = np.array(list1)
print('array1 shape:', array1.shape)

# 리스트를 이용해 DataFrame 생성
df_list1 = pd.DataFrame(list1, columns=col_name1)
print('1차원 리스트로 만든 DataFrame:\n', df_list1)

# 넘파이 ndarray를 이용해 DataFrame 생성
df_array1 = pd.DataFrame(array1, columns=col_name1)
print('1차원 ndarray로 만든 DataFrame:\n', df_array1)

array1 shape: (3,)
1차원 리스트로 만든 DataFrame:
    col1
0     1
1     2
2     3
1차원 ndarray로 만든 DataFrame:
    col1
0     1
1     2
2     3


In [10]:
# 3개의 컬럼명이 필요함.
col_name2=['col1', 'col2', 'col3']
list2 = [[1, 2, 3], [11, 12, 13]]
# 2행x3열 형태의 리스트와 ndarray 생성한 뒤 이를 DataFrame으로 변환.
array2 = np.array(list2)
print('array2 shape:', array2.shape)

df_list2 = pd.DataFrame(list2, columns=col_name2)
print('2차원 리스트로 만든 DataFrame:\n', df_list2)

df_array2 = pd.DataFrame(array2, columns=col_name2)
print('2차원 ndarray로 만든 DataFrame:\n', df_array2)

array2 shape: (2, 3)
2차원 리스트로 만든 DataFrame:
    col1  col2  col3
0     1     2     3
1    11    12    13
2차원 ndarray로 만든 DataFrame:
    col1  col2  col3
0     1     2     3
1    11    12    13


#### Dictionary

In [11]:
# Key는 컬럼명으로 매핑, Value는 리스트 형(또는 ndarray)
df_dict = pd.DataFrame({'col1': [1, 11], 'col2': [2, 22], 'col3': [3, 33]})
print('딕셔너리로 만든 DataFrame:\n', df_dict)

딕셔너리로 만든 DataFrame:
    col1  col2  col3
0     1     2     3
1    11    22    33


#### Samples

In [12]:
dataset = np.array([['kor', 70], ['math', 80]])
df = pd.DataFrame(dataset, columns=['class', 'score'])
df = pd.DataFrame(data=[['kor', 70], ['math', 80]], columns=['class','score'])
df = pd.DataFrame({'class': ['kor', 'math'], 'score': [70, 80]})

values = [[1,2,3], [4,5,6], [7,8,9], [10,11,12], [13,14,15], [16,17,18], [19,20,21]]
index = pd.MultiIndex.from_tuples([('row1', 'val1'), ('row1', 'val2'),
                                   ('row2', 'val1'), ('row2', 'val2'), ('row2', 'val3'), 
                                   ('row3', 'val2'),('row3', 'val3')]) # 인덱스 설정
multi_df = pd.DataFrame(values, columns=['col1', 'col2', 'col3'], index=index)

data_df = pd.DataFrame({'Name': ['Chulmin', 'Eunkyung','Jinwoong','Soobeom'],
                        'Year': [2011, 2016, 2015, 2015],
                        'Gender': ['Male', 'Female', 'Male', 'Male']}, index=['one','two','three','four'])

score = pd.DataFrame({'국어': [100, 80], '수학':[75, 90], '영어':[90, 95]}, index=['장화', '홍련'])

HR1 = pd.DataFrame({'이름':['장화', '홍련'], '부서':['영업', '회계'], '직급':['팀장', '사원']})
HR2 = pd.DataFrame({'이름':['콩쥐', '팥쥐'], '직급':['사원', '팀장'], '부서':['영업', '인사']})

product = pd.DataFrame({'상품코드':['G1', 'G2', 'G3', 'G4'], '상품명':['우유', '감자', '빵', '치킨']})
sale = pd.DataFrame({'주문번호': [1001, 1002, 1002, 1003, 1004], '상품코드': ['G4', 'G3', 'G1', 'G3', 'G5'], '주문수량': [1, 4, 2, 2, 3]})

score2 = pd.DataFrame({'학년':[1, 1, 1, 1, 2, 2], 
                       '반':['A', 'A', 'B', 'B', 'C', 'C'], 
                       '성별':['여자', '남자', '여자', '남자', '여자', '남자'],
                       '성적': [76, 88, 85, 72, 68, 70]})

landmark = pd.DataFrame({'name':['광화문','호미곶', '첨성대'], 'location':['서울 종로구 사직로 161', '경북 포항시 남구 호미곶면 대보리 150', '경북 경주시 인왕동 839-1']})

students = pd.DataFrame({'이름':['장화', '홍련', '콩쥐', '팥쥐', '해님', '달님'], '국어': [70, 85, 90, 100, 60, 85], '수학':[65, 100, 80, 95, 90, 70]})

### (2) Data Read/Write

#### Read

In [13]:
iris = pd.read_csv('./ADP_book_ver01/data/iris.csv', na_values='NA', encoding='utf8') # NA 값에 'NA' 문자열 지정
iris.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [14]:
ins_old = pd.read_csv('./ADP_book_ver01/data/travel.csv', sep=';') # csv 분할 기호 추가 가능
ins_old.head()

Unnamed: 0,Age,Government Sector Worker,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,1,1,400000,6,1,0,0,0
1,31,0,1,1250000,7,0,0,0,0
2,34,0,1,500000,4,1,0,0,1
3,28,0,1,700000,3,1,0,0,0
4,28,0,1,700000,8,1,1,0,0


In [15]:
arima_data = pd.read_csv('./ADP_book_ver01/data/arima_data.csv', names=['day', 'price']) # column명 추가
arima_data.head()

Unnamed: 0,day,price
0,2013-01-01,3794
1,2013-02-01,3863
2,2013-03-01,5190
3,2013-04-01,5783
4,2013-05-01,6298


In [16]:
usage = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/adp/15/problem2_usage.csv') # url 지정 가능
usage.head()

Unnamed: 0,timestamp,usage
0,2551780740,583.7395
1,2534068740,1018.0731
2,2545732740,1034.5041
3,2545559940,350.5153
4,2550247140,652.4857


In [17]:
titanic_df = pd.read_csv('./pymlrev2-main/1장/titanic_train.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Write

In [18]:
### DataFrame to other objects
array3 = df_dict.values
print('df_dict.values 타입:', type(array3), 'df_dict.values shape:', array3.shape)
print(array3)

list3 = df_dict.values.tolist()
print('df_dict.values.tolist() 타입:', type(list3))
print(list3)

dict3 = df_dict.to_dict('list')
print('\n df_dict.to_dict() 타입:', type(dict3))
print(dict3)

df_dict.values 타입: <class 'numpy.ndarray'> df_dict.values shape: (2, 3)
[[ 1  2  3]
 [11 22 33]]
df_dict.values.tolist() 타입: <class 'list'>
[[1, 2, 3], [11, 22, 33]]

 df_dict.to_dict() 타입: <class 'dict'>
{'col1': [1, 11], 'col2': [2, 22], 'col3': [3, 33]}


In [19]:
### Write to CSV
# data.to_csv('result.csv', header=True, index=False, encoding='utf8')

### (3) Describe Data

#### Type, Shape

In [20]:
### 요약 정보
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [21]:
### Shape
print('DataFrame 크기: ', titanic_df.shape)

DataFrame 크기:  (891, 12)


#### Data Statistics

In [22]:
### 통계 정보
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [23]:
### value_counts
# count 통계 정보
# Series로 반환됨
value_counts = titanic_df['Pclass'].value_counts()
print(type(value_counts))
print(value_counts)

<class 'pandas.core.series.Series'>
3    491
1    216
2    184
Name: Pclass, dtype: int64


In [24]:
print('titanic_df 데이터 건수:', titanic_df.shape[0])
print(' Sex 값 분포 :\n',titanic_df['Sex'].value_counts())
print('\n Cabin 값 분포 :\n',titanic_df['Cabin'].value_counts())

titanic_df 데이터 건수: 891
 Sex 값 분포 :
 male      577
female    314
Name: Sex, dtype: int64

 Cabin 값 분포 :
 C23 C25 C27    4
B96 B98        4
G6             4
F2             3
C22 C26        3
              ..
B79            1
D49            1
B19            1
B101           1
A7             1
Name: Cabin, Length: 147, dtype: int64


In [25]:
# value_counts()는 디폴트로 dropna=True이므로 value_counts(dropna=True)와 동일.
print(titanic_df['Embarked'].value_counts())
print(titanic_df['Embarked'].value_counts(dropna=False)) # Na를 별도로 출력

S    644
C    168
Q     77
Name: Embarked, dtype: int64
S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64


#### Print Data

In [26]:
### 앞 부분 출력
iris.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
### 뒷 부분 출력
iris.tail()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [28]:
### 모든 행 출력 option
# pd.set_option('display.max_columns' None)
# pd.set_option('display.max_rows' None)

In [29]:
titanic_df.head(3) # 출력할 숫자 지정 가능

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### (4) Data types

In [30]:
### type 출력
iris.dtypes

sepal length    float64
sepal width     float64
petal length    float64
petal width     float64
target           object
dtype: object

In [31]:
### type 변경
iris['sepal length'] = iris['sepal width'].astype('int')
iris[['sepal width', 'petal length']] = iris[['sepal width', 'petal length']].astype('int')
iris.head(3)

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,3,3,1,0.2,Iris-setosa
1,3,3,1,0.2,Iris-setosa
2,3,3,1,0.2,Iris-setosa


In [32]:
### categorical data 처리
ins_old['GraduateOrNot'] = ins_old.GraduateOrNot.astype('category')
ins_old['GraduateOrNot'].head()

0    1
1    1
2    1
3    1
4    1
Name: GraduateOrNot, dtype: category
Categories (2, int64): [0, 1]

### (5) Index, Column

#### Index

In [33]:
### index 확인
df.index

RangeIndex(start=0, stop=2, step=1)

In [34]:
indexes = titanic_df.index # index class

print(type(indexes.values)) # values는 array 형태
print(indexes.values.shape)

<class 'numpy.ndarray'>
(891,)


In [35]:
### index slicing
print(indexes[:5].values)
print(indexes.values[:5])
print(indexes[6])

[0 1 2 3 4]
[0 1 2 3 4]
6


In [36]:
### index 변경: 부분 변경은 불가
df.index = ['A', 'B']
df.index

Index(['A', 'B'], dtype='object')

In [37]:
df

Unnamed: 0,class,score
A,kor,70
B,math,80


In [38]:
### set index
df.set_index('class', drop=True, append=False, inplace=True) # 기존 index drop
df

Unnamed: 0_level_0,score
class,Unnamed: 1_level_1
kor,70
math,80


In [39]:
### reset index
df.reset_index(drop=False, inplace=True) # 기존 index가 컬럼으로 들어감
df

Unnamed: 0,class,score
0,kor,70
1,math,80


In [40]:
titanic_reset_df = titanic_df.reset_index(inplace=False)
titanic_reset_df.head(3)

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [41]:
print('### before reset_index ###')
value_counts = titanic_df['Pclass'].value_counts()
print(value_counts)
print('value_counts 객체 변수 타입:', type(value_counts))

new_value_counts = value_counts.reset_index(inplace=False) ### 기존 index가 column으로 생성됨
print('### After reset_index ###')
print(new_value_counts)
print('new_value_counts 객체 변수 타입:', type(new_value_counts))

### before reset_index ###
3    491
1    216
2    184
Name: Pclass, dtype: int64
value_counts 객체 변수 타입: <class 'pandas.core.series.Series'>
### After reset_index ###
   index  Pclass
0      3     491
1      1     216
2      2     184
new_value_counts 객체 변수 타입: <class 'pandas.core.frame.DataFrame'>


In [42]:
### reindex
# index에 맞는 row로 dataframe 생성
# 원본 dataframe에 없는 경우 NA
df.reindex([1, 0, 2])

Unnamed: 0,class,score
1,math,80.0
0,kor,70.0
2,,


#### Multi index

In [43]:
### Multi-index
# 주로 group 객체에서 자주 사용됨
multi_df

Unnamed: 0,Unnamed: 1,col1,col2,col3
row1,val1,1,2,3
row1,val2,4,5,6
row2,val1,7,8,9
row2,val2,10,11,12
row2,val3,13,14,15
row3,val2,16,17,18
row3,val3,19,20,21


In [44]:
multi_df.index

MultiIndex([('row1', 'val1'),
            ('row1', 'val2'),
            ('row2', 'val1'),
            ('row2', 'val2'),
            ('row2', 'val3'),
            ('row3', 'val2'),
            ('row3', 'val3')],
           )

In [45]:
# get data from multi-index
multi_df.index.get_level_values(0)

Index(['row1', 'row1', 'row2', 'row2', 'row2', 'row3', 'row3'], dtype='object')

In [46]:
# drop some multi-index
multi_df.droplevel([0])

Unnamed: 0,col1,col2,col3
val1,1,2,3
val2,4,5,6
val1,7,8,9
val2,10,11,12
val3,13,14,15
val2,16,17,18
val3,19,20,21


#### Column

In [47]:
iris.columns

Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'target'], dtype='object')

In [48]:
### 특정 column 제외
iris.columns.difference(['target'])

Index(['petal length', 'petal width', 'sepal length', 'sepal width'], dtype='object')

In [49]:
iris.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
iris.head(3)

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,3,3,1,0.2,Iris-setosa
1,3,3,1,0.2,Iris-setosa
2,3,3,1,0.2,Iris-setosa


In [50]:
### column명 변경
iris.columns = iris.columns.str.replace(' ', '_') # text 추출 후 변경
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,3,3,1,0.2,Iris-setosa
1,3,3,1,0.2,Iris-setosa
2,3,3,1,0.2,Iris-setosa


### (6) Series
- DataFrame의 한 열로 사용

In [51]:
type(iris.sepal_length)

pandas.core.series.Series

In [52]:
# array로 변환: value 추출
iris.sepal_length.values
# np.array(iris.sepal_length) # 동일

array([3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 3,
       3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2,
       3, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 3, 2, 2, 2, 3,
       3, 2, 3, 2, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 3, 2, 3, 2, 3,
       2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3])

In [53]:
titanic_pclass = titanic_df['Pclass']
print(type(titanic_pclass))

<class 'pandas.core.series.Series'>


In [54]:
titanic_pclass.head()

0    3
1    1
2    3
3    1
4    3
Name: Pclass, dtype: int64

In [55]:
value_counts = titanic_df['Pclass'].value_counts()
print(value_counts)

3    491
1    216
2    184
Name: Pclass, dtype: int64


## DataFrame Handling

### (1) Slicing

#### Select Row

In [56]:
# indexing 가능한 형태가 인수로 들어가면 row 선택
iris[1:4]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
1,3,3,1,0.2,Iris-setosa
2,3,3,1,0.2,Iris-setosa
3,3,3,1,0.2,Iris-setosa


In [57]:
titanic_df[titanic_df['Pclass']==3].head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [58]:
# index로 query
# 파이썬 예약어가 들어가거나, 공백이 있으면 사용 불가
iris.query('index == 0')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,3,3,1,0.2,Iris-setosa


In [59]:
iris.query('petal_length > 5 & petal_length > 3')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
100,3,3,6,2.5,Iris-virginica
105,3,3,6,2.1,Iris-virginica
107,2,2,6,1.8,Iris-virginica
109,3,3,6,2.5,Iris-virginica
117,3,3,6,2.2,Iris-virginica
118,2,2,6,2.3,Iris-virginica
122,2,2,6,2.0,Iris-virginica
125,3,3,6,1.8,Iris-virginica
130,2,2,6,1.9,Iris-virginica
131,3,3,6,2.0,Iris-virginica


In [60]:
iris.query('petal_length > 5')[["petal_width"]]

Unnamed: 0,petal_width
100,2.5
105,2.1
107,1.8
109,2.5
117,2.2
118,2.3
122,2.0
125,1.8
130,1.9
131,2.0


#### Select Column

In [61]:
iris['sepal_length'].head(4)

0    3
1    3
2    3
3    3
Name: sepal_length, dtype: int64

In [62]:
iris.sepal_length.head(4)

0    3
1    3
2    3
3    3
Name: sepal_length, dtype: int64

In [63]:
iris[['sepal_length', 'sepal_width']].head(4) # dataframe 형태

Unnamed: 0,sepal_length,sepal_width
0,3,3
1,3,3
2,3,3
3,3,3


#### Select Both (loc, iloc)

In [64]:
### iloc: index loc
iris.iloc[[1,3,5], 2:4]

Unnamed: 0,petal_length,petal_width
1,1,0.2
3,1,0.2
5,1,0.4


In [65]:
iris.iloc[:, [True, True, False, True, False]]

Unnamed: 0,sepal_length,sepal_width,petal_width
0,3,3,0.2
1,3,3,0.2
2,3,3,0.2
3,3,3,0.2
4,3,3,0.2
...,...,...,...
145,3,3,2.3
146,2,2,1.9
147,3,3,2.0
148,3,3,2.3


In [66]:
iris.loc[[1, 2], 'sepal_length':'petal_length'] # loc에서는 end slicing도 포함됨

Unnamed: 0,sepal_length,sepal_width,petal_length
1,3,3,1
2,3,3,1


In [67]:
data_df.loc['one', 'Name']

'Chulmin'

In [68]:
data_df.iloc[0, 0]

'Chulmin'

In [69]:
print("\n 맨 마지막 칼럼 데이터 [:, -1] \n", data_df.iloc[:, -1])
print("\n 맨 마지막 칼럼을 제외한 모든 데이터 [:, :-1] \n", data_df.iloc[: , :-1])


 맨 마지막 칼럼 데이터 [:, -1] 
 one        Male
two      Female
three      Male
four       Male
Name: Gender, dtype: object

 맨 마지막 칼럼을 제외한 모든 데이터 [:, :-1] 
            Name  Year
one     Chulmin  2011
two    Eunkyung  2016
three  Jinwoong  2015
four    Soobeom  2015


In [70]:
print('위치기반 iloc slicing\n', data_df.iloc[0:1, 0],'\n')
print('명칭기반 loc slicing\n', data_df.loc['one':'two', 'Name'])

위치기반 iloc slicing
 one    Chulmin
Name: Name, dtype: object 

명칭기반 loc slicing
 one     Chulmin
two    Eunkyung
Name: Name, dtype: object


In [71]:
### Multi-index slicing
multi_df.loc['row2'] # first index (row2)

Unnamed: 0,col1,col2,col3
val1,7,8,9
val2,10,11,12
val3,13,14,15


In [72]:
multi_df.loc[('row2', 'val2')] # row2 - val2

col1    10
col2    11
col3    12
Name: (row2, val2), dtype: int64

In [73]:
multi_df.loc[('row2', 'val2'), 'col3'] # row2 - val2 index의 col3 값

12

In [74]:
multi_df.loc[('row1', 'val2'):('row3', 'val2')] # tuple slicing

Unnamed: 0,Unnamed: 1,col1,col2,col3
row1,val2,4,5,6
row2,val1,7,8,9
row2,val2,10,11,12
row2,val3,13,14,15
row3,val2,16,17,18


### (2) Data Modifying

#### Change

In [75]:
score ### original table

Unnamed: 0,국어,수학,영어
장화,100,75,90
홍련,80,90,95


In [76]:
score.loc['홍련', '영어'] = 100
score['국어'] = score['국어'] - 5
score

Unnamed: 0,국어,수학,영어
장화,95,75,90
홍련,75,90,100


#### Insert

In [77]:
new_students = pd.DataFrame({'국어': [70, 85], '수학':[65, 100], '영어':[95, 65]}, index=['콩쥐', '팥쥐'])

### new df concat
score = pd.concat([score, new_students], axis=0) # append method is deprecated
score

Unnamed: 0,국어,수학,영어
장화,95,75,90
홍련,75,90,100
콩쥐,70,65,95
팥쥐,85,100,65


In [78]:
score['과학'] = [80, 70, 90, 85]
score['학년'] = 1
score

Unnamed: 0,국어,수학,영어,과학,학년
장화,95,75,90,80,1
홍련,75,90,100,70,1
콩쥐,70,65,95,90,1
팥쥐,85,100,65,85,1


In [79]:
score['과학'] = score['과학'] + 5
score['총점'] = score['국어'] + score['수학'] + score['영어'] + score['과학']
score

Unnamed: 0,국어,수학,영어,과학,학년,총점
장화,95,75,90,85,1,345
홍련,75,90,100,75,1,340
콩쥐,70,65,95,95,1,325
팥쥐,85,100,65,90,1,340


In [80]:
titanic_df['Age_0'] = 0
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0


In [81]:
titanic_df['Age_by_10'] = titanic_df['Age'] * 10
titanic_df['Family_No'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,220.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,380.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,260.0,1


In [82]:
titanic_df['Age_by_10'] = titanic_df['Age_by_10'] + 100
titanic_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,320.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,480.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,360.0,1


#### Delete

In [83]:
### drop
score.drop('장화', inplace=True)
score.drop(columns = ['과학', '학년', '총점'], inplace=True)
score

Unnamed: 0,국어,수학,영어
홍련,75,90,100
콩쥐,70,65,95
팥쥐,85,100,65


In [84]:
titanic_drop_df = titanic_df.drop('Age_0', axis=1)
titanic_drop_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,320.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,480.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,360.0,1


In [85]:
titanic_df.head(3) # 원본을 변화시키지는 않음 (inplace=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_0,Age_by_10,Family_No
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,320.0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,480.0,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,360.0,1


In [86]:
drop_result = titanic_df.drop(['Age_0', 'Age_by_10', 'Family_No'], axis=1, inplace=True)
print('inplace=True 로 drop 후 반환된 값:', drop_result)
titanic_df.head(3)

inplace=True 로 drop 후 반환된 값: None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [87]:
print('#### before axis 0 drop ####')
print(titanic_df['PassengerId'].head(3))

titanic_df.drop([0, 1, 2], axis=0, inplace=True)

print('#### after axis 0 drop ####')
print(titanic_df['PassengerId'].head(3))

#### before axis 0 drop ####
0    1
1    2
2    3
Name: PassengerId, dtype: int64
#### after axis 0 drop ####
3    4
4    5
5    6
Name: PassengerId, dtype: int64


### (3) Join

#### Concat

In [88]:
HR1 # original table

Unnamed: 0,이름,부서,직급
0,장화,영업,팀장
1,홍련,회계,사원


In [89]:
HR2 # original table

Unnamed: 0,이름,직급,부서
0,콩쥐,사원,영업
1,팥쥐,팀장,인사


In [90]:
pd.concat([HR1, HR2], axis=0) # 열 방향 결합 (아래로)

Unnamed: 0,이름,부서,직급
0,장화,영업,팀장
1,홍련,회계,사원
0,콩쥐,영업,사원
1,팥쥐,인사,팀장


In [91]:
pd.concat([HR1, HR2], axis=0, ignore_index=True) # 기존 index를 무시하고, row 결합

Unnamed: 0,이름,부서,직급
0,장화,영업,팀장
1,홍련,회계,사원
2,콩쥐,영업,사원
3,팥쥐,인사,팀장


In [92]:
HR3 = pd.DataFrame({'이름': ['콩쥐','팥쥐'], '부서': ['영업', '인사'], '급여': [3500, 2800]})
HR3

Unnamed: 0,이름,부서,급여
0,콩쥐,영업,3500
1,팥쥐,인사,2800


In [93]:
pd.concat([HR1, HR3], axis=0, ignore_index=True) # 두 df의 컬럼이 다른 경우, NA 생성

Unnamed: 0,이름,부서,직급,급여
0,장화,영업,팀장,
1,홍련,회계,사원,
2,콩쥐,영업,,3500.0
3,팥쥐,인사,,2800.0


In [94]:
HR4 = pd.Series({1: 2500}, name='급여')
pd.concat([HR1, HR4], axis=1) # column 결합 (행 방향 결합)

Unnamed: 0,이름,부서,직급,급여
0,장화,영업,팀장,
1,홍련,회계,사원,2500.0


In [95]:
HR5 = pd.DataFrame({'급여': [4500, 3000, 3500]})
pd.concat([HR1, HR5], axis=1)

Unnamed: 0,이름,부서,직급,급여
0,장화,영업,팀장,4500
1,홍련,회계,사원,3000
2,,,,3500


In [96]:
product # original table

Unnamed: 0,상품코드,상품명
0,G1,우유
1,G2,감자
2,G3,빵
3,G4,치킨


In [97]:
sale # original table

Unnamed: 0,주문번호,상품코드,주문수량
0,1001,G4,1
1,1002,G3,4
2,1002,G1,2
3,1003,G3,2
4,1004,G5,3


#### Merge
- How: inner, outer, left, right, cross

In [98]:
### 조인 수행 방식
sale.merge(product, on='상품코드', how='inner') # source.merge(target, on='column')

Unnamed: 0,주문번호,상품코드,주문수량,상품명
0,1001,G4,1,치킨
1,1002,G3,4,빵
2,1003,G3,2,빵
3,1002,G1,2,우유


In [99]:
sale.merge(product, on='상품코드', how='outer', sort=True)

Unnamed: 0,주문번호,상품코드,주문수량,상품명
0,1002.0,G1,2.0,우유
1,,G2,,감자
2,1002.0,G3,4.0,빵
3,1003.0,G3,2.0,빵
4,1001.0,G4,1.0,치킨
5,1004.0,G5,3.0,


In [100]:
sale.merge(product, on='상품코드', how='left')

Unnamed: 0,주문번호,상품코드,주문수량,상품명
0,1001,G4,1,치킨
1,1002,G3,4,빵
2,1002,G1,2,우유
3,1003,G3,2,빵
4,1004,G5,3,


In [101]:
product.columns = ['코드', '상품명']
sale.merge(product, left_on='상품코드', right_on='코드', how='inner', suffixes=['A', 'B']) # 컬럼명이 다른 경우, 조인 컬럼 모두가 테이블에 생성됨

Unnamed: 0,주문번호,상품코드,주문수량,코드,상품명
0,1001,G4,1,G4,치킨
1,1002,G3,4,G3,빵
2,1003,G3,2,G3,빵
3,1002,G1,2,G1,우유


### (4) Reconstruct
- Qcut, Crosstab, Pivot, Melt

In [102]:
### qcut
# numeric to categorical
iris['petal width level'] = pd.qcut(iris['petal_width'], q=3, labels=['short', 'middle', 'long'])

pd.Series(iris['petal width level']).value_counts()

middle    52
short     50
long      48
Name: petal width level, dtype: int64

In [103]:
### Contingency table
# crosstab(열 변수, 행 변수)
# dropna, normalize(all, index, columns) option
pd.crosstab(iris['petal width level'], iris['class'])

class,Iris-setosa,Iris-versicolor,Iris-virginica
petal width level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
short,50,0,0
middle,0,48,4
long,0,2,46


In [104]:
score2 # original table

Unnamed: 0,학년,반,성별,성적
0,1,A,여자,76
1,1,A,남자,88
2,1,B,여자,85
3,1,B,남자,72
4,2,C,여자,68
5,2,C,남자,70


In [105]:
### Pivot
# aggfunc 옵션으로 집계함수 지정 가능
score2.pivot_table(index='학년', columns='성별', values='성적', aggfunc=np.mean) # 학년 별로 평균을 내 줌

성별,남자,여자
학년,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80.0,80.5
2,70.0,68.0


In [106]:
score2 = score2.pivot_table(index=['학년', '반'], columns='성별', values='성적')
score2

Unnamed: 0_level_0,성별,남자,여자
학년,반,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A,88,76
1,B,72,85
2,C,70,68


In [107]:
### Melt: pivot의 반대
score2.reset_index().melt(id_vars=['학년', '반'], var_name='성별', value_name='성적')

Unnamed: 0,학년,반,성별,성적
0,1,A,남자,88
1,1,B,남자,72
2,2,C,남자,70
3,1,A,여자,76
4,1,B,여자,85
5,2,C,여자,68


### (5) Apply Functions

#### Apply

In [108]:
score2 # original table

Unnamed: 0_level_0,성별,남자,여자
학년,반,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A,88,76
1,B,72,85
2,C,70,68


In [109]:
score2.apply(np.sqrt, axis=0)

Unnamed: 0_level_0,성별,남자,여자
학년,반,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A,9.380832,8.717798
1,B,8.485281,9.219544
2,C,8.3666,8.246211


In [110]:
score2.apply(np.max, axis=0)

성별
남자    88
여자    85
dtype: int64

In [111]:
# 사용자 정의 함수
def plus_five(val):
    return val + 5

score2.apply(plus_five)

Unnamed: 0_level_0,성별,남자,여자
학년,반,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A,93,81
1,B,77,90
2,C,75,73


In [112]:
def class_avg(df):
    return np.ceil((df['남자'] + df['여자'])/2) # ceil: 올림하여 정수 반환

score2.apply(class_avg, axis=1)

학년  반
1   A    82.0
    B    79.0
2   C    69.0
dtype: float64

#### Map

In [113]:
### Map and lambda function
# 컬럼의 각 원소에 lambda 함수 반영
score2['남자'].map(lambda x: x + 5)

학년  반
1   A    93
    B    77
2   C    75
Name: 남자, dtype: int64

In [114]:
a = [1, 2, 3]
squares = map(lambda x: x**2, a)
list(squares)

[1, 4, 9]

In [115]:
titanic_df['Name_len'] = titanic_df['Name'].apply(lambda x: len(x))
titanic_df[['Name', 'Name_len']].head(3)

Unnamed: 0,Name,Name_len
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24
5,"Moran, Mr. James",16


In [116]:
titanic_df['Child_Adult'] = titanic_df['Age'].apply(lambda x : 'Child' if x <=15 else 'Adult')
titanic_df.loc[4:8, ['Age','Child_Adult']]

Unnamed: 0,Age,Child_Adult
4,35.0,Adult
5,,Adult
6,54.0,Adult
7,2.0,Child
8,27.0,Adult


In [117]:
titanic_df['Age_cat'] = titanic_df['Age'].apply(lambda x : 'Child' if x<=15 else ('Adult' if x <= 60 else 'Elderly')) # 이중 조건문
titanic_df['Age_cat'].value_counts()

Adult      606
Elderly    199
Child       83
Name: Age_cat, dtype: int64

In [118]:
### lambda for binning
# 나이에 따라 세분화된 분류를 수행하는 함수 생성. 
def get_category(age):
    if age <= 5: cat = 'Baby'
    elif age <= 12: cat = 'Child'
    elif age <= 18: cat = 'Teenager'
    elif age <= 25: cat = 'Student'
    elif age <= 35: cat = 'Young Adult'
    elif age <= 60: cat = 'Adult'
    else : cat = 'Elderly'
    
    return cat

# lambda 식에 위에서 생성한 get_category() 함수를 반환값으로 지정. 
# get_category(X)는 입력값으로 ‘Age’ 컬럼 값을 받아서 해당하는 cat 반환
titanic_df['Age_cat'] = titanic_df['Age'].apply(lambda x : get_category(x))
titanic_df[['Age', 'Age_cat']].head()

Unnamed: 0,Age,Age_cat
3,35.0,Young Adult
4,35.0,Young Adult
5,,Elderly
6,54.0,Adult
7,2.0,Baby


### (6) Character Processing
- str은 Series의 속성이므로, column을 지정해 줘야 함
- 파이썬 내장 str 함수들과 조합해서 사용

In [119]:
landmark # original table

Unnamed: 0,name,location
0,광화문,서울 종로구 사직로 161
1,호미곶,경북 포항시 남구 호미곶면 대보리 150
2,첨성대,경북 경주시 인왕동 839-1


In [120]:
landmark['location'].str[3:6]

0    종로구
1    포항시
2    경주시
Name: location, dtype: object

In [121]:
landmark['location'].str.split(" ", expand=True) # 컬럼으로 분할

Unnamed: 0,0,1,2,3,4,5
0,서울,종로구,사직로,161,,
1,경북,포항시,남구,호미곶면,대보리,150.0
2,경북,경주시,인왕동,839-1,,


In [122]:
landmark['loc_1'] = landmark['location'].str.split(" ").str[0] # 특정 값만 넣음
landmark

Unnamed: 0,name,location,loc_1
0,광화문,서울 종로구 사직로 161,서울
1,호미곶,경북 포항시 남구 호미곶면 대보리 150,경북
2,첨성대,경북 경주시 인왕동 839-1,경북


In [123]:
landmark['name'] + ':' + landmark['location'] # column string join

0            광화문:서울 종로구 사직로 161
1    호미곶:경북 포항시 남구 호미곶면 대보리 150
2          첨성대:경북 경주시 인왕동 839-1
dtype: object

In [124]:
### 문자열 탐색
landmark['location'].str.startswith('서울')

0     True
1    False
2    False
Name: location, dtype: bool

In [125]:
landmark['location'].str.endswith('1')

0     True
1    False
2     True
Name: location, dtype: bool

In [126]:
landmark['location'].str.contains('1')

0    True
1    True
2    True
Name: location, dtype: bool

In [127]:
titanic_df['Cabin'] = titanic_df['Cabin'].str[:1] # 특정 값만 추출하여 저장
titanic_df['Cabin'].loc[10:12]

10      G
11      C
12    NaN
Name: Cabin, dtype: object

### (7) Datetime Processing

In [128]:
import time
import datetime as dt

#### Datetime

In [129]:
### 현재 날짜와 시간 산출
today = dt.datetime.today()
today

datetime.datetime(2023, 10, 29, 0, 23, 58, 165789)

In [130]:
dt.datetime.today().year # year/month/day/hour/minute/second/microsecond # 현재 연도/월/일/시간/분/초/마이크로초 산출

2023

In [131]:
dt.datetime.today().date() # 날짜 부분

datetime.date(2023, 10, 29)

In [132]:
dt.datetime.today().time() # 시간 부분

datetime.time(0, 23, 58, 176012)

In [133]:
### date, time
date_part = dt.date(2019, 1, 1)
time_part = dt.time(10, 0, 5)

In [134]:
### merge
dt.datetime.combine(date_part, time_part)

datetime.datetime(2019, 1, 1, 10, 0, 5)

In [135]:
dt.datetime.today().weekday() # 요일 산출

6

In [136]:
### 일반 str to datetime
# column을 datetime으로 변환하는 경우는 Data Types 참조
date_time = dt.datetime.strptime('2021-12-25 00:00:00', '%Y-%m-%d %H:%M:%S')
date_time

datetime.datetime(2021, 12, 25, 0, 0)

In [137]:
### datetime to str
time_today = dt.datetime.today()
time_today.strftime('%Y-%m-%d %H:%M:%S')

'2023-10-29 00:23:58'

In [138]:
### datetime 형식 변경도 가능
time_today.strftime("%y-%m-%d")

'23-10-29'

In [139]:
time_today.strftime('%H:%M') # 특정 부분만 뽑아 쓰기

'00:23'

In [140]:
time_today.strftime("%y년 %m-%d")

'23년 10-29'

In [141]:
### timestamp to datetime
ts = 2551780740
date_time = dt.datetime.fromtimestamp(ts)
date_time

datetime.datetime(2050, 11, 11, 20, 59)

In [142]:
### datetime to timestamp
ts_2 = time.mktime(date_time.timetuple())
ts_2

2551780740.0

In [143]:
### Time difference
time_today = dt.datetime.today()
time_today + dt.timedelta(days=100) # seconds/microseconds, milliseconds, minutes, hours, weeks option 사용 가능

datetime.datetime(2024, 2, 6, 0, 23, 58, 216009)

In [144]:
### 연/월/일 조정 (str)
# zfill: str 자리수 맞추기
f"{str(dt.datetime.today().year + 5)}-{str(dt.datetime.today().month).zfill(2)}-{str(dt.datetime.today().day).zfill(2)}"

'2028-10-29'

In [145]:
### 연/월/일 조정
# year, month, day, hour, minute, second, ...
today.replace(day=today.day + int(np.random.randn() - 4))

datetime.datetime(2023, 10, 26, 0, 23, 58, 165789)

In [146]:
today.replace(year=2022, minute=2)

datetime.datetime(2022, 10, 29, 0, 2, 58, 165789)

In [147]:
### timedelta: 시간 간격
week_1 = dt.timedelta(weeks=1)
day_2 = dt.timedelta(days=2)

In [148]:
### datetime 연산 (with timedelta)
dt.date(2023, 2, 10) + week_1 # 1 week 추가

datetime.date(2023, 2, 17)

#### Datetime with DataFrame

In [149]:
### string to datetime 변환
arima_data['day'] = pd.to_datetime(arima_data['day'], format="%Y-%m-%d")
# pd.to_datetime(arima_data['day'], format="%m/%d/%Y %I:%M:%S %p")
# pd.to_datetime(arima_data['day'], format="%Y-%m-%d %H:%M:%S")

In [150]:
def convert_string_to_timedelta(string):
    # type: (str) -> datetime.timedelta
    if not string:
        raise ValueError('{} is not a valid timedelta string'.format(string))
    
    # get days
    tmp = string.split('.')
    if len(tmp) == 2:
        days = int(tmp[0])
        tmp = tmp[1]
    elif len(tmp) == 1:
        days = 0
        tmp = tmp[0]
    else:
        raise ValueError('{} is not a valid timedelta string'.format(string))
    # get total seconds
    tmp = tmp.split(':')
    if len(tmp) != 3:
        raise ValueError('{} is not a valid timedelta string'.format(string))
    totsec = int(tmp[2]) + int(tmp[1]) * 60 + int(tmp[0]) * 3600
        
    return dt.timedelta(days, totsec)

In [151]:
### time diff 계산
diff_day = arima_data['day'] - arima_data['day'].shift(1)

# time diff가 1시간 이상인 index 추출 (시간 단위가 나오는 데이터에 활용)
big_diff_list = [i for i in diff_day.index if diff_day[i] > convert_string_to_timedelta('01:00:00')]

In [152]:
### datetime 요소 추출
# year/month/day/hour/minute/second/microsecond
arima_data['day'].dt.year.head()

0    2013
1    2013
2    2013
3    2013
4    2013
Name: day, dtype: int64

In [153]:
### 요일 추출
arima_data['day'].dt.day_name().head() # 영어 요일

0      Tuesday
1       Friday
2       Friday
3       Monday
4    Wednesday
Name: day, dtype: object

In [154]:
arima_data['day'].dt.weekday.head() # 숫자 요일

0    1
1    4
2    4
3    0
4    2
Name: day, dtype: int64

In [155]:
### object to datetime
arima_data['day'] = arima_data['day'].astype('datetime64')
arima_data['day'].dtypes

dtype('<M8[ns]')

In [156]:
### timestamp to datetime
usage['time'] = pd.to_datetime(usage.timestamp, unit='s')
usage['time'].head()

0   2050-11-11 11:59:00
1   2050-04-20 11:59:00
2   2050-09-02 11:59:00
3   2050-08-31 11:59:00
4   2050-10-24 17:59:00
Name: time, dtype: datetime64[ns]

## DataFrame Exploring

### (1) Explore with Conditions

In [157]:
students[students['이름']=='장화']

Unnamed: 0,이름,국어,수학
0,장화,70,65


In [158]:
students[(students['국어']>=80) & (students['수학']>=80)]

Unnamed: 0,이름,국어,수학
1,홍련,85,100
2,콩쥐,90,80
3,팥쥐,100,95


In [159]:
students[(students['국어']>=80) | (students['수학']>=80)]

Unnamed: 0,이름,국어,수학
1,홍련,85,100
2,콩쥐,90,80
3,팥쥐,100,95
4,해님,60,90
5,달님,85,70


In [160]:
students.loc[6, '이름':'수학'] = ['별님', 50, 60]  # column 추가
students.loc[(students['국어']>=80) & (students['수학']>=70), '합격'] = 'Pass' # if statement
students.loc[students['합격']!='Pass', '합격'] = 'Fail'
students

### np.select, np.where도 사용 가능

Unnamed: 0,이름,국어,수학,합격
0,장화,70.0,65.0,Fail
1,홍련,85.0,100.0,Pass
2,콩쥐,90.0,80.0,Pass
3,팥쥐,100.0,95.0,Pass
4,해님,60.0,90.0,Fail
5,달님,85.0,70.0,Pass
6,별님,50.0,60.0,Fail


In [161]:
titanic_boolean = titanic_df[titanic_df['Age'] > 60]
titanic_boolean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S,21,Adult,Elderly
54,55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B,C,30,Adult,Elderly
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A,C,25,Adult,Elderly
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q,20,Adult,Elderly
170,171,0,1,"Van der hoef, Mr. Wyckoff",male,61.0,0,0,111240,33.5,B,S,25,Adult,Elderly


In [162]:
titanic_df[titanic_df['Age'] > 60][['Name', 'Age']].head(3)

Unnamed: 0,Name,Age
33,"Wheadon, Mr. Edward H",66.0
54,"Ostby, Mr. Engelhart Cornelius",65.0
96,"Goldschmidt, Mr. George B",71.0


In [163]:
titanic_df.loc[titanic_df['Age'] > 60, ['Name', 'Age']].head(3)

Unnamed: 0,Name,Age
33,"Wheadon, Mr. Edward H",66.0
54,"Ostby, Mr. Engelhart Cornelius",65.0
96,"Goldschmidt, Mr. George B",71.0


In [164]:
titanic_df[ (titanic_df['Age'] > 60) & (titanic_df['Pclass']==1) & (titanic_df['Sex']=='female')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
275,276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D,S,33,Adult,Elderly
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B,,41,Adult,Elderly


In [165]:
cond1 = titanic_df['Age'] > 60
cond2 = titanic_df['Pclass']==1
cond3 = titanic_df['Sex']=='female'

titanic_df[cond1 & cond2 & cond3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
275,276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D,S,33,Adult,Elderly
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B,,41,Adult,Elderly


### (2) Sort

#### Index sort

In [166]:
iris.sort_index(ascending=False, inplace=True) # 내림차순, 원본 데이터에 반영
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class,petal width level
149,3,3,5,1.8,Iris-virginica,long
148,3,3,5,2.3,Iris-virginica,long
147,3,3,5,2.0,Iris-virginica,long
146,2,2,5,1.9,Iris-virginica,long
145,3,3,5,2.3,Iris-virginica,long
...,...,...,...,...,...,...
4,3,3,1,0.2,Iris-setosa,short
3,3,3,1,0.2,Iris-setosa,short
2,3,3,1,0.2,Iris-setosa,short
1,3,3,1,0.2,Iris-setosa,short


In [167]:
iris.sort_index(axis=1, ascending=True, inplace=True) # 오름차순 column명 정렬
iris

Unnamed: 0,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
149,Iris-virginica,long,5,1.8,3,3
148,Iris-virginica,long,5,2.3,3,3
147,Iris-virginica,long,5,2.0,3,3
146,Iris-virginica,long,5,1.9,2,2
145,Iris-virginica,long,5,2.3,3,3
...,...,...,...,...,...,...
4,Iris-setosa,short,1,0.2,3,3
3,Iris-setosa,short,1,0.2,3,3
2,Iris-setosa,short,1,0.2,3,3
1,Iris-setosa,short,1,0.2,3,3


#### Value sort

In [168]:
iris.sort_values('petal_length') # 특정 컬럼 기준으로 정렬

Unnamed: 0,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
0,Iris-setosa,short,1,0.2,3,3
28,Iris-setosa,short,1,0.2,3,3
29,Iris-setosa,short,1,0.2,3,3
30,Iris-setosa,short,1,0.2,3,3
31,Iris-setosa,short,1,0.4,3,3
...,...,...,...,...,...,...
130,Iris-virginica,long,6,1.9,2,2
117,Iris-virginica,long,6,2.2,3,3
118,Iris-virginica,long,6,2.3,2,2
100,Iris-virginica,long,6,2.5,3,3


In [169]:
iris.sort_values(['petal_width', 'sepal_length']) # 여러 개 컬럼을 지정하면, 앞 컬럼이 우선 기준

Unnamed: 0,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
37,Iris-setosa,short,1,0.1,3,3
34,Iris-setosa,short,1,0.1,3,3
13,Iris-setosa,short,1,0.1,3,3
12,Iris-setosa,short,1,0.1,3,3
9,Iris-setosa,short,1,0.1,3,3
...,...,...,...,...,...,...
140,Iris-virginica,long,5,2.4,3,3
136,Iris-virginica,long,5,2.4,3,3
144,Iris-virginica,long,5,2.5,3,3
109,Iris-virginica,long,6,2.5,3,3


In [170]:
titanic_sorted = titanic_df.sort_values(by=['Name'])
titanic_sorted.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,19,Adult,Adult
746,747,0,3,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.25,,S,27,Adult,Teenager
279,280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S,32,Adult,Young Adult


In [171]:
titanic_sorted = titanic_df.sort_values(by=['Pclass', 'Name'], ascending=False)
titanic_sorted.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S,27,Adult,Elderly
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S,31,Adult,Adult
282,283,0,3,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5,,S,25,Adult,Teenager


### (3) Summurize/Aggrigation
- 집계 함수들: count(), sum(), mean(), median(), min(), max(), std(), var(), quantile(n), first(), last(), describe()
- groupby options
    - axis: 행(0), 열(1), default 0
    - as_index: index 출력 여부. default True
    - sort: 정렬 여부
    - dropna: 결측값 제거 여부

In [172]:
iris['class'] = iris['class'].map({'Iris-setosa': 0, 'Iris-virsicolor': 1, 'Iris-virginica': 2})
iris.head()

Unnamed: 0,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
149,2.0,long,5,1.8,3,3
148,2.0,long,5,2.3,3,3
147,2.0,long,5,2.0,3,3
146,2.0,long,5,1.9,2,2
145,2.0,long,5,2.3,3,3


#### Simple aggrigation
- Count, Mean, (n)Unique

In [173]:
### count
titanic_df.count()

PassengerId    888
Survived       888
Pclass         888
Name           888
Sex            888
Age            711
SibSp          888
Parch          888
Ticket         888
Fare           888
Cabin          203
Embarked       886
Name_len       888
Child_Adult    888
Age_cat        888
dtype: int64

In [174]:
### mean
titanic_df[['Age', 'Fare']].mean()

Age     29.703474
Fare    32.215643
dtype: float64

In [175]:
### iris class의 원소 확인
iris['class'].value_counts()

0.0    50
2.0    50
Name: class, dtype: int64

In [176]:
### unique 원소 배열 반환
iris['class'].unique()

array([ 2., nan,  0.])

In [177]:
### unique 원소의 개수
iris['class'].nunique()

2

#### GroupBy
- groupby('기준 열').집계함수()

In [178]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 149 to 0
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   class              100 non-null    float64 
 1   petal width level  150 non-null    category
 2   petal_length       150 non-null    int64   
 3   petal_width        150 non-null    float64 
 4   sepal_length       150 non-null    int64   
 5   sepal_width        150 non-null    int64   
dtypes: category(1), float64(2), int64(3)
memory usage: 7.3 KB


In [179]:
iris.drop('petal width level', axis=1).groupby(by='class').mean() # class별 평균

Unnamed: 0_level_0,petal_length,petal_width,sepal_length,sepal_width
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1.0,0.244,3.04,3.04
2.0,5.1,2.026,2.58,2.58


In [180]:
iris.drop('petal width level', axis=1).groupby(by='class').median()# class별 중앙값

Unnamed: 0_level_0,petal_length,petal_width,sepal_length,sepal_width
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,1,0.2,3,3
2.0,5,2.0,3,3


In [181]:
titanic_groupby = titanic_df.groupby(by='Pclass')
titanic_groupby

<pandas.core.groupby.generic.DataFrameGroupBy object at 0xffff7b97f450>

In [182]:
### group의 요소 뽑아내기
# get_group
titanic_groupby.get_group(1) # Pclass==1인 data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C,S,44,Adult,Young Adult
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E,S,23,Adult,Adult
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C,S,24,Adult,Adult
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A,S,28,Adult,Young Adult
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0000,C,S,30,Adult,Student
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D,S,48,Adult,Adult
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B,S,24,Adult,Young Adult
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C,C,45,Adult,Adult
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B,S,28,Adult,Student


In [183]:
titanic_groupby_df = titanic_df.groupby('Pclass').count()
titanic_groupby_df

Unnamed: 0_level_0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,215,215,215,215,185,215,215,215,215,175,213,215,215,215
2,184,184,184,184,173,184,184,184,184,16,184,184,184,184
3,489,489,489,489,353,489,489,489,489,12,489,489,489,489


In [184]:
titanic_groupby_df = titanic_df.groupby('Pclass', as_index=False).count() ### as_index 옵션으로 index로 들어갈 값을 column으로 만들 수 있음
titanic_groupby_df

Unnamed: 0,Pclass,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_len,Child_Adult,Age_cat
0,1,215,215,215,215,185,215,215,215,215,175,213,215,215,215
1,2,184,184,184,184,173,184,184,184,184,16,184,184,184,184
2,3,489,489,489,489,353,489,489,489,489,12,489,489,489,489


In [185]:
titanic_groupby_df = titanic_df.groupby('Pclass')[['PassengerId', 'Survived']].count()
titanic_groupby_df

Unnamed: 0_level_0,PassengerId,Survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,215
2,184,184
3,489,489


In [186]:
titanic_df.groupby(['Sex', 'Survived'])['Survived'].count()

Sex     Survived
female  0            81
        1           231
male    0           467
        1           109
Name: Survived, dtype: int64

In [187]:
### agg
titanic_df.groupby('Pclass')['Age'].agg([max, min])

Unnamed: 0_level_0,max,min
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80.0,0.92
2,70.0,0.67
3,74.0,0.42


In [188]:
### column에 따른 agg 함수를 다르게 지정
agg_format = {'Age':'max', 'SibSp':'sum', 'Fare':'mean'}
titanic_df.groupby('Pclass').agg(agg_format)

Unnamed: 0_level_0,Age,SibSp,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.0,89,84.214554
2,70.0,74,20.662183
3,74.0,301,13.70045


### (4) Sampling

In [189]:
### Random sampling
iris.sample(n=5)

Unnamed: 0,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
42,0.0,short,1,0.2,3,3
17,0.0,short,1,0.3,3,3
41,0.0,short,1,0.3,2,2
100,2.0,long,6,2.5,3,3
71,,middle,4,1.3,2,2


In [190]:
iris.sample(frac=0.05)

Unnamed: 0,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
25,0.0,short,1,0.2,3,3
104,2.0,long,5,2.2,3,3
55,,middle,4,1.3,2,2
59,,middle,3,1.4,2,2
35,0.0,short,1,0.2,3,3
79,,middle,3,1.0,2,2
69,,middle,3,1.1,2,2
144,2.0,long,5,2.5,3,3


In [191]:
iris.sample(frac=0.05, replace=True, random_state=0).reset_index() # index reset, sampling seed 고정, 복원추출

Unnamed: 0,index,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
0,102,2.0,long,5,2.1,3,3
1,32,0.0,short,1,0.1,4,4
2,82,,middle,3,1.2,2,2
3,46,0.0,short,1,0.2,3,3
4,140,2.0,long,5,2.4,3,3
5,128,2.0,long,5,2.1,2,2
6,113,2.0,long,5,2.0,2,2
7,62,,middle,4,1.0,2,2


In [192]:
weights = np.where(iris['class']=='setosa', 10, 2) # setosa class에 가중치 부여
iris.sample(frac=0.05, weights=weights).reset_index()

Unnamed: 0,index,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
0,114,2.0,long,5,2.4,2,2
1,123,2.0,long,4,1.8,2,2
2,121,2.0,long,4,2.0,2,2
3,48,0.0,short,1,0.2,3,3
4,16,0.0,short,1,0.4,3,3
5,71,,middle,4,1.3,2,2
6,94,,middle,4,1.3,2,2
7,42,0.0,short,1,0.2,3,3


In [193]:
iris.groupby('class', group_keys=False).apply(lambda x: x.sample(frac=0.05)) # 층화추출. 각 class에서 sampling

Unnamed: 0,class,petal width level,petal_length,petal_width,sepal_length,sepal_width
27,0.0,short,1,0.2,3,3
10,0.0,short,1,0.2,3,3
135,2.0,long,6,2.3,3,3
105,2.0,long,6,2.1,3,3


## Numpy

### (1) Create Array

#### List

In [194]:
array1 = np.array([1, 2, 3])
print('array1 type:', type(array1))
print('array1 array 형태:', array1.shape)

array2 = np.array([[1, 2, 3],[2, 3, 4]])
print('array2 type:', type(array2))
print('array2 array 형태:', array2.shape)

array3 = np.array([[1, 2, 3]])
print('array3 type:', type(array3))
print('array3 array 형태:', array3.shape)

print('array1: {0}차원, array2: {1}차원, array3: {2}차원'.format(array1.ndim, array2.ndim, array3.ndim))

array1 type: <class 'numpy.ndarray'>
array1 array 형태: (3,)
array2 type: <class 'numpy.ndarray'>
array2 array 형태: (2, 3)
array3 type: <class 'numpy.ndarray'>
array3 array 형태: (1, 3)
array1: 1차원, array2: 2차원, array3: 2차원


In [195]:
list1 = [1, 2, 3]
print(type(list1))
array1 = np.array(list1)
print(type(array1))
print(array1, array1.dtype)

<class 'list'>
<class 'numpy.ndarray'>
[1 2 3] int64


In [196]:
list2 = [1, 2, 'test']
array2 = np.array(list2)
print(array2, array2.dtype)

list3 = [1, 2, 3.0]
array3 = np.array(list3)
print(array3, array3.dtype)

['1' '2' 'test'] <U21
[1. 2. 3.] float64


In [197]:
array_int = np.array([1, 2, 3])
array_float = array_int.astype('float64')
print(array_float, array_float.dtype)

array_int1= array_float.astype('int32')
print(array_int1, array_int1.dtype)

array_float1 = np.array([1.1, 2.1, 3.1])
array_int2= array_float1.astype('int32')
print(array_int2, array_int2.dtype)

[1. 2. 3.] float64
[1 2 3] int32
[1 2 3] int32


In [198]:
np.array([1, 2, 'A']) # 하나의 type만 가능 -> str로 변환됨

array(['1', '2', 'A'], dtype='<U21')

In [199]:
sample_array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) # 2차원
sample_array

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [200]:
sample_array.shape

(2, 5)

#### Another Methods

In [201]:
### arange, zeros, ones
sequence_array = np.arange(10)
print(sequence_array)
print(sequence_array.dtype, sequence_array.shape)

zero_array = np.zeros((3, 2),dtype='int32')
print(zero_array)
print(zero_array.dtype, zero_array.shape)

one_array = np.ones((3, 2))
print(one_array)
print(one_array.dtype, one_array.shape)

[0 1 2 3 4 5 6 7 8 9]
int64 (10,)
[[0 0]
 [0 0]
 [0 0]]
int32 (3, 2)
[[1. 1.]
 [1. 1.]
 [1. 1.]]
float64 (3, 2)


In [202]:
print(np.zeros(4))
print(np.zeros([2, 3])) # input이 list
print(np.ones(3))

[0. 0. 0. 0.]
[[0. 0. 0.]
 [0. 0. 0.]]
[1. 1. 1.]


In [203]:
### zeros_like, ones_like
# 특정 array 모양의 0 array
np.zeros_like(zero_array)

array([[0, 0],
       [0, 0],
       [0, 0]], dtype=int32)

In [204]:
### Empty
# 임의의 값 (거의 0)을 갖는 배열 생성
np.empty(4)

array([0., 0., 0., 0.])

In [205]:
### tile
# 해당 원소로 반복된 array 생성
np.tile('A', 5) 

array(['A', 'A', 'A', 'A', 'A'], dtype='<U1')

In [206]:
### repeat
# 배열 반복
# 숫자 2를 7번 반복한 배열 반환
print(np.repeat(2, 7))

# 2 x 2 array x를 각 axis에 대해 repeat
x = np.array([[1, 2],[3, 4]])
# 1. axis = None 일 경우 shape에 상관없이 flatten
print(np.repeat(x, 3))

# axis = 0 일경우 축 1(가장 바깥쪽 괄호) 기준으로 생각
print(np.repeat(x, 3, axis=0))

# axis = 1 일경우 축 2(다음 바깥쪽 괄호)를 기준으로 생각   
print(np.repeat(x, 3, axis=1))

[2 2 2 2 2 2 2]
[1 1 1 2 2 2 3 3 3 4 4 4]
[[1 2]
 [1 2]
 [1 2]
 [3 4]
 [3 4]
 [3 4]]
[[1 1 1 2 2 2]
 [3 3 3 4 4 4]]


In [207]:
### arrange
np.arange(1, 10, 2) # range 함수와 유사

array([1, 3, 5, 7, 9])

In [208]:
### linspace
np.linspace(0, 15, 100) # 0 ~ 15 사이에 100개의 원소를 가지는 array

array([ 0.        ,  0.15151515,  0.3030303 ,  0.45454545,  0.60606061,
        0.75757576,  0.90909091,  1.06060606,  1.21212121,  1.36363636,
        1.51515152,  1.66666667,  1.81818182,  1.96969697,  2.12121212,
        2.27272727,  2.42424242,  2.57575758,  2.72727273,  2.87878788,
        3.03030303,  3.18181818,  3.33333333,  3.48484848,  3.63636364,
        3.78787879,  3.93939394,  4.09090909,  4.24242424,  4.39393939,
        4.54545455,  4.6969697 ,  4.84848485,  5.        ,  5.15151515,
        5.3030303 ,  5.45454545,  5.60606061,  5.75757576,  5.90909091,
        6.06060606,  6.21212121,  6.36363636,  6.51515152,  6.66666667,
        6.81818182,  6.96969697,  7.12121212,  7.27272727,  7.42424242,
        7.57575758,  7.72727273,  7.87878788,  8.03030303,  8.18181818,
        8.33333333,  8.48484848,  8.63636364,  8.78787879,  8.93939394,
        9.09090909,  9.24242424,  9.39393939,  9.54545455,  9.6969697 ,
        9.84848485, 10.        , 10.15151515, 10.3030303 , 10.45

In [209]:
### logspace
np.logspace(-3, 1, 5) # log(-3)에서 log(1) 까지 범위에서 5개의 원소를 가지는 array

array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])

In [210]:
### setdiff
np.setdiff1d(np.arange(10), [1]) # 1차원 배열에서 리스트에 해당하는 원소 제외한 array

array([0, 2, 3, 4, 5, 6, 7, 8, 9])

In [211]:
### meshgrid
# 두 개의 배열을 직사각형 grid로 반환
x = np.linspace(1, 10, 10)
y = np.linspace(11, 20, 10)

X,Y = np.meshgrid(x, y)
print(X)
print(Y)

[[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
 [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]]
[[11. 11. 11. 11. 11. 11. 11. 11. 11. 11.]
 [12. 12. 12. 12. 12. 12. 12. 12. 12. 12.]
 [13. 13. 13. 13. 13. 13. 13. 13. 13. 13.]
 [14. 14. 14. 14. 14. 14. 14. 14. 14. 14.]
 [15. 15. 15. 15. 15. 15. 15. 15. 15. 15.]
 [16. 16. 16. 16. 16. 16. 16. 16. 16. 16.]
 [17. 17. 17. 17. 17. 17. 17. 17. 17. 17.]
 [18. 18. 18. 18. 18. 18. 18. 18. 18. 18.]
 [19. 19. 19. 19. 19. 19. 19. 19. 19. 19.]
 [20. 20. 20. 20. 20. 20. 20. 20. 20. 20.]]


In [212]:
### newaxis
# 축 추가
# None을 넣었을 때와 동일

# 1D array
arr = np.arange(4)

row_vec = arr[np.newaxis, :]
print(row_vec.shape)

col_vec = arr[:, np.newaxis]
print(col_vec.shape)

(1, 4)
(4, 1)


### (2) Array Handling

#### Arithmetic Operation

In [213]:
sample_array_2 = np.array([1, 2, 3, 4, 5])
sample_array_2

array([1, 2, 3, 4, 5])

In [214]:
sample_array_2 + 2 # 모든 원소에 적용

array([3, 4, 5, 6, 7])

In [215]:
sample_array_2 * 2

array([ 2,  4,  6,  8, 10])

#### Reshape

In [216]:
array1 = np.arange(10)
print('array1:\n', array1)

array2 = array1.reshape(2,5)
print('array2:\n',array2)

array3 = array1.reshape(5,2)
print('array3:\n',array3)

array1:
 [0 1 2 3 4 5 6 7 8 9]
array2:
 [[0 1 2 3 4]
 [5 6 7 8 9]]
array3:
 [[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]


In [217]:
array1 = np.arange(10)
print(array1)

array2 = array1.reshape(-1, 5) # -가 아닌 shape를 우선 맞춤
print('array2 shape:', array2.shape)

array3 = array1.reshape(5, -1)
print('array3 shape:', array3.shape)

[0 1 2 3 4 5 6 7 8 9]
array2 shape: (2, 5)
array3 shape: (5, 2)


In [218]:
array1 = np.arange(10)
array4 = array1.reshape(-1, 5)

In [219]:
array1 = np.arange(8)
array3d = array1.reshape((2, 2, 2))
print('array3d:\n', array3d.tolist())

# 3차원 ndarray를 2차원 ndarray로 변환
array5 = array3d.reshape(-1, 1)
print('array5:\n', array5.tolist())
print('array5 shape:', array5.shape)

# 1차원 ndarray를 2차원 ndarray로 변환
array6 = array1.reshape(-1, 1)
print('array6:\n', array6.tolist())
print('array6 shape:', array6.shape)

array3d:
 [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
array5:
 [[0], [1], [2], [3], [4], [5], [6], [7]]
array5 shape: (8, 1)
array6:
 [[0], [1], [2], [3], [4], [5], [6], [7]]
array6 shape: (8, 1)


In [220]:
### ravel
# ravel은 flatten과 유사. 대신 원본을 복사하지 않음
data = pd.read_csv('./ADP_book_ver01/data/insurance.csv')
y = np.array(data['charges'])
y = y.reshape(len(y), 1)

# ravel 전후 비교
print(y.shape, y.ravel().shape)

(1338, 1) (1338,)


#### Concat, Split

In [221]:
### 배열 합치기
np.vstack([sample_array, sample_array_2]) # 행으로 합치기 -> 열 shape가 동일해야 함

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [ 1,  2,  3,  4,  5]])

In [222]:
np.hstack([sample_array.reshape(-1, 2), sample_array_2.reshape(-1, 1)]) # 열로 합치기 

array([[ 1,  2,  1],
       [ 3,  4,  2],
       [ 5,  6,  3],
       [ 7,  8,  4],
       [ 9, 10,  5]])

In [223]:
sample_array.reshape(1, 10)

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])

In [224]:
sample_array_2.reshape(-1, 1)

array([[1],
       [2],
       [3],
       [4],
       [5]])

In [225]:
a = np.array([[0, 1, 2], [3, 4, 5]])
b = np.array([[6, 7, 8], [9, 10, 11]])

print(np.concatenate((a, b), axis=0)) # 행으로 합치기
print(np.concatenate((a, b), axis=1)) # 열로 합치기

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[[ 0  1  2  6  7  8]
 [ 3  4  5  9 10 11]]


In [226]:
### 배열 분리
a = np.array([1, 2, 3, 4, 5, 6])

np.array_split(a, 4)
# np.split(a, 4) # 동일 기능이나, 배열이 동일 개수로 안 나눠지면 에러 발생

[array([1, 2]), array([3, 4]), array([5]), array([6])]

### (3) Array Exploring

#### Indexing, Slicing

In [227]:
# 1에서 부터 9 까지의 1차원 ndarray 생성 
array1 = np.arange(start=1, stop=10)
print('array1:', array1)

# index는 0 부터 시작하므로 array1[2]는 3번째 index 위치의 데이터 값을 의미
value = array1[2]
print('value:', value)
print(type(value))

array1: [1 2 3 4 5 6 7 8 9]
value: 3
<class 'numpy.int64'>


In [228]:
# 값 치환
array1[0] = 9
array1[8] = 0
print('array1:', array1)

array1: [9 2 3 4 5 6 7 8 0]


In [229]:
print('맨 뒤의 값:', array1[-1], ', 맨 뒤에서 두번째 값:', array1[-2])

맨 뒤의 값: 0 , 맨 뒤에서 두번째 값: 8


In [230]:
d1_array = np.array([1, 2, 3, 4, 5])
d1_array

array([1, 2, 3, 4, 5])

In [231]:
d1_array[0]

1

In [232]:
d1_array[1:3]

array([2, 3])

In [233]:
d2_array = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
d2_array

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [234]:
d2_array[0, 3]

4

In [235]:
d2_array[1, 2:4]

array([8, 9])

In [236]:
array1d = np.arange(start=1, stop=10)
array2d = array1d.reshape(3, 3)
print(array2d)

print('(row=0,col=0) index 가리키는 값:', array2d[0, 0])
print('(row=0,col=1) index 가리키는 값:', array2d[0, 1])
print('(row=1,col=0) index 가리키는 값:', array2d[1, 0])
print('(row=2,col=2) index 가리키는 값:', array2d[2, 2])

[[1 2 3]
 [4 5 6]
 [7 8 9]]
(row=0,col=0) index 가리키는 값: 1
(row=0,col=1) index 가리키는 값: 2
(row=1,col=0) index 가리키는 값: 4
(row=2,col=2) index 가리키는 값: 9


In [237]:
### Fancy indexing
array1d = np.arange(start=1, stop=10)
array2d = array1d.reshape(3, 3)

array3 = array2d[[0, 1], 2]
print('array2d[[0, 1], 2] => ', array3.tolist())

array4 = array2d[[0, 1], 0:2]
print('array2d[[0, 1], 0:2] => ', array4.tolist())

array5 = array2d[[0, 1]]
print('array2d[[0, 1]] => ', array5.tolist())

array2d[[0, 1], 2] =>  [3, 6]
array2d[[0, 1], 0:2] =>  [[1, 2], [4, 5]]
array2d[[0, 1]] =>  [[1, 2, 3], [4, 5, 6]]


In [238]:
### Boolean indexing
array1d = np.arange(start=1, stop=10)
# [ ] 안에 array1d > 5 Boolean indexing을 적용 
array3 = array1d[array1d > 5]
print('array1d > 5 불린 인덱싱 결과 값 :', array3)

array1d > 5 불린 인덱싱 결과 값 : [6 7 8 9]


In [239]:
array1d > 5

array([False, False, False, False, False,  True,  True,  True,  True])

In [240]:
boolean_indexes = np.array([False, False, False, False, False,  True,  True,  True,  True])
array3 = array1d[boolean_indexes]
print('불린 인덱스로 필터링 결과 :', array3)

불린 인덱스로 필터링 결과 : [6 7 8 9]


In [241]:
indexes = np.array([5,6,7,8])
array4 = array1d[indexes]
print('일반 인덱스로 필터링 결과 :', array4)

일반 인덱스로 필터링 결과 : [6 7 8 9]


In [242]:
array1 = np.arange(start=1, stop=10)
array3 = array1[0:3]
print(array3)

[1 2 3]


In [243]:
array1 = np.arange(start=1, stop=10)
array4 = array1[:3]
print(array4)

array5 = array1[3:]
print(array5)

array6 = array1[:]
print(array6)

[1 2 3]
[4 5 6 7 8 9]
[1 2 3 4 5 6 7 8 9]


In [244]:
array1d = np.arange(start=1, stop=10)
array2d = array1d.reshape(3,3)
print('array2d:\n',array2d)

print('array2d[0:2, 0:2] \n', array2d[0:2, 0:2])
print('array2d[1:3, 0:3] \n', array2d[1:3, 0:3])
print('array2d[1:3, :] \n', array2d[1:3, :])
print('array2d[:, :] \n', array2d[:, :])
print('array2d[:2, 1:] \n', array2d[:2, 1:])
print('array2d[:2, 0] \n', array2d[:2, 0])

array2d:
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
array2d[0:2, 0:2] 
 [[1 2]
 [4 5]]
array2d[1:3, 0:3] 
 [[4 5 6]
 [7 8 9]]
array2d[1:3, :] 
 [[4 5 6]
 [7 8 9]]
array2d[:, :] 
 [[1 2 3]
 [4 5 6]
 [7 8 9]]
array2d[:2, 1:] 
 [[2 3]
 [5 6]]
array2d[:2, 0] 
 [1 4]


In [245]:
print(array2d[0])
print(array2d[1])
print('array2d[0] shape:', array2d[0].shape, 'array2d[1] shape:', array2d[1].shape )

[1 2 3]
[4 5 6]
array2d[0] shape: (3,) array2d[1] shape: (3,)


#### Explore with Conditions
- Select, Where

In [246]:
### select
condition_list = [(students['국어'] >= 90), (students['국어'] >= 80) & (students['국어'] < 90), (students['국어'] >= 70) & (students['국어'] < 80)]
choice_list = ['A', 'B', 'C']

np.select(condition_list, choice_list, default='F')

array(['C', 'B', 'A', 'A', 'F', 'B', 'F'], dtype='<U1')

In [247]:
students['점수'] = np.select(condition_list, choice_list, default='F')
students

Unnamed: 0,이름,국어,수학,합격,점수
0,장화,70.0,65.0,Fail,C
1,홍련,85.0,100.0,Pass,B
2,콩쥐,90.0,80.0,Pass,A
3,팥쥐,100.0,95.0,Pass,A
4,해님,60.0,90.0,Fail,F
5,달님,85.0,70.0,Pass,B
6,별님,50.0,60.0,Fail,F


In [248]:
### where
# excel의 if와 유사
np.where(students['국어'] >=90, 'P', 'F')

array(['F', 'F', 'P', 'P', 'F', 'F', 'F'], dtype='<U1')

#### Sort

In [249]:
org_array = np.array([3, 1, 9, 5]) 
print('원본 행렬:', org_array)

# np.sort()로 정렬: 원본은 그대로 두고 정렬본 반환
sort_array1 = np.sort(org_array)         
print ('np.sort() 호출 후 반환된 정렬 행렬:', sort_array1) 
print('np.sort ) 호출 후 원본 행렬:', org_array)

# ndarray.sort()로 정렬: 원본 변경
sort_array2 = org_array.sort()
print('org_array.sort() 호출 후 반환된 행렬:', sort_array2)
print('org_array.sort() 호출 후 원본 행렬:', org_array)

원본 행렬: [3 1 9 5]
np.sort() 호출 후 반환된 정렬 행렬: [1 3 5 9]
np.sort ) 호출 후 원본 행렬: [3 1 9 5]
org_array.sort() 호출 후 반환된 행렬: None
org_array.sort() 호출 후 원본 행렬: [1 3 5 9]


In [250]:
sort_array1_desc = np.sort(org_array)[::-1]
print('내림차순으로 정렬:', sort_array1_desc)

내림차순으로 정렬: [9 5 3 1]


In [251]:
array2d = np.array([[8, 12], [7, 1]])

sort_array2d_axis0 = np.sort(array2d, axis=0)
print('로우 방향으로 정렬:\n', sort_array2d_axis0)

sort_array2d_axis1 = np.sort(array2d, axis=1)
print('컬럼 방향으로 정렬:\n', sort_array2d_axis1)

로우 방향으로 정렬:
 [[ 7  1]
 [ 8 12]]
컬럼 방향으로 정렬:
 [[ 8 12]
 [ 1  7]]


In [252]:
### argsort: Sort index
org_array = np.array([ 3, 1, 9, 5]) 
sort_indices = np.argsort(org_array)
print(type(sort_indices))
print('행렬 정렬 시 원본 행렬의 인덱스:', sort_indices)

<class 'numpy.ndarray'>
행렬 정렬 시 원본 행렬의 인덱스: [1 0 3 2]


In [253]:
org_array = np.array([3, 1, 9, 5]) 
sort_indices_desc = np.argsort(org_array)[::-1]
print('행렬 내림차순 정렬 시 원본 행렬의 인덱스:', sort_indices_desc)

행렬 내림차순 정렬 시 원본 행렬의 인덱스: [2 3 0 1]


In [254]:
name_array = np.array(['John', 'Mike', 'Sarah', 'Kate', 'Samuel'])
score_array= np.array([78, 95, 84, 98, 88])

sort_indices_asc = np.argsort(score_array)
print('성적 오름차순 정렬 시 score_array의 인덱스:', sort_indices_asc)
print('성적 오름차순으로 name_array의 이름 출력:', name_array[sort_indices_asc])

성적 오름차순 정렬 시 score_array의 인덱스: [0 2 4 1 3]
성적 오름차순으로 name_array의 이름 출력: ['John' 'Sarah' 'Samuel' 'Mike' 'Kate']


### (4) Numpy Functions

#### Special Values

In [255]:
### Infty
# 무한대 값
np.infty

inf

In [256]:
### NaN
# Missing Value 값
np.NaN

nan

In [257]:
### isnan
# Missing Value인지 여부
print(np.isnan(np.NaN))
print(np.isnan(3))

True
False


#### Comparison

In [258]:
### allclose
# 허용 오차 내에서, 두 배열이 같은지 확인
print(np.allclose([1e10, 1e-7], [1.00001e10, 1e-8]))
print(np.allclose([1e10, 1e-8], [1.00001e10, 1e-9]))
print(np.allclose([1e10, 1e-8], [1.0001e10, 1e-9]))
print(np.allclose([1.0, np.nan], [1.0, np.nan]))
print(np.allclose([1.0, np.nan], [1.0, np.nan], equal_nan=True)) # 동시에 NaN이면 같은 것으로 취급

False
True
False
False
True


In [259]:
### in1d
# 첫번째 배열이 두번째 배열의 원소를 포함하고 있는지 여부의 불리언 배열 반환
x = np.array([1, 2, 3, 4, 5, 6])
y = np.array([2, 4])

np.in1d(x, y)

array([False,  True, False,  True, False, False])

In [260]:
### intersect1d. union1d,  setdiff1d, detxor1d
x = np.array([1, 2, 3, 4])
y = np.array([3, 4, 6, 5])

print(np.intersect1d(x, y)) # 두 개의 배열 x, y 의 교집합을 정렬하여 반환
print(np.union1d(x, y)) # 두 개의 배열 x, y의 합집합을 정렬하여 
print(np.setdiff1d(x, y)) # 첫번째 배열 x로 부터 두번째 배열 y를 뺀 차집합을 반환
print(np.setxor1d(x, y)) # 두 배열 x, y의 합집합에서 교집합을 뺀 대칭차집합을 반환

[3 4]
[1 2 3 4 5 6]
[1 2]
[1 2 5 6]


#### Statictics

In [261]:
### unique
# unique 원소 반환
np.unique([5, 5, 3, 3, 4])

array([3, 4, 5])

In [262]:
### mean, median, std, var, quantile, percentile
a = np.array([-1.722, -1.5423, -0.332, 0.223, 1.4355, 1.337, 2.0432])

print(np.mean(a))
print(np.median(a))
print(np.std(a))
print(np.var(a))
print(np.quantile(a, 0.25))
print(np.percentile(a, 25))

0.20605714285714288
0.223
1.3747521832338083
1.8899435653061225
-0.9371499999999999
-0.9371499999999999


In [263]:
### rint, rount, ceil, floor
# 각 요소 반올림

print(np.rint(a)) # 정수 반올림
print(np.round(a, 2)) # 특정 자릿수 반올림
print(np.ceil(a)) # 올림
print(np.floor(a)) # 내림

[-2. -2. -0.  0.  1.  1.  2.]
[-1.72 -1.54 -0.33  0.22  1.44  1.34  2.04]
[-1. -1. -0.  1.  2.  2.  3.]
[-2. -2. -1.  0.  1.  1.  2.]


In [264]:
### cov, corrcoef
# 공분산과 상관계수 행렬
b = np.array([-1, -2, 0, 0.223, 3, 1, 2.04])

print(np.cov(b, a))
print(np.corrcoef(b, a))

[[2.94838414 2.33259876]
 [2.33259876 2.20493416]]
[[1.         0.91485063]
 [0.91485063 1.        ]]


In [265]:
### power, square, sqrt, exp, expm1, log, log1p, log10
# 거듭제곱, 제곱, 제곱근, 지수변환, -1 지수변환, 로그변환, +1 로그변환, 밑 10 로그변환
print(np.power([2, 3], 2))
print(np.square([2, 3]))
print(np.sqrt([2, 3]))
print(np.exp([2, 3]))
print(np.expm1([2, 3]))
print(np.log([2, 3]))
print(np.log1p([2, 3]))
print(np.log10([2, 3]))

[4 9]
[4 9]
[1.41421356 1.73205081]
[ 7.3890561  20.08553692]
[ 6.3890561  19.08553692]
[0.69314718 1.09861229]
[1.09861229 1.38629436]
[0.30103    0.47712125]


In [266]:
### percentile
# 백분위수
np.percentile(x, 50)

2.5

In [267]:
### sign, abs
# 부호, 절대값
print(np.sign(a))
print(np.abs(a))

[-1. -1. -1.  1.  1.  1.  1.]
[1.722  1.5423 0.332  0.223  1.4355 1.337  2.0432]


In [268]:
### argmin, argmax
# 최소, 최대의 index 반환
x = np.array([5, 4, 3, 2, 1, 0])

print(np.argmin(x)) # x.argmin()
print(np.argmax(x)) # x.argmax()

5
0


In [269]:
### argwhere
# 특정 값을 만족시키는 index 배열
x = np.array([5, 4, 3, 2, 1, 0, 2])
np.argwhere(x==2)

array([[3],
       [6]])

In [270]:
### polyfit
# np.polyfit(x, y, degree)
x, y = iris['sepal_length'], iris['petal_length']
b1, b0 = np.polyfit(x, y, 1)
b1, b0 # 높은 차원부터 출력

(-1.2714082952215775, 6.671660621353109)

#### Sampling

In [271]:
### random.rand
# parameter shape로 random array 생성
print(np.random.rand(40, 1).shape)
print(np.random.rand(20, 2).shape)
print(np.random.rand(20, 2, 2).shape)

(40, 1)
(20, 2)
(20, 2, 2)


In [272]:
### random.randn
# parameter shape로 정규분포 sampling
np.random.randn(2, 4)

array([[ 0.18653633, -0.60878079,  1.64681873,  0.53989565],
       [-1.29761006,  0.52449927, -0.49114375,  1.33413813]])

In [273]:
### random.randint
# 주어진 범위에서 n개의 sample 추출하여 array 생성
np.random.randint(1, 10, 3) # 1~10, 3개

array([7, 5, 7])

In [274]:
### random.normal
# 평균과 표준편차를 갖는 정규분포에서 특정 Shape의 array 생성 
np.random.normal(0, 1, 5000)

array([ 0.03494328, -1.45061003,  0.28488795, ..., -0.36068054,
        0.28642379, -0.98135417])

In [275]:
### random.uniform
# 각 인수를 범위로 갖는 이산균일분포에서 특정 Shape의 array 생성
np.random.uniform(1, 10, (2, 2))

array([[8.69527661, 9.11008462],
       [5.89332035, 5.00210961]])

#### Matrix

In [276]:
### transpose
A = np.array([[1, 2], [3, 4]])
transpose_mat = np.transpose(A)
print('A의 전치 행렬:\n', transpose_mat)

A의 전치 행렬:
 [[1 3]
 [2 4]]


In [277]:
# 내적 곱
test_arr = np.array([[1, 0, 1], [2, 2, 2]])
np.dot(test_arr, test_arr.T)

array([[ 2,  4],
       [ 4, 12]])

In [278]:
A = np.array([[1, 2, 3], [4, 5, 6]])
B = np.array([[7, 8], [9, 10], [11, 12]])

dot_product = np.dot(A, B)
print('행렬 내적 결과:\n', dot_product)

행렬 내적 결과:
 [[ 58  64]
 [139 154]]


In [279]:
# 행렬 곱: 2차원에서는 동일 기능 수행
np.matmul(test_arr, test_arr.T)

array([[ 2,  4],
       [ 4, 12]])

In [280]:
# 대각 요소에 특정 값 채우기
test_arr = np.array([[1, 0], [2, 2]])
np.fill_diagonal(test_arr, 5)
test_arr

array([[5, 0],
       [2, 5]])

In [281]:
### diag
# 대각 행렬 요소 반환
np.diag(test_arr)

array([5, 5])

#### Others

In [282]:
### set_printoptions
np.set_printoptions(precision=3) # 소수점 둘째자리까지 표현

In [283]:
### to_frame
# Series to Dataframe
pd.Series(iris['petal width level']).value_counts().to_frame()

Unnamed: 0,petal width level
middle,52
short,50
long,48
