### 판다스, 넘파이 응용

판다스의 기본 자료구조인 DataFrame과 Series는 각 데이터의 타입으로 NumPy 타입을 사용한다.

In [22]:
import pandas as pd
import numpy as np


In [23]:
# Create a Series holding 10 samples drawn from the standard normal distribution
s1 = pd.Series(np.random.standard_normal(10))
s1

0    1.082963
1   -1.232899
2   -0.262479
3   -0.439726
4   -2.003497
5    0.540173
6    0.066913
7   -1.413351
8   -0.285707
9   -1.067068
dtype: float64

In [24]:
# Drop the sign / multiply by 10 to scale the values up / round away the fractional part
# Series arithmetic is vectorized, so `* 10` scales every element without a Python-level lambda
s2 = (s1.abs() * 10).round()    # abs => absolute value
s2

0    11.0
1    12.0
2     3.0
3     4.0
4    20.0
5     5.0
6     1.0
7    14.0
8     3.0
9    11.0
dtype: float64

In [25]:
# NumPy functions accept a Series directly; this is the arithmetic mean of s2
np.mean(s2)

8.4

In [26]:
# Print a summary of the Series: index range, non-null count, dtype and memory usage
s2.info()

<class 'pandas.core.series.Series'>
RangeIndex: 10 entries, 0 to 9
Series name: None
Non-Null Count  Dtype  
--------------  -----  
10 non-null     float64
dtypes: float64(1)
memory usage: 212.0 bytes


In [27]:
# The most basic summary-statistics function for data in pandas
s2.describe()
# std => standard deviation

count    10.000000
mean      8.400000
std       6.113737
min       1.000000
25%       3.250000
50%       8.000000
75%      11.750000
max      20.000000
dtype: float64

In [28]:
# Repeat the pattern 1, 3, 5, NaN five times -> 20 entries, 5 of which are missing
s3 = pd.Series(data=np.tile([1, 3, 5, np.nan], 5))
s3

0     1.0
1     3.0
2     5.0
3     NaN
4     1.0
5     3.0
6     5.0
7     NaN
8     1.0
9     3.0
10    5.0
11    NaN
12    1.0
13    3.0
14    5.0
15    NaN
16    1.0
17    3.0
18    5.0
19    NaN
dtype: float64

In [29]:
# Same pattern with None instead of np.nan: the None entries are displayed as NaN
# and the dtype is still float64
pd.Series(data=[1, 3, 5, None] * 5)

0     1.0
1     3.0
2     5.0
3     NaN
4     1.0
5     3.0
6     5.0
7     NaN
8     1.0
9     3.0
10    5.0
11    NaN
12    1.0
13    3.0
14    5.0
15    NaN
16    1.0
17    3.0
18    5.0
19    NaN
dtype: float64

In [30]:
# Summary of s3: the non-null count (15) is lower than the number of entries (20)
s3.info()

<class 'pandas.core.series.Series'>
RangeIndex: 20 entries, 0 to 19
Series name: None
Non-Null Count  Dtype  
--------------  -----  
15 non-null     float64
dtypes: float64(1)
memory usage: 292.0 bytes


In [31]:
# Because of the missing values, the statistics cover only 15 of the 20 entries
s3.describe()

count    15.000000
mean      3.000000
std       1.690309
min       1.000000
25%       1.000000
50%       3.000000
75%       5.000000
max       5.000000
dtype: float64

In [32]:
# Frequency of each distinct value
# dropna=False   -> NaN gets its own row in the result
# normalize=True -> report proportions instead of raw counts
s3.value_counts(dropna=False, normalize=True)

1.0    0.25
3.0    0.25
5.0    0.25
NaN    0.25
Name: proportion, dtype: float64

In [33]:
# Check for missing values: isnull() flags them and sum() counts the flagged entries
s3.isnull().sum()

5

### 데이터 프레임과 넘파이

In [34]:
# Build a DataFrame of random sample data -> you will rarely construct one by hand like this
# Same idea as generating sample data with Bogus in C#
size = 20
# One Generator, one vectorized draw per column (size=...) instead of a Python-level
# call per element; the upper bound of integers() is exclusive.
rng = np.random.default_rng()
df1 = pd.DataFrame(data={
    'class': rng.choice(['A', 'B', 'C', 'D', 'F'], size=size),
    'year': rng.integers(2010, 2024, size=size),
    'month': rng.integers(1, 13, size=size),
    'val1': rng.integers(1, 11, size=size),
    'val2': rng.integers(100, 1000, size=size),
    'val3': rng.integers(10000, 20000, size=size),
})
df1

Unnamed: 0,class,year,month,val1,val2,val3
0,B,2020,8,9,594,12435
1,D,2019,7,1,695,16471
2,B,2018,1,9,826,11548
3,B,2023,9,6,802,17700
4,F,2020,10,4,716,12688
5,B,2022,6,4,863,17346
6,B,2014,1,2,809,19097
7,A,2014,11,4,951,16942
8,B,2010,2,5,201,16341
9,B,2019,4,2,484,13901


In [35]:
# shape means different things for 2-D and 1-D data; for this DataFrame it is (rows, columns)
df1.shape

(20, 6)

In [36]:
# Per-column summary of the DataFrame: non-null count and dtype of every column
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   20 non-null     object
 1   year    20 non-null     int64 
 2   month   20 non-null     int64 
 3   val1    20 non-null     int64 
 4   val2    20 non-null     int64 
 5   val3    20 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 1.1+ KB


In [37]:
# Summary statistics for the numeric columns (the object-typed 'class' column is left out)
df1.describe()

Unnamed: 0,year,month,val1,val2,val3
count,20.0,20.0,20.0,20.0,20.0
mean,2017.25,6.5,4.5,676.6,14889.25
std,4.278157,3.516877,2.46021,231.818305,3127.684024
min,2010.0,1.0,1.0,201.0,10039.0
25%,2014.0,3.75,2.75,525.25,12477.75
50%,2019.0,6.5,4.0,731.0,15121.0
75%,2020.0,10.0,6.0,835.25,17393.75
max,2023.0,12.0,9.0,993.0,19885.0
