### 판다스, 넘파이 응용

판다스의 기본 자료구조인 DataFrame과 Series는 각 데이터의 타입으로 NumPy 타입을 사용한다.

In [1]:
import pandas as pd
import numpy as np

#### 시리즈와 넘파이

In [7]:
# Create a Series from 100 samples drawn from the standard normal distribution
s1 = pd.Series(np.random.randn(100))
s1

0    -1.323445
1    -0.681144
2    -1.076769
3    -0.388929
4    -0.980213
        ...   
95    0.844174
96   -0.824904
97   -0.654466
98    1.255201
99    2.314792
Length: 100, dtype: float64

In [8]:
# Drop the sign, multiply by 10 to scale the values up, then round away the decimal part.
# Series arithmetic is vectorized, so `* 10` scales every element without a per-element lambda.
s2 = (s1.abs() * 10).round()
s2

0     13.0
1      7.0
2     11.0
3      4.0
4     10.0
      ... 
95     8.0
96     8.0
97     7.0
98    13.0
99    23.0
Length: 100, dtype: float64

In [9]:
# A NumPy function can be applied to a Series directly
np.mean(s2)

7.78

In [11]:
# Basic information about a Series / DataFrame (index range, non-null count, dtype, memory usage)
s2.info()

<class 'pandas.core.series.Series'>
RangeIndex: 100 entries, 0 to 99
Series name: None
Non-Null Count  Dtype  
--------------  -----  
100 non-null    float64
dtypes: float64(1)
memory usage: 932.0 bytes


In [13]:
# pandas' basic descriptive-statistics function (count, mean, std, min, quartiles, max)
s2.describe()

count    100.000000
mean       7.780000
std        5.902542
min        0.000000
25%        3.000000
50%        7.000000
75%       11.000000
max       34.000000
dtype: float64

In [14]:
# Series of 20 values: the pattern 1, 3, 5, NaN repeated five times
s3 = pd.Series(np.tile([1, 3, 5, np.nan], 5))
s3

0     1.0
1     3.0
2     5.0
3     NaN
4     1.0
5     3.0
6     5.0
7     NaN
8     1.0
9     3.0
10    5.0
11    NaN
12    1.0
13    3.0
14    5.0
15    NaN
16    1.0
17    3.0
18    5.0
19    NaN
dtype: float64

In [16]:
# info() reports the non-null count separately from the total number of entries
s3.info()

<class 'pandas.core.series.Series'>
RangeIndex: 20 entries, 0 to 19
Series name: None
Non-Null Count  Dtype  
--------------  -----  
15 non-null     float64
dtypes: float64(1)
memory usage: 292.0 bytes


In [18]:
# Because of the missing values, the statistics cover only 15 of the 20 entries
s3.describe()

count    15.000000
mean      3.000000
std       1.690309
min       1.000000
25%       1.000000
50%       3.000000
75%       5.000000
max       5.000000
dtype: float64

In [23]:
# Frequency of each distinct value.
# Options: dropna=False also counts NaN; normalize=True returns proportions instead of raw counts
s3.value_counts(dropna=True, normalize=True)

1.0    0.333333
3.0    0.333333
5.0    0.333333
Name: proportion, dtype: float64

In [71]:
# shape of a Series is a one-element tuple: (number of entries,)
s3.shape

(20,)

In [26]:
# Missing-value check: isnull() flags each missing entry, and sum() counts the flags
s3.isnull().sum()

5

#### 데이터프레임과 넘파이

In [74]:
# Build a DataFrame of random sample data -- in practice you rarely create one by hand like this.
# (Same idea as generating fake sample data with Bogus in C#.)
size = 10
df1 = pd.DataFrame(data={
    # `size=` draws a whole column in one call, so no per-row Python loop is needed.
    # randint's upper bound is exclusive; dtype is pinned to int64 so the integer
    # columns have the same dtype on every platform.
    'class': np.random.choice(['A', 'B', 'C', 'D', 'F'], size=size),
    'year': np.random.randint(2010, 2024, size=size, dtype=np.int64),
    'month': np.random.randint(1, 13, size=size, dtype=np.int64),
    'val1': np.random.randint(1, 11, size=size, dtype=np.int64),
    'val2': np.random.randint(100, 1000, size=size, dtype=np.int64),
    'val3': np.random.randint(10000, 20000, size=size, dtype=np.int64),
})
df1

Unnamed: 0,class,year,month,val1,val2,val3
0,F,2020,7,3,800,10143
1,B,2019,10,2,421,12406
2,D,2010,9,8,468,13861
3,D,2014,5,1,750,14314
4,F,2023,7,9,420,13439
5,B,2014,11,3,797,13649
6,F,2021,6,5,159,11746
7,F,2019,10,7,140,12251
8,B,2013,1,10,660,17156
9,B,2010,8,3,836,10580


In [76]:
# shape means something different for 2-D and 1-D data:
# a DataFrame gives (rows, columns), whereas a Series gives (length,)
df1.shape

(10, 6)

In [77]:
# Per-column summary: non-null count and dtype of every column, plus memory usage
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   10 non-null     object
 1   year    10 non-null     int64 
 2   month   10 non-null     int64 
 3   val1    10 non-null     int64 
 4   val2    10 non-null     int64 
 5   val3    10 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 612.0+ bytes


In [78]:
# Descriptive statistics per numeric column; the non-numeric 'class' column is left out
df1.describe()

Unnamed: 0,year,month,val1,val2,val3
count,10.0,10.0,10.0,10.0,10.0
mean,2016.3,7.4,5.1,545.1,12954.5
std,4.667857,2.951459,3.17805,262.130353,2023.16239
min,2010.0,1.0,1.0,140.0,10143.0
25%,2013.25,6.25,3.0,420.25,11872.25
50%,2016.5,7.5,4.0,564.0,12922.5
75%,2019.75,9.75,7.75,785.25,13808.0
max,2023.0,11.0,10.0,836.0,17156.0
