In [1]:
# The pandas package

# Core classes
# 1. Series: a one-dimensional labeled array built on NumPy's ndarray; all of its elements share a single dtype
# 2. DataFrame: a two-dimensional table whose columns are Series; different columns may have different dtypes, but each column holds a single dtype

import pandas as pd

s = pd.Series([10,20,30], index=['a','b','c'])
s

a    10
b    20
c    30
dtype: int64

In [2]:
# Wrap the Series in a one-column frame labelled 'A'; the row labels of s carry over.
df = pd.DataFrame({'A': s})
df

Unnamed: 0,A
a,10
b,20
c,30


In [3]:
# __class__ reports the concrete type of the object (the same thing type(s) returns).
s.__class__

pandas.core.series.Series

In [4]:
# The one-column frame is a DataFrame, not a Series.
df.__class__

pandas.core.frame.DataFrame

In [5]:
# Values only, as a NumPy array; the index labels are not part of the result.
s.to_numpy()

array([10, 20, 30])

In [6]:
# A frame converts to a 2-D array, even when it has just one column.
df.to_numpy()

array([[10],
       [20],
       [30]])

In [7]:
# The array behind the Series is one-dimensional: 3 elements.
s.to_numpy().shape

(3,)

In [8]:
# The array behind the frame is two-dimensional: 3 rows by 1 column.
df.to_numpy().shape

(3, 1)

In [9]:
# Row labels of the Series.
s.index

Index(['a', 'b', 'c'], dtype='object')

In [10]:
# Row labels of the frame; they match the labels of the Series it was built from.
df.index

Index(['a', 'b', 'c'], dtype='object')

In [11]:
# Column labels of the frame.
df.columns

Index(['A'], dtype='object')

In [12]:
# Two Series of unequal length: the frame's rows are the union of their index
# labels, and 'one' has no value for 'c', so that cell comes out as NaN.
d = dict(
    one=pd.Series([1, 2], index=list('ab')),
    two=pd.Series([1, 2, 3], index=list('abc')),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,,3


In [13]:
import numpy as np

# 2x2 integer grid holding 0..3, wrapped in a frame with explicit row labels
# (1, 2) and column labels ('A', 'B').
x = np.arange(4).reshape(2, 2)
df = pd.DataFrame(x, index=[1, 2], columns=['A', 'B'])
df

Unnamed: 0,A,B
1,0,1
2,2,3


In [14]:
# Load the CSV file into a frame; the relative path is resolved against the
# current working directory.
faithful = pd.read_csv('faithful.csv')
faithful.__class__

pandas.core.frame.DataFrame

In [15]:
# (number of rows, number of columns)
faithful.shape

(272, 2)

In [16]:
# Write the frame back out as CSV. With the default settings the row index is
# written too, as an extra unnamed first column; pass index=False to leave it out.
faithful.to_csv('./new_faithful.csv')

In [17]:
import statsmodels.api as sm

# Fetch three classic R datasets; the frame itself sits under the 'data' key
# of each returned dataset object.
trees, mtcars, iris = (
    sm.datasets.get_rdataset(name)['data'] for name in ('trees', 'mtcars', 'iris')
)

# Each one is an ordinary pandas DataFrame (type(obj) is the same as obj.__class__).
for frame in (trees, mtcars, iris):
    print(type(frame))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [19]:
# Four small Series reused by the cells that follow.
s1 = pd.Series([1.0, 2.0, 3.0])                  # default positional index 0..2
sa = pd.Series({'a': 1.0, 'b': 2.0, 'c': 3.0})   # label index
sc = pd.Series({'a': 1.0, 'b': 1.0, 'd': 2.0})   # shares only 'a' and 'b' with sa
sn = pd.Series([1, 2, 1, 3, 3, 5, 3, 4])         # integers with repeated values

s1

0    1.0
1    2.0
2    3.0
dtype: float64

In [20]:
s1 - 2 # broadcasting: the scalar is subtracted from every element

0   -1.0
1    0.0
2    1.0
dtype: float64

In [21]:
sa

a    1.0
b    2.0
c    3.0
dtype: float64

In [22]:
sc

a    1.0
b    1.0
d    2.0
dtype: float64

In [23]:
# Addition aligns on index labels: a label present in only one operand
# ('c' from sa, 'd' from sc) comes out as NaN.
sac = sa+sc
sac

a    2.0
b    3.0
c    NaN
d    NaN
dtype: float64

In [24]:
# Handling missing values (NaN) in a Series

# isnull() gives a boolean mask that is True where the value is missing.
sac.isnull()

a    False
b    False
c     True
d     True
dtype: bool

In [25]:
# The complement of isnull(): True where a value is present.
sac.notnull()

a     True
b     True
c    False
d    False
dtype: bool

In [26]:
# Returns a new Series with every NaN replaced by -1.0; sac itself is not modified.
sac.fillna(-1.0)

a    2.0
b    3.0
c   -1.0
d   -1.0
dtype: float64

In [27]:
# Keep only the entries whose value is not NaN.
sx = sac.dropna()
sx

a    2.0
b    3.0
dtype: float64

In [28]:
# Stack sx on top of the first two entries of sc. Series.append was deprecated
# in pandas 1.4 and removed in 2.0, so pd.concat is used instead. The result is
# the same: index labels are kept as they are, so 'a' and 'b' each appear twice.
sy = pd.concat([sx, sc[:2]])
sy

  sy = sx.append(sc[:2])


a    2.0
b    3.0
a    1.0
b    1.0
dtype: float64

In [29]:
# drop removes every entry carrying the label, so both 'a' rows go.
sy.drop('a')

b    3.0
b    1.0
dtype: float64

In [30]:
## Summarising a numeric Series
# head() shows the first five values by default.
sn.head()

0    1
1    2
2    1
3    3
4    3
dtype: int64

In [31]:
# Count, mean, standard deviation, min/max and quartiles in one call.
sn.describe()

count    8.00000
mean     2.75000
std      1.38873
min      1.00000
25%      1.75000
50%      3.00000
75%      3.25000
max      5.00000
dtype: float64

In [32]:
# Distinct values, in order of first appearance (not sorted).
list(sn.unique())

[1, 2, 3, 5, 4]

In [33]:
# Replace every 1 and every 2 with 0; all other values are left as they are.
sn.replace([1,2],0)

0    0
1    0
2    0
3    3
4    3
5    5
6    3
7    4
dtype: int64

In [34]:
# Take an independent copy of the first four rows of trees and relabel
# those rows with letters.
tree4 = trees.iloc[:4].copy()
tree4.index = list('BECA')
tree4

Unnamed: 0,Girth,Height,Volume
B,8.3,70,10.3
E,8.6,65,10.3
C,8.8,63,10.2
A,10.5,72,16.4


In [35]:
# Indexing with a boolean frame keeps the shape; cells where the condition is
# False become NaN.
tree4[tree4>10]

Unnamed: 0,Girth,Height,Volume
B,,70,10.3
E,,65,10.3
C,,63,10.2
A,10.5,72,16.4


In [36]:
# Indexing with a boolean Series filters rows: only rows with Height above 65 remain.
tree4[tree4.Height>65]

Unnamed: 0,Girth,Height,Volume
B,8.3,70,10.3
A,10.5,72,16.4


In [37]:
# A slice inside [] selects rows; a label slice includes both endpoints.
tree4['E':'C']

Unnamed: 0,Girth,Height,Volume
E,8.6,65,10.3
C,8.8,63,10.2


In [38]:
tree4[['Girth', 'Height']] # column selection

Unnamed: 0,Girth,Height
B,8.3,70
E,8.6,65
C,8.8,63
A,10.5,72


In [39]:
tree4.iloc[[0,2], [0,1]] # select by integer position

Unnamed: 0,Girth,Height
B,8.3,70
C,8.8,63


In [40]:
# A positional slice excludes its stop position: :2 means rows 0 and 1.
tree4.iloc[:2, [0,1]]

Unnamed: 0,Girth,Height
B,8.3,70
E,8.6,65


In [41]:
# Slicing the columns (0:1) keeps the result a one-column frame.
tree4.iloc[:2, 0:1]

Unnamed: 0,Girth
B,8.3
E,8.6


In [42]:
tree4.loc['E':'C'] # select by label; both endpoints are included

Unnamed: 0,Girth,Height,Volume
E,8.6,65,10.3
C,8.8,63,10.2


In [43]:
# Lists of row labels and column labels pick exactly those rows and columns.
tree4.loc[['E','C'], ['Girth', 'Height']]

Unnamed: 0,Girth,Height
E,8.6,65
C,8.8,63


In [44]:
# Label slices in .loc include both endpoints, for rows and columns alike.
tree4.loc['E':'C', 'Girth':'Height']

Unnamed: 0,Girth,Height
E,8.6,65
C,8.8,63


In [45]:
# Transpose: rows become columns and columns become rows.
tree4.T

Unnamed: 0,B,E,C,A
Girth,8.3,8.6,8.8,10.5
Height,70.0,65.0,63.0,72.0
Volume,10.3,10.3,10.2,16.4


In [46]:
# Rows ordered by index label; a new frame is returned and tree4 keeps its order.
tree4.sort_index()

Unnamed: 0,Girth,Height,Volume
A,10.5,72,16.4
B,8.3,70,10.3
C,8.8,63,10.2
E,8.6,65,10.3


In [47]:
# Rows ordered by ascending Height.
tree4.sort_values('Height')

Unnamed: 0,Girth,Height,Volume
C,8.8,63,10.2
E,8.6,65,10.3
B,8.3,70,10.3
A,10.5,72,16.4


In [48]:
# Column-wise means (the default axis): one value per column.
tree4.mean()

Girth      9.05
Height    67.50
Volume    11.80
dtype: float64

In [49]:
# Passing 1 as the axis gives one mean per row instead.
tree4.mean(1)

B    29.533333
E    27.966667
C    27.333333
A    32.966667
dtype: float64

In [50]:
# Running total down each column.
tree4.cumsum()

Unnamed: 0,Girth,Height,Volume
B,8.3,70,10.3
E,16.9,135,20.6
C,25.7,198,30.8
A,36.2,270,47.2


In [51]:
# apply calls the function on each column; with np.cumsum the result matches tree4.cumsum().
tree4.apply(np.cumsum)

Unnamed: 0,Girth,Height,Volume
B,8.3,70,10.3
E,16.9,135,20.6
C,25.7,198,30.8
A,36.2,270,47.2


In [52]:
# A function that reduces a column to a scalar yields a Series with one entry per column.
tree4.apply(lambda x: x.max())

Girth     10.5
Height    72.0
Volume    16.4
dtype: float64

In [53]:
# Work on an independent copy of four mtcars columns.
mtx = mtcars[['mpg', 'cyl', 'hp', 'gear']].copy()
mtx.head()

Unnamed: 0_level_0,mpg,cyl,hp,gear
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mazda RX4,21.0,6,110,4
Mazda RX4 Wag,21.0,6,110,4
Datsun 710,22.8,4,93,4
Hornet 4 Drive,21.4,6,110,3
Hornet Sportabout,18.7,8,175,3


In [54]:
# Contingency table: number of cars for each (cyl, gear) combination.
pd.crosstab(mtx.cyl, mtx.gear)

gear,3,4,5
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,1,8,2
6,2,4,1
8,12,0,2


In [55]:
# Mean of the remaining columns (mpg, hp) per (cyl, gear) group; the group keys
# become a two-level row index. Only combinations that occur in the data get a row.
mtn = mtx.groupby(by=['cyl', 'gear']).mean()
mtn

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,hp
cyl,gear,Unnamed: 2_level_1,Unnamed: 3_level_1
4,3,21.5,97.0
4,4,26.925,76.0
4,5,28.2,102.0
6,3,19.75,107.5
6,4,19.75,116.5
6,5,19.7,175.0
8,3,15.05,194.166667
8,5,15.4,299.5


In [56]:
# Move index level 0 (cyl) into the columns; the combination that never occurs
# (cyl=8, gear=4) shows up as NaN.
mtn['mpg'].unstack(0)

cyl,4,6,8
gear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,21.5,19.75,15.05
4,26.925,19.75,
5,28.2,19.7,15.4


In [57]:
# Rebuild the per-row values of index level 0 (cyl) by hand.
# levels[0] holds the distinct values of the level; codes[0] holds, for each
# row, the position of that row's value within levels[0].
# Shortcut with the same result: mi.get_level_values(0).to_numpy()
mi = mtn.index
lvs0 = mi.levels[0]
cds0 = mi.codes[0]
lvx0 = lvs0[cds0].values
lvx0

array([4, 4, 4, 6, 6, 6, 8, 8])

In [58]:
# Same reconstruction for index level 1 (gear): look up each row's code in
# the level's distinct values.
lvs1 = mi.levels[1]
cds1 = mi.codes[1]
lvx1 = lvs1[cds1].values
lvx1

array([3, 4, 5, 3, 4, 5, 3, 5])

In [59]:
# Flatten the grouped means into an ordinary frame with one row per
# (cyl, gear) pair, using the index level names as column names.
# Shortcut with the same result: mtn['mpg'].reset_index()
mpgx = mtn['mpg'].to_numpy()
d = {mi.names[0]: lvx0, mi.names[1]: lvx1, 'mpg':mpgx}
mpga = pd.DataFrame(d)
mpga

Unnamed: 0,cyl,gear,mpg
0,4,3,21.5
1,4,4,26.925
2,4,5,28.2
3,6,3,19.75
4,6,4,19.75
5,6,5,19.7
6,8,3,15.05
7,8,5,15.4


In [60]:
# Reshape long -> wide: one row per gear, one column per cyl, mpg in the cells.
# This reproduces the table obtained from mtn['mpg'].unstack(0).
mpga.pivot(index='gear', columns='cyl', values='mpg')

cyl,4,6,8
gear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,21.5,19.75,15.05
4,26.925,19.75,
5,28.2,19.7,15.4
