In [1]:
import numpy as np
import pandas as pd

### 1. pandas 자료구조
pandas는 Series와 DataFrame, 두 가지 자료구조를 사용한다.
#### 1-1. series

In [3]:
# Build a Series from a plain Python list.
# No index is given, so pandas labels the rows 0..3 by default.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
# Inspect the underlying data as a NumPy array.
# `values` is an attribute, not a method, so no parentheses are needed.
obj.values

array([ 4,  7, -5,  3])

In [7]:
# Inspect the index (row labels) of the Series; like `values`, it is an attribute.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Inspect the element type stored in the Series.
obj.dtype

dtype('int64')

In [9]:
# Build a second Series with explicit string labels
# instead of the default integer index.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('abcd'))
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [12]:
# A dictionary can seed a Series directly:
# the keys become the index labels and the values become the data.
sdata = dict(a=100, b=200, c=1, d=100)
obj3 = pd.Series(sdata)
obj3

a    100
b    200
c      1
d    100
dtype: int64

In [13]:
# Label both the index and the Series itself.
# The index name is printed above the labels; the Series name appears in the footer.
obj3.index.name = 'Names'
obj3.name = 'salary'
obj3

Names
a    100
b    200
c      1
d    100
Name: salary, dtype: int64

In [17]:
# Replace the index labels in place.
# The new index is built together with its name, because assigning a fresh
# index would otherwise leave the index unnamed.
obj3.index = pd.Index(['A', 'B', 'C', 'D'], name='Names')
obj3

Names
A    100
B    200
C      1
D    100
Name: salary, dtype: int64

#### 1-2 Data Frame 
A DataFrame can be created from a Python dictionary or from a NumPy array.

In [32]:
# Build a DataFrame from a dictionary of equal-length lists:
# each key becomes a column, each list supplies that column's values.
# Note that 'year' holds strings here, not integers.
data = dict(
    name=['Daeun', 'Junyoung', 'Dajun'],
    year=['1992', '1993', '2030'],
    points=[123, 100, 111],
)
df = pd.DataFrame(data)
df

Unnamed: 0,name,year,points
0,Daeun,1992,123
1,Junyoung,1993,100
2,Dajun,2030,111


In [33]:
# Row labels of the DataFrame.
df.index

RangeIndex(start=0, stop=3, step=1)

In [34]:
# Column labels of the DataFrame.
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [35]:
# The cell contents as a 2-D NumPy array (row and column labels are dropped).
df.to_numpy()

array([['Daeun', '1992', 123],
       ['Junyoung', '1993', 100],
       ['Dajun', '2030', 111]], dtype=object)

In [37]:
# Name both axes: 'Num' for the row index, 'Info' for the column index.
df.index.name, df.columns.name = 'Num', 'Info'
df

Info,name,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Daeun,1992,123
1,Junyoung,1993,100
2,Dajun,2030,111


In [40]:
# Column order and row labels can be fixed at construction time.
# 'penalty' is not a key of `data`, so that column is created empty (NaN).
df2 = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three'],
    columns=['year', 'name', 'points', 'penalty'],
)
df2

Unnamed: 0,year,name,points,penalty
one,1992,Daeun,123,
two,1993,Junyoung,100,
three,2030,Dajun,111,


If a name listed in `columns` is not a key of the dictionary (here `penalty`), the column is still created and filled with NaN.
Columns may be listed in any order: each one is matched to the dictionary by key, not by position.

### 3. DataFrame Indexing

In [58]:
# Rebuild `data` and `df` for the indexing examples: five rows labelled
# 'one'..'five'. 'penalty' is not a key of `data`, so it starts out as NaN.
data = dict(
    names=["Kilho", "Kilho", "Kilho", "Charles", "Charles"],
    year=[2014, 2015, 2016, 2015, 2016],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)
df = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,
two,2015,Kilho,1.7,
three,2016,Kilho,3.6,
four,2015,Charles,2.4,
five,2016,Charles,2.9,


#### 3-1 handling a Column

In [59]:
# Attribute-style access: returns the 'year' column as a Series.
df.year

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [60]:
# Bracket access with a single label returns the same Series as df.year.
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [61]:
# A list of labels inside the brackets selects several columns and returns a DataFrame.
df[['year', 'names']]

Unnamed: 0,year,names
one,2014,Kilho
two,2015,Kilho
three,2016,Kilho
four,2015,Charles
five,2016,Charles


In [62]:
# Assign a scalar to the 'penalty' column; the value is broadcast to every row.
# (Attribute-style `df.penalty = 0.5` is an alternative here because the column already exists.)
df['penalty'] = 0.5
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.5
two,2015,Kilho,1.7,0.5
three,2016,Kilho,3.6,0.5
four,2015,Charles,2.4,0.5
five,2016,Charles,2.9,0.5


In [64]:
df.penalty = [0.1, 0.2, 0.3, 0.4, 0.5]  # 'penalty' already exists, so attribute assignment overwrites its values
# Attribute assignment cannot create a NEW column; this line would not add 'zeros' to df:
#     df.zeros = np.arange(5)
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.1
two,2015,Kilho,1.7,0.2
three,2016,Kilho,3.6,0.3
four,2015,Charles,2.4,0.4
five,2016,Charles,2.9,0.5


'.'을 이용해 열을 고칠 수 있다. 그러나 '.'을 이용해 새롭게 열을 추가하는 것은 안됨.

새롭게 열을 추가하려면 df['new column'] 을 활용해야 함

In [65]:
# Bracket assignment creates the new column 'zeros' from a NumPy array.
# NOTE: despite its name, the column holds np.arange(5), i.e. 0..4, not zeros.
df['zeros'] = np.arange(5)
df

Unnamed: 0,year,names,points,penalty,zeros
one,2014,Kilho,1.5,0.1,0
two,2015,Kilho,1.7,0.2,1
three,2016,Kilho,3.6,0.3,2
four,2015,Charles,2.4,0.4,3
five,2016,Charles,2.9,0.5,4


In [68]:
# Assigning a Series as a column aligns it with df on index labels:
# only rows 'two', 'four' and 'five' receive a value, the rest become NaN.
tmp = pd.Series({'two': -2, 'four': -4, 'five': -1})
df['debt'] = tmp
df

Unnamed: 0,year,names,points,penalty,zeros,debt
one,2014,Kilho,1.5,0.1,0,
two,2015,Kilho,1.7,0.2,1,-2.0
three,2016,Kilho,3.6,0.3,2,
four,2015,Charles,2.4,0.4,3,-4.0
five,2016,Charles,2.9,0.5,4,-1.0


When a Series is assigned as a new column, its values are aligned on the index labels; rows whose label is missing from the Series get NaN.

In [96]:
# Derive new columns from existing ones:
#   net_points  - element-wise arithmetic (points minus penalty)
#   high_points - element-wise comparison, giving a boolean column
df['net_points'] = df['points'].sub(df['penalty'])
df['high_points'] = df['net_points'].gt(2.0)
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,high_points
one,2014,Kilho,1.5,0.1,0,,1.4,False
two,2015,Kilho,1.7,0.2,1,-2.0,1.5,False
three,2016,Kilho,3.6,0.3,2,,3.3,True
four,2015,Charles,2.4,0.4,3,-4.0,2.0,False
five,2016,Charles,2.9,0.5,4,-1.0,2.4,True


In [98]:
# Remove the two helper columns from df in place.
# (`del df['col']`, one statement per column, has the same effect.)
df.drop(columns=['high_points', 'zeros'], inplace=True)
df

Unnamed: 0,year,names,points,penalty,debt,net_points
one,2014,Kilho,1.5,0.1,,1.4
two,2015,Kilho,1.7,0.2,-2.0,1.5
three,2016,Kilho,3.6,0.3,,3.3
four,2015,Charles,2.4,0.4,-4.0,2.0
five,2016,Charles,2.9,0.5,-1.0,2.4
