## Pandas

### Series

In [1]:
import pandas as pd

In [2]:
# build a Series from a list of values, labelling each one via index
ser = pd.Series(
    data=[1, 2, 3],
    index=['a', 'b', 'c'],
)
ser

a    1
b    2
c    3
dtype: int64

In [3]:
# build the same Series from a dict: keys become the labels, values the data
label_to_value = {
    'a': 1,
    'b': 2,
    'c': 3,
}
ser = pd.Series(label_to_value)
ser

a    1
b    2
c    3
dtype: int64

In [4]:
# the labels of the Series, as an Index object
ser.index

Index(['a', 'b', 'c'], dtype='object')

In [5]:
# the data of the Series, as a NumPy array
# NOTE(review): newer pandas docs recommend .to_numpy() over .values — confirm the pandas version before switching
ser.values

array([1, 2, 3], dtype=int64)

In [6]:
# look up a single value by its label
ser['a']

1

In [7]:
# look up a single value by its integer position (0 = first entry)
ser.iloc[0]

1

In [8]:
# remove entries by label; drop() returns a new Series, so the name is rebound
labels_to_remove = ['a', 'c']
ser = ser.drop(labels=labels_to_remove)
ser

b    2
dtype: int64

### DataFrame

In [9]:
# build a DataFrame row by row: each inner list is one row,
# columns / index supply the column and row labels
rows = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
df = pd.DataFrame(
    data=rows,
    index=['idx0', 'idx1'],
    columns=['col0', 'col1', 'col2', 'col3'],
)
df

Unnamed: 0,col0,col1,col2,col3
idx0,1,2,3,4
idx1,5,6,7,8


In [10]:
# build the same DataFrame column by column: each dict key is a column name
# and its list holds that column's values, one per row label in index
column_values = {
    'col0': [1, 5],
    'col1': [2, 6],
    'col2': [3, 7],
    'col3': [4, 8],
}
df = pd.DataFrame(column_values, index=['idx0', 'idx1'])
df

Unnamed: 0,col0,col1,col2,col3
idx0,1,2,3,4
idx1,5,6,7,8


In [11]:
# round-trip through CSV: write df (including its index) to disk, then read it
# back using the first column as the index and the first line as the header
csv_path = 'dummy_data.csv'
df.to_csv(csv_path)
pd.read_csv(csv_path, index_col=0, header=0)

Unnamed: 0,col0,col1,col2,col3
idx0,1,2,3,4
idx1,5,6,7,8


In [12]:
# dimensions of the frame as (number of rows, number of columns)
df.shape

(2, 4)

In [13]:
# add a new column: assign a list holding one value per existing row
df['new_col'] = ['i0', 'i1']
df

Unnamed: 0,col0,col1,col2,col3,new_col
idx0,1,2,3,4,i0
idx1,5,6,7,8,i1


In [14]:
# add a new row: assigning to a row label that does not exist yet appends it;
# the list holds one value per column, in column order
df.loc['idx2'] = [9, 10, 11, 12, 'i2']
df

Unnamed: 0,col0,col1,col2,col3,new_col
idx0,1,2,3,4,i0
idx1,5,6,7,8,i1
idx2,9,10,11,12,i2


In [15]:
# set a new index: promote the 'new_col' column to be the row index.
# set_index() returns a new DataFrame, so the name is rebound instead of
# passing inplace=True (which gives no speed benefit and prevents chaining)
df = df.set_index('new_col')
df

Unnamed: 0_level_0,col0,col1,col2,col3
new_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
i0,1,2,3,4
i1,5,6,7,8
i2,9,10,11,12


In [16]:
# remove index name: clears the 'new_col' label that set_index left on the index
df.index.name = None
df

Unnamed: 0,col0,col1,col2,col3
i0,1,2,3,4
i1,5,6,7,8
i2,9,10,11,12


In [17]:
# drop a column by name; pass a list such as ['col1', 'col3'] to drop several,
# or use index=... instead of columns=... to drop rows
df = df.drop(columns='col3')
df

Unnamed: 0,col0,col1,col2
i0,1,2,3
i1,5,6,7
i2,9,10,11


### Selecting data

In [18]:
import numpy as np

# fix the seed so the random integers (and every output below) are reproducible
np.random.seed(0)

# 4 x 6 table of random integers drawn from 0..9
random_values = np.random.randint(0, 10, (4, 6))
df = pd.DataFrame(
    random_values,
    index=['idx0', 'idx1', 'idx2', 'idx3'],
    columns=['col0', 'col1', 'col2', 'col3', 'col4', 'col5'],
)
df

Unnamed: 0,col0,col1,col2,col3,col4,col5
idx0,5,0,3,3,7,9
idx1,3,5,2,4,7,6
idx2,8,8,1,6,7,7
idx3,8,1,5,9,8,9


In [19]:
# select a column (the result is a Series); equivalent alternatives below
df['col1']
# df.loc[:, 'col1']   # by label
# df.iloc[:, 1]       # by integer position

idx0    0
idx1    5
idx2    8
idx3    1
Name: col1, dtype: int32

In [20]:
# select a row (the result is a Series labelled by column name); equivalent alternatives below
df.loc['idx1']
# df.loc['idx1', :]   # by label, with an explicit "all columns" slice
# df.iloc[1]          # by integer position

col0    3
col1    5
col2    2
col3    4
col4    7
col5    6
Name: idx1, dtype: int32

In [21]:
# select multiple columns: pass a list of names (the result is a DataFrame)
df[['col1', 'col3']]
# df.loc[:, ['col1', 'col3']]   # by label
# df.iloc[:, [1, 3]]            # by integer position

Unnamed: 0,col1,col3
idx0,0,3
idx1,5,4
idx2,8,6
idx3,1,9


In [22]:
# select multiple rows: pass a list of row labels (the result is a DataFrame)
df.loc[['idx1', 'idx3']]
# df.loc[['idx1', 'idx3'], :]   # by label, with an explicit "all columns" slice
# df.iloc[[1, 3]]               # by integer position

Unnamed: 0,col0,col1,col2,col3,col4,col5
idx1,3,5,2,4,7,6
idx3,8,1,5,9,8,9


In [23]:
# select a cell: a single value addressed by (row, column)
df.at['idx1', 'col1'] # faster than loc
# df.iat[1, 1]              # same cell by integer position
# df.loc['idx1', 'col1']    # by label
# df.iloc[1, 1]             # by integer position

5

In [24]:
# set a cell value: assigning through .at modifies df in place
df.at['idx1', 'col1'] = 100
# df.iat[1, 1] = 100   # same assignment by integer position
df

Unnamed: 0,col0,col1,col2,col3,col4,col5
idx0,5,0,3,3,7,9
idx1,3,100,2,4,7,6
idx2,8,8,1,6,7,7
idx3,8,1,5,9,8,9


In [25]:
# select by condition: build a boolean mask (one True/False per row),
# then index with it to keep only the rows where it is True
col0_at_least_5 = df['col0'] >= 5
df[col0_at_least_5]

Unnamed: 0,col0,col1,col2,col3,col4,col5
idx0,5,0,3,3,7,9
idx2,8,8,1,6,7,7
idx3,8,1,5,9,8,9


In [26]:
# select by multiple conditions: build one boolean mask per condition and
# combine them with & (element-wise AND) before indexing
col0_at_least_5 = df['col0'] >= 5
col2_below_5 = df['col2'] < 5
df[col0_at_least_5 & col2_below_5]

Unnamed: 0,col0,col1,col2,col3,col4,col5
idx0,5,0,3,3,7,9
idx2,8,8,1,6,7,7
