In [6]:
import pandas as pd

## Data Frame

In [3]:
# The simplest data frame possible: every dict key becomes a column and
# the list under it supplies that column's values.
pd.DataFrame(
    data={
        'Yes': [50, 21],
        'No': [131, 2],
    }
)

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [4]:
# Cell values are not limited to numbers - strings work just as well.
pd.DataFrame(
    data={
        'Bob': ['I liked it.', 'It was awful.'],
        'Sue': ['Pretty good.', 'Bland.'],
    }
)

Unnamed: 0,Bob,Sue
0,I liked it.,Pretty good.
1,It was awful.,Bland.


In [5]:
# Passing `index=` lets us choose the row labels ourselves instead of
# getting the default 0, 1, ...
pd.DataFrame(
    data={
        'Bob': ['I liked it.', 'It was awful.'],
        'Sue': ['Pretty good.', 'Bland.'],
    },
    index=['Product A', 'Product B'],
)

Unnamed: 0,Bob,Sue
Product A,I liked it.,Pretty good.
Product B,It was awful.,Bland.


## Series

In [7]:
# A Series, by contrast, is a single sequence of data values:
# if a DataFrame is a table, a Series is a list.
pd.Series(data=[1, 2, 3, 4, 5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [8]:
# A Series can carry its own index labels and an overall name.
pd.Series(
    data=[30, 35, 40],
    index=['2015 Sales', '2016 Sales', '2017 Sales'],
    name='Product A',
)

2015 Sales    30
2016 Sales    35
2017 Sales    40
Name: Product A, dtype: int64

## A couple of useful functions

In [1]:
# *.shape - an attribute (no parentheses) holding the dimensions as a (rows, columns) tuple
# pd.read_csv() - this function accepts over 30 optional arguments, so it's worth checking its documentation!
# *.head() - to see the first couple of rows along with headers

## Data frame and series manipulation

In [4]:
# Small two-column frame reused by the selection examples that follow.
df = pd.DataFrame(
    data={
        'Yes': [50, 21],
        'No': [131, 2],
    }
)

In [6]:
# Attribute-style access: the 'Yes' column comes back as a Series.
df.Yes

0    50
1    21
Name: Yes, dtype: int64

In [7]:
# Dictionary-style access returns the same 'Yes' Series; this form also
# works for column names that are not valid Python identifiers.
df['Yes']

0    50
1    21
Name: Yes, dtype: int64

In [8]:
# Indexing the column first yields a Series; the second `[0]` then picks
# a single value out of that Series.
df['Yes'][0]

50

In [13]:
# A slice instead of a single key keeps the result a Series;
# `0:` means "from the first entry to the end".
df['Yes'][0:]

0    50
1    21
Name: Yes, dtype: int64

## Index-based selection

In [14]:
# `iloc` selects by integer position, so this returns the first row
# (as a Series whose labels are the column names).
df.iloc[0]

Yes     50
No     131
Name: 0, dtype: int64

In [15]:
# `iloc` takes rows first, then columns: `:` keeps every row and `0`
# picks the first column.
df.iloc[:, 0]

0    50
1    21
Name: Yes, dtype: int64

In [16]:
# Omitting the row selector (`df.iloc[, 0]`) is not valid Python, so the
# statement cannot even be parsed. Compiling it from a string lets this cell
# show the error without aborting a "Restart & Run All".
invalid_selector = "df.iloc[, 0]"
try:
    compile(invalid_selector, "<demo>", "eval")
except SyntaxError as err:
    print(f"SyntaxError: {err.msg}")

SyntaxError: invalid syntax (<ipython-input-16-a31e63b9cbf8>, line 1)

In [21]:
# The row selector can also be an explicit list of positions; here rows
# 0 and 1 of the first column are returned.
df.iloc[[0, 1], 0]

0    50
1    21
Name: Yes, dtype: int64

In [22]:
# Negative positions count from the end: `df.iloc[-x:]` returns the last
# x rows (here x = 1), still as a DataFrame.
df.iloc[-1:]

Unnamed: 0,Yes,No
1,21,2


## Label-based selection

In [24]:
# The difference between `iloc` and `loc`: `iloc` selects by integer
# position, whereas `loc` selects by index and column labels.
df.loc[0, 'No']

131

In [25]:
# Label slices are inclusive on both ends, so `0:1` returns the rows
# labelled 0 and 1.
df.loc[0:1, 'No']

0    131
1      2
Name: No, dtype: int64

In [26]:
# A single row label plus a single column label returns one scalar value.
df.loc[1, 'Yes']

21

In [27]:
# `:` keeps every row; a list of column labels returns a DataFrame with
# those columns.
df.loc[:, ['Yes', 'No']]

Unnamed: 0,Yes,No
0,50,131
1,21,2


### Worth noting

iloc uses the Python stdlib indexing scheme, where the first element of the range is included and the last one excluded. So 0:10 will select entries 0,...,9. loc, meanwhile, indexes inclusively. So 0:10 will select entries 0,...,10.

## Conditional selection

In [30]:
# `df.Yes == 50` produces a boolean Series; passing it to `loc` keeps only
# the rows where it is True. (Plain `df[mask]` accepts the same mask, while
# `iloc` does not take a boolean Series.)
df.loc[df.Yes == 50]

Unnamed: 0,Yes,No
0,50,131


In [31]:
# Conditions can be combined with `&` (and) and `|` (or). Each condition
# needs its own parentheses because `&` and `|` bind tighter than the
# comparison operators.
df.loc[(df.Yes == 50) & (df.No > 150)]

Unnamed: 0,Yes,No


In [48]:
# Add a string column to `df` so there is something categorical to filter on.
df['Cat'] = ['Miau', 'Hau']

# pandas comes with a handy method for this kind of column, `.isin()`:
# it is True for the rows whose value appears in the given list.
df.loc[df['Cat'].isin(['Miau'])]

Unnamed: 0,Yes,No,Cat
0,50,131,Miau


In [49]:
# The 'Yes' column itself is unchanged by the filtering above.
df.Yes

0    50
1    21
Name: Yes, dtype: int64

In [52]:
# Another pair of useful methods is `.isnull()` and `.notnull()`.
# Only the last expression of a cell is rendered automatically, so both
# results are passed to `display()` explicitly (available without an import
# inside IPython/Jupyter); otherwise the `.isnull()` result is silently
# dropped.
display(df.loc[df['Cat'].isnull()])

print('## Break ##')

display(df.loc[df['Cat'].notnull()])

## Break ##


Unnamed: 0,Yes,No,Cat
0,50,131,Miau
1,21,2,Hau
