# Pandas `Series`

In [2]:
import pandas as pd
pd.Series?

In [8]:
animals = ['Tiger','Bear','Moose']
pd.Series(animals)

0    Tiger
1     Bear
2    Moose
dtype: object

In [6]:
numbers = [3,1,4]
pd.Series(numbers)

0    3
1    1
2    4
dtype: int64

In [9]:
animals = ['Tiger','Bear', None]
pd.Series(animals)

0    Tiger
1     Bear
2     None
dtype: object

In [21]:
numbers = [3, 1, None]
pd.Series(numbers)

RangeIndex(start=0, stop=3, step=1)

In [11]:
import numpy as np
np.nan == None

False

In [12]:
if(np.nan):
    print ('check')

check


In [13]:
np.nan == np.nan

False

In [14]:
np.isnan(np.nan)

True

In [16]:
numbers = [3,1,4,'Bear', None]
pd.Series(numbers)

0       3
1       1
2       4
3    Bear
4    None
dtype: object

In [18]:
np.isnan('hoi')

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [19]:
sports = {'Achery':'Bhutan',
         'Golf':'Scotland',
         'Sumo':'Japan',
         'Taekwondo':'South Korea'}
s = pd.Series(sports)
s

Achery            Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [20]:
s.index

Index(['Achery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [29]:
pd.Series(['Tiger','Bear','Moose'], index=['India','India', 1]).index

Index(['India', 'India', 1], dtype='object')

In [31]:
sports = {'Achery':'Bhutan',
         'Golf':'Scotland',
         'Sumo':'Japan',
         'Taekwondo':'South Korea'}
s = pd.Series(sports, index=['Golf','Sumo','Hockey'])
s

Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

## Querying a `Series` object

In [32]:
sports = {'Achery':'Bhutan',
         'Golf':'Scotland',
         'Sumo':'Japan',
         'Taekwondo':'South Korea'}
s = pd.Series(sports)
s

Achery            Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [60]:
s.iloc[ [1,3] ]

Golf            Scotland
Taekwondo    South Korea
dtype: object

In [59]:
s.iloc[ 0:3 ]

Achery      Bhutan
Golf      Scotland
Sumo         Japan
dtype: object

In [62]:
s.loc[ ['Achery','Sumo'] ]

Achery    Bhutan
Sumo       Japan
dtype: object

In [71]:
s = pd.Series( data=[100, 120, 101, 3])
s

0    100
1    120
2    101
3      3
dtype: int64

In [74]:
%%timeit
sum(s)

940 ns ± 5.52 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [75]:
%%timeit
tot = 0
for i in s:
    tot += i
    
tot

997 ns ± 9.23 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [77]:
%%timeit
import numpy as np
tot = np.sum(s)
tot

16.4 µs ± 100 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [98]:
s = pd.Series(np.random.randint(0,1000,100))
s.head()

0    964
1    374
2    543
3    571
4    908
dtype: int64

In [79]:
len(s)

10000

In [84]:
%%timeit -n 100
tot = 0
for x in s:
    tot += x

    tot

700 µs ± 142 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [85]:
%%timeit -n 100
tot = np.sum(s)
tot

The slowest run took 4.08 times longer than the fastest. This could mean that an intermediate result is being cached.
60.7 µs ± 23.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [93]:
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
all_countries = original_sports.append(cricket_loving_countries)
all_countries

  all_countries = original_sports.append(cricket_loving_countries)


Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [94]:
all_countries.loc['Cricket']

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [97]:
d = {'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'}
d['Archery'] = 'Groningen'
d


{'Archery': 'Groningen',
 'Golf': 'Scotland',
 'Sumo': 'Japan',
 'Taekwondo': 'South Korea'}

## The `DataFrame` type

In [127]:
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
purchase_4 = pd.Series({'Name': 'Karel',
                        'Item Purchased': 'Aston Martin',
                        'Cost': 97})

df = pd.DataFrame(data=[purchase_1, purchase_2, purchase_3, purchase_4], index=['Store 1', 'Store 2', 'Store 3', 'Store 4'])
df.head()

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0
Store 4,Karel,Aston Martin,97.0


In [115]:
df.loc[ ['Store 1', 'Store 3'] ]

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 3,Vinod,Bird Seed,5.0


In [117]:
df.loc['Store 1']['Cost']

22.5

In [123]:
df.T

Unnamed: 0,Store 1,Store 2,Store 3
Name,Chris,Kevyn,Vinod
Item Purchased,Dog Food,Kitty Litter,Bird Seed
Cost,22.5,2.5,5.0


In [124]:
df.T.loc['Name']

Store 1    Chris
Store 2    Kevyn
Store 3    Vinod
Name: Name, dtype: object

In [130]:
df.shape

(4, 3)

In [136]:
arr = np.array( [[1,2,3], [2,4,6], [2,3,5], [1,3,5]] )
arr

array([[1, 2, 3],
       [2, 4, 6],
       [2, 3, 5],
       [1, 3, 5]])

In [138]:
arr.shape

(4, 3)

In [147]:
arr[0:2, :]

array([[1, 2, 3],
       [2, 4, 6]])

In [152]:
df.loc[['Store 1','Store 3'], ['Name', 'Cost']]

Unnamed: 0,Name,Cost
Store 1,Chris,22.5
Store 3,Vinod,5.0


In [155]:
df.drop('Store 4')

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0


In [160]:
df2 = df.copy()
df2.drop('Store 1', inplace=True)

In [162]:
df2

Unnamed: 0,Name,Item Purchased,Cost
Store 2,Kevyn,Kitty Litter,2.5
Store 3,Vinod,Bird Seed,5.0
Store 4,Karel,Aston Martin,97.0


In [166]:
df2.drop('Name', inplace=True, axis=1)

In [168]:
df2

Unnamed: 0,Item Purchased,Cost
Store 3,Bird Seed,5.0
Store 4,Aston Martin,97.0


## Data loading

In [174]:
df = pd.read_csv('data/olympics.csv', header=0, skiprows=1)
df.head()

Unnamed: 0.1,Unnamed: 0,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !.1,02 !.1,03 !.1,Total.1,№ Games,01 !.2,02 !.2,03 !.2,Combined total
0,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
1,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
2,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
3,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
4,Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


In [177]:
df.columns

Index(['Unnamed: 0', '№ Summer', '01 !', '02 !', '03 !', 'Total', '№ Winter',
       '01 !.1', '02 !.1', '03 !.1', 'Total.1', '№ Games', '01 !.2', '02 !.2',
       '03 !.2', 'Combined total'],
      dtype='object')

In [178]:
for col in df.columns:
    if col[:2]=='01':
        df.rename(columns={col:'Gold' + col[4:]}, inplace=True)
    if col[:2]=='02':
        df.rename(columns={col:'Silver' + col[4:]}, inplace=True)
    if col[:2]=='03':
        df.rename(columns={col:'Bronze' + col[4:]}, inplace=True)
    if col[:1]=='№':
        df.rename(columns={col:'#' + col[1:]}, inplace=True) 

df.columns

Index(['Unnamed: 0', '# Summer', 'Gold', 'Silver', 'Bronze', 'Total',
       '# Winter', 'Gold.1', 'Silver.1', 'Bronze.1', 'Total.1', '# Games',
       'Gold.2', 'Silver.2', 'Bronze.2', 'Combined total'],
      dtype='object')

### Alles in een list-comprehension

In [180]:
l = ['Gold','Silver','Bronze']
[l[int(x[:2])-1]+x[4:] for x in df.columns if x[:2] in ['01','02','03']]

[]

## Querying a DataFrame

In [183]:
df

Unnamed: 0.1,Unnamed: 0,# Summer,Gold,Silver,Bronze,Total,# Winter,Gold.1,Silver.1,Bronze.1,Total.1,# Games,Gold.2,Silver.2,Bronze.2,Combined total
0,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
1,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
2,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
3,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
4,Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Independent Olympic Participants (IOP) [IOP],1,0,1,2,3,0,0,0,0,0,1,0,1,2,3
143,Zambia (ZAM) [ZAM],12,0,1,1,2,0,0,0,0,0,12,0,1,1,2
144,Zimbabwe (ZIM) [ZIM],12,3,4,1,8,1,0,0,0,0,13,3,4,1,8
145,Mixed team (ZZX) [ZZX],3,8,5,4,17,0,0,0,0,0,3,8,5,4,17


In [190]:
only_gold = df.where(df['Gold']>0)
only_gold.dropna(inplace=True)
only_gold.head()

Unnamed: 0.1,Unnamed: 0,# Summer,Gold,Silver,Bronze,Total,# Winter,Gold.1,Silver.1,Bronze.1,Total.1,# Games,Gold.2,Silver.2,Bronze.2,Combined total
1,Algeria (ALG),12.0,5.0,2.0,8.0,15.0,3.0,0.0,0.0,0.0,0.0,15.0,5.0,2.0,8.0,15.0
2,Argentina (ARG),23.0,18.0,24.0,28.0,70.0,18.0,0.0,0.0,0.0,0.0,41.0,18.0,24.0,28.0,70.0
3,Armenia (ARM),5.0,1.0,2.0,9.0,12.0,6.0,0.0,0.0,0.0,0.0,11.0,1.0,2.0,9.0,12.0
4,Australasia (ANZ) [ANZ],2.0,3.0,4.0,5.0,12.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,4.0,5.0,12.0
5,Australia (AUS) [AUS] [Z],25.0,139.0,152.0,177.0,468.0,18.0,5.0,3.0,4.0,12.0,43.0,144.0,155.0,181.0,480.0


In [199]:
len(df.where((df['Gold']>0) & (df['Gold.1']>0)).dropna()['Unnamed: 0'].unique())

37