# Pandas: The Series Data Structure

In [None]:
# A Series is a cross between a LIST and a DICTIONARY.
# A Series has indexes like a list and labels like a dictionary

#              Animals    <----- Name/Label
# Index --> 0   Dog
#           1   Cat   <--- Values
#           2   Bear


# Labels and data can be of mixed data types (e.g. int and string)

In [1]:
import pandas as pd

In [3]:
# Wrap a plain Python list in a Series; no name is passed, so the Series name stays None.
animals = pd.Series(['Tiger', 'Bear', 'Moose'])
print(animals)  # check the dtype reported on the last line

0    Tiger
1     Bear
2    Moose
dtype: object


In [4]:
# A list of whole numbers produces an integer-typed Series
numbers = [1, 2, 3]
pd.Series(data=numbers)

0    1
1    2
2    3
dtype: int64

In [7]:
# Building a Series from a dictionary: the keys become the index labels
# and the values become the data.
sports = dict(
    Archery='Bhutan',
    Golf='Scotland',
    Sumo='Japan',
    Taekwondo='South Korea',
)
s = pd.Series(data=sports)
print(s)

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object


In [9]:
# The index holds the labels, which here are the keys of the dictionary
print(s.index)

Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')


In [12]:
# Look up a value by its index label
print(s['Archery'])

Bhutan


In [13]:
# Look up the first value by position. Use .iloc for this: a bare integer in
# s[0] on a string-labelled Series only works through a positional fallback
# of the [] operator, which pandas has deprecated.
s.iloc[0]

'Bhutan'

## How Numpy and Pandas Deal with Null Data

In [5]:
# How NumPy (and hence pandas) represents missing data:
# a None mixed in with strings gives a Series of the 'object' dtype.
animals = ['Tiger', 'Bear', None]
pd.Series(data=animals)

0    Tiger
1     Bear
2     None
dtype: object

In [6]:
# A None in a list of integers is stored as NaN (Not a Number), a special
# floating-point value, so the dtype of the whole Series becomes float.
# NaN is NOT the same thing as None.
numbers = [1, 2, None]
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [20]:
import numpy as np

# NaN is not equal to None and is not even equal to itself,
# so np.isnan() is the way to test for it.
print(np.nan == None, np.nan == np.nan, np.isnan(np.nan), sep='\n')

False
False
True


In [21]:
# None has its own dedicated type
type(None)

NoneType

In [22]:
# NaN, by contrast, is an ordinary float value
type(np.nan)

float

## Querying a Series

In [23]:
import pandas as pd

# Rebuild the sports Series used by the querying examples that follow
sports = dict(
    Archery='Bhutan',
    Golf='Scotland',
    Sumo='Japan',
    Taekwondo='South Korea',
)
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [24]:
# Fetch the fourth value by position. The position goes through .iloc because
# a bare integer in s[3] on a string-labelled Series depends on a positional
# fallback of the [] operator, which pandas has deprecated.
s.iloc[3]

'South Korea'

In [25]:
# With a string key, [] looks the value up by index label
s['Golf']

'Scotland'

In [26]:
# iloc is an attribute, not a method: it exposes a position-based indexer,
# so the position goes in square brackets rather than parentheses.
s.iloc[3]

'South Korea'

In [27]:
# loc is the label-based counterpart of iloc
s.loc['Golf']

'Scotland'

In [28]:
# The indexing operator, [], can slice and can select rows and columns too, but not both simultaneously.
# This is why we need iloc and loc

In [29]:
# Draw 10,000 random integers from 0 to 999. A fixed seed on a local
# RandomState makes the sample identical on every Restart & Run All,
# without touching NumPy's global random state.
x = pd.Series(np.random.RandomState(0).randint(0, 1000, 10000))
x.head()

0    707
1    185
2    224
3    768
4    951
dtype: int32

In [31]:
np.sum(x)

# x.sum() and sum(x) work too
# Note on parallel computing
# Modern computers can handle many calculations simultaneously (vectorization)
# You should avoid looping whenever you can
# numpy supports vectorization for almost all functions in the library, including the sum() function

4982288

In [34]:
# Broadcasting: the scalar 1 is added to every element of x.
# NOTE(review): += changes x itself, so every re-run of this cell adds another 1.
x += 1
x.head()

0    710
1    188
2    227
3    771
4    954
dtype: int32

In [35]:
# Display s as a reference point before records are appended to it
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [36]:
# APPENDING records to s: first build the new records as their own Series,
# pairing each value with an explicit index label.
s2 = pd.Series(
    data=['Canada', 'Malaysia'],
    index=['Hockey', 'Badminton'],
)
s2

Hockey         Canada
Badminton    Malaysia
dtype: object

In [37]:
# Combine s and s2 into a new Series. pd.concat is used because
# Series.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# The result is a new object; neither s nor s2 is modified.
s_s2 = pd.concat([s, s2])
s_s2

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Hockey            Canada
Badminton       Malaysia
dtype: object

In [38]:
# s itself is unchanged: appending produced a new Series rather than modifying s
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [39]:
# s2 is unchanged as well
s2

Hockey         Canada
Badminton    Malaysia
dtype: object