# The Series Data Structure

In [1]:
# A Series is a cross between a list and a dictionary. The items are all stored in order, and there are labels with which
# you can retrieve them.

In [32]:
import pandas as pd
import numpy as np
import re

In [3]:
# Passing a list of strings creates a Series; pandas assigns a default integer index starting at 0
students = ['Alice', 'Jack', 'Molly']
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [4]:
# A list of whole numbers produces a Series with an integer dtype
numbers=[1,2,3]
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [5]:
# A missing value (None) in a list of strings is kept as None, and the dtype stays object
students = ['Alice','Jack',None]
pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [12]:
# A missing value (None) in a list of numbers is converted to NaN, and the integers become floats (float64)
numbers=[1,2,None]
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [9]:
# NaN is *NOT* equivalent to None. Moreover, NaN never compares equal to anything, not even to itself
# (np.nan == np.nan evaluates to False), so an equality test cannot be used to detect it.
# Use a dedicated function such as np.isnan() instead.

np.isnan(np.nan)

True

In [13]:
# You can also create a Series from a dictionary: the keys become the index labels
students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}
s = pd.Series(students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [15]:
# The index attribute holds the labels of the Series (here, the dictionary keys)
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [14]:
# A value can be retrieved by its label, just like with a dictionary
s['Alice']

'Physics'

In [16]:
# Each tuple is stored as a single element of the Series, so the dtype is object
students = [('Alice','Brown'),('Jack','White'),('Molly','Green')]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [17]:
# The values and their index labels can also be supplied separately
s = pd.Series(
    data=['Physics', 'Chemistry', 'English'],
    index=['Alice', 'Jack', 'Molly'],
)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [18]:
# If the requested index labels and the dictionary keys do not line up, pandas keeps only the
# entries whose key appears in `index` ('Jack' is dropped here) and fills every requested label
# that has no matching key ('Sam') with a missing value (None or NaN).
students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}

s = pd.Series(students_scores, index = ['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

# Querying a Series

In [20]:
# A pandas Series can be queried either by index position or by index label.
# To query by numeric position (starting at zero), use the iloc attribute.
# To query by index label, use the loc attribute.
# (pandas is already imported as pd in the imports cell at the top of the notebook.)

students_classes = {
    'Alice': 'Physics',
    'Jack': 'Chemistry',
    'Molly': 'English',
    'Sam': 'History',
}
s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [21]:
# Positional lookup: position 3 is the fourth entry (label 'Sam')
s.iloc[3]

'History'

In [22]:
# Label lookup: returns the value stored under the label 'Molly'
s.loc['Molly']

'English'

In [None]:
# Note that iloc and loc are attributes, not methods, so we use the indexing operator [] with them rather than parentheses ()

In [23]:
# Indexing a label-indexed Series directly with an integer (s[3]) used to fall back to a
# positional lookup, but that fallback is deprecated in recent pandas releases.
# Use iloc so the positional intent is explicit and the cell keeps working.
s.iloc[3]

'History'

In [24]:
# Indexing with a string label is equivalent to using the loc attribute
s['Molly']

'English'

In [29]:
# With an integer index, cc[0] is treated as a *label* lookup, not a positional one.
# No label 0 exists here, so the lookup raises KeyError. The error is caught and printed
# so that the notebook still runs from top to bottom.
# When using numeric labels, it's better to call the iloc (or loc) attribute explicitly.
class_code = {
    99: 'Physics',
    100: 'Chemistry',
    101: 'English',
    102: 'History',
}
cc = pd.Series(class_code)
try:
    cc[0]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 0

In [30]:
# iloc is positional, so position 0 returns the first entry even though the labels start at 99
cc.iloc[0]

'Physics'

In [41]:
# Iterating over a Series yields its values; enumerate() supplies the running position,
# so no manual counter is needed. Print the position of every value that matches 'Physics'.
for position, class_name in enumerate(cc):
    if re.search('Physics', class_name):
        print(position)

0


In [42]:
# Measuring the speed of code in Jupyter.
# Build a Series of 10,000 random integers in [0, 1000). A seeded generator makes the
# data identical on every Restart & Run All.
rng = np.random.default_rng(seed=42)
numbers = pd.Series(rng.integers(0, 1000, 10000))

# head() shows the first 5 elements of a Series
print(numbers.head())

0    222
1    753
2    919
3    966
4    630
dtype: int64

In [None]:
# We measure the time with the %%timeit cell magic, which must be the very first line of the cell: not even a comment may precede it.
# The option -n 100 fixes the number of loops at 100, so the code is run 100 times per repetition and timeit reports the mean and the std. dev.

In [44]:
%%timeit -n 100
total = 0
for number in numbers:
    total+=number
total/len(numbers)


1.12 ms ± 8.73 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Now we perform the same computation using vectorization

In [45]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers)

68.2 µs ± 4.01 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
# Broadcasting applies an operation to every value in the Series at once.
# Note: += updates `numbers`, so re-running this cell adds 2 again.
print(numbers.head())
numbers+=2
print(numbers.head())

0    222
1    753
2    919
3    966
4    630
dtype: int64
0    224
1    755
2    921
3    968
4    632
dtype: int64


In [47]:
# To iterate over (label, value) pairs of a Series, use items(). The older iteritems()
# and set_value() methods are no longer available in current pandas; label-based
# assignment through .loc replaces set_value().
# This explicit loop is for illustration only; the vectorized form is normally preferred.
for label, value in numbers.items():
    numbers.loc[label] = value + 2
numbers.head()

0    226
1    757
2    923
3    970
4    634
dtype: int64

In [None]:
# *NOTE*: The .loc attribute also allows us to add new data, by passing an index that didn't previously exist.

In [49]:
# A Series index may contain repeated labels. Start from a Series whose labels are all unique:
students_classes = pd.Series(
    {
        'Alice': 'Physics',
        'Jack': 'Chemistry',
        'Molly': 'English',
        'Sam': 'History',
    }
)
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [52]:
# Kelly takes three classes, so her label appears three times in the index
kelly_classes = pd.Series(
    data=['Philosophy', 'Arts', 'Math'],
    index=['Kelly'] * 3,
)
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [53]:
# Series can be combined with pd.concat(), which returns a new Series containing the
# entries of each input in order (the older Series.append() method has been removed from pandas)
all_students_classes = pd.concat([students_classes, kelly_classes])
all_students_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [54]:
# It's important to note that the original Series hasn't changed
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [55]:
# Finally, querying a repeated label with .loc returns every matching entry
# (all three of Kelly's classes) as a Series rather than a single value
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object