In [1]:
import pandas as pd
import numpy as np

In [9]:
# A Series can be queried either by position, using iloc, or by index
# label, using loc
students_classes = {'Alice': 'Physics',
                    'Jack': 'Chemistry',
                    'Molly': 'English',
                    'Sam': 'History'}

s = pd.Series(students_classes)
print(s,'\n')

# To access the fourth entry using the index position
print(s.iloc[3],'\n')

# To access what class Molly is in
print(s.loc['Molly'],'\n')

# Plain s[...] is a label lookup. With an integer key on a non-integer
# index, older pandas silently fell back to a positional lookup, but that
# fallback is deprecated, so positional access is spelled out with iloc.
# Label access through s[...] keeps working as before
print(s.iloc[1],'\n')
print(s['Alice'])

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object 

History 

English 

Chemistry 

Physics


In [11]:
# What if the index labels are themselves integers? pandas can't decide
# automatically whether s[0] means "label 0" or "position 0", so with an
# integer index s[...] is treated as a label lookup and you have to be
# explicit
class_codes = {99: 'Physics',
               100: 'Chemistry',
               101: 'English',
               102: 'History'}

s = pd.Series(class_codes)
print(s)

# There is no label 0, so s[0] raises KeyError. The error is the point of
# this cell, so catch it and display it instead of letting it abort a
# top-to-bottom run of the notebook
try:
    s[0]
except KeyError as err:
    print(f'KeyError: {err}')

99       Physics
100    Chemistry
101      English
102      History
dtype: object


KeyError: 0

In [12]:
# Positional access has to be requested explicitly with iloc here:
# position 0 holds the entry whose label is 99
s.iloc[0]

'Physics'

In [8]:
# One way to derive a statistic from a Series is the plain programmatic
# route: walk over the items one at a time

# e.g. computing the average grade
grades = pd.Series([90, 80, 70, 60])
total = 0

for score in grades:
    total = total + score
print(total / len(grades))

75.0


In [9]:
# Looping like that is too slow, however, because a computer can carry out
# many computations simultaneously

# pandas and numpy support a method of computation called vectorization,
# i.e. applying an operation to many values at once, which results in
# faster execution times

# This works with most functions in numpy, including sum()
total = np.sum(grades)
print(total / grades.size)

75.0


In [13]:
# A magic function lets us confirm that vectorization really is faster

# Build a big Series of random integers drawn from [0, 1000)
numbers = pd.Series(np.random.randint(low=0, high=1000, size=10000))

# Show the first five values to make sure it works as intended
print(numbers.head(5))

# Verify the number of elements
numbers.size

0    157
1    304
2    340
3    156
4    369
dtype: int32


10000

In [None]:
# To campare average speeds,we'll use a cellular magic function called 
# timeit
%%timeit -n 100
total = 0

for number in numbers:
    total += number
total/len(number) #---> 2.8 ms +- 343 μs per loop

In [None]:
# Now let's try it for vectorization
%%timeit -n 100
total = np.sum(number)

total/len(numbers) #---> 262 μs +- 117 μs per loop

In [16]:
# Broadcasting is a pandas feature related to vectorization: a single
# operation is applied to every value in a Series, changing each of them.
# First, look at the values before any change
numbers.head(5)

0    157
1    304
2    340
3    156
4    369
dtype: int32

In [17]:
# Now let's increase everything by 2: the scalar is applied to every value
# of the Series, so no explicit loop is needed.
# Note: this reassigns numbers, so re-running the cell adds 2 again
numbers += 2

numbers.head()

0    159
1    306
2    342
3    158
4    371
dtype: int32

In [24]:
# The other way of doing this would be to iterate through the values
# in the Series like a dictionary, unpacking each (label, value) pair
# yielded by the items() function.
# (Series.iteritems() no longer exists in pandas 2.x; items() is the
# drop-in replacement.)
# This is slower than vectorization, however

for label, value in numbers.items():

    # write the increased value back to the same label with at[]
    numbers.at[label] = value + 2

numbers.head()

0    161
1    308
2    344
3    160
4    373
dtype: int32

In [25]:
# Besides modifying existing entries in place, the .loc attribute can
# also add new data: assigning to a label that is not yet in the index
# adds a new entry for it
s = pd.Series(data=[1, 2, 3])
s.loc['History'] = 102

s

0            1
1            2
2            3
History    102
dtype: int64

In [27]:
# What if index values are not unique?
# Start from a Series that maps each student (the index label) to the
# class they take; here every label is still unique
students_classes = pd.Series(
    ['Physics', 'Chemistry', 'English', 'History'],
    index=['Alice', 'Jack', 'Molly', 'Sam'],
)

students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [28]:
# Build a separate Series just for the classes one student, Kelly, is
# taking; every entry carries the same index label
kelly_classes = pd.Series(data=['Philosophy', 'Arts', 'Math'],
                          index=['Kelly'] * 3)
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [30]:
# Combine this data with the students_classes Series.
# Series.append() no longer exists in pandas 2.x; pd.concat() is its
# replacement. Like append, concat doesn't change the underlying data,
# instead it returns a new Series
all_students_classes = pd.concat([students_classes, kelly_classes])
print(all_students_classes)

# The original Series is unchanged
students_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object


Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [32]:
# If we query all_students_classes for the repeated label 'Kelly', pandas
# returns a whole Series (one entry per matching label) rather than a
# single value
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object