# Series_Data-Structure

In [1]:
# Let's import pandas to get started
import pandas as pd

In [2]:
# we create a list of strings
students = ['Alice', 'Jack', 'Molly']

# and we call Series function
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [3]:
# Now using integer
numbers = [1, 2, 3]
# And turn it into a series
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [4]:
# What about None values, let's some cases
# First, in a list of strings
students = ['Alice', 'Jack', None]
pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [5]:
# Second, in a list of integer
numbers = [1, 2, None]
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
# In this case, integers become floats, because
# pandas asigns None as a float typecast

In [7]:
# So, NaN is NOT equivalent to None value, to
# see that clear, we can the following

In [8]:
import numpy as np
# we compare both values, NaN and None
np.nan == None

False

In [9]:
# However
np.nan == np.nan

False

In [10]:
# Instead, we need to use a special library, 
# i.e.,
np.isnan(np.nan)
# And thus, we can compare NaN values

True

In [11]:
# Also, we can create Series from dictionaries
# for example
students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}
s = pd.Series(students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [12]:
# To index this series, we make
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [13]:
# Also, using tuples it can get created the series
students = [('Alice', 'Brown'), ('Jack', 'White'), ('Molly', 'Green')]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [14]:
# or in the explicity way
s = pd.Series(['Physics', 'Chemistry', 'English'], index=['Alice', 'Jack', 'Molly'])
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [15]:
# And exclude any data, e.g.,
students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}

s = pd.Series(students_scores, index=['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [16]:
# the result for Sam is missing

# Querying a Series

In [17]:
# To query by index we use de attributes 'iloc' and 'loc', i.e.,
students_classes = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English',
                   'Sam': 'History'}
s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [18]:
# To see the forth entry
s.iloc[3]

'History'

In [19]:
# To see what class Molly has
s.loc['Molly']

'English'

In [20]:
# Also, using dictonary notation
s[3]

'History'

In [21]:
s['Molly']

'English'

In [22]:
# however there're situations where we cannot do this, e.g.,
class_code = {99: 'Physics',
              100: 'Chemistry',
              101: 'English',
              102: 'History'}
s = pd.Series(class_code)

In [23]:
# so if we try call s[0] element we get a error, since there no
# key which be 0
s[0]

KeyError: 0

In [25]:
# To solve this, we use explicity the 'iloc' attribute
s.iloc[0]

'Physics'

In [27]:
# Now let's see some inner functions to make easy operations
grades = pd.Series([90, 80, 70, 60])
total = 0
for grade in grades:
    total += grade
print(total/len(grades))

75.0


In [29]:
# another way to make it, using inner functions
total = np.sum(grades)
print(total/len(grades))

75.0


In [32]:
# let's see another method that it's faster
numbers = pd.Series(np.random.randint(0, 10000, 1000000))
# and let's see the first 5 elements
numbers.head()

0    5351
1    3969
2    8853
3     469
4    6243
dtype: int64

In [34]:
# so to figure out the vectorization method is faster than first one
# we take the time
%time #it -n 100 #this fuctions is proper from jupyter
total = 0
for number in numbers:
    total += number
total/len(numbers)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.63 µs


5000.296187

In [36]:
%time
total = np.sum(numbers)
total/len(numbers)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10.5 µs


5000.296187

In [38]:
# Other operations by vectorization are
numbers.head()

0    5351
1    3969
2    8853
3     469
4    6243
dtype: int64

In [40]:
numbers += 2
numbers.head()

0    5355
1    3973
2    8857
3     473
4    6247
dtype: int64

In [46]:
# On the other hand
for label, value in numbers.items():
    numbers.iat[value] += 2
numbers.head()

0    5355
1    3973
2    8857
3     473
4    6635
dtype: int64

In [48]:
# we see that the first methos by vectorization is faster than iterate method
# Now let's see other examples with data bases
students_classes = pd.Series({'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English',
                   'Sam': 'History'})
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [50]:
kelly_classes = pd.Series(['Philosophy', 'Arts', 'Math'], index=['Kelly', 'Kelly', 'Kelly'])
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [54]:
all_students_classes = students_classes.append(kelly_classes)
all_students_classes

  all_students_classes = students_classes.append(kelly_classes)


Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [56]:
# when pandas make this, it actually create a new object
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [None]:
# i.e., it doesn't modify the original object