In [1]:
# import libraries needed
import numpy as np
import pandas as pd

# Pandas Series

Series are Pandas data structures built on top of NumPy arrays
- series also contains an index and an optiona lname, in addition to the array
- they can be created from other data types, but are usually imported from external sources
- two or more series grouped together form a Pandas DataFrame

Pandas Series have these key properties:
  - values - data array in the Series
  - index - the index array in the Series
  - name - optional name for series
  - dtype - data type of the elements in the values array

In [2]:
sales = [0, 5, 155, 0, 518, 0, 1827, 616, 317, 325]

# convert list to series
sales_series = pd.Series(sales, name='Sales')

sales_series

0       0
1       5
2     155
3       0
4     518
5       0
6    1827
7     616
8     317
9     325
Name: Sales, dtype: int64

In [3]:
array = np.arange(5)

series = pd.Series(array)

In [4]:
pd.Series(np.arange(6), name='Test Array')

0    0
1    1
2    2
3    3
4    4
5    5
Name: Test Array, dtype: int64

In [5]:
series.values

array([0, 1, 2, 3, 4])

In [6]:
series.mean()

2.0

In [7]:
series.shape

(5,)

In [8]:
series.median()

2.0

In [9]:
# assign new index
series.index = [10, 20, 30, 40, 50]

series.name = 'special series'

series

10    0
20    1
30    2
40    3
50    4
Name: special series, dtype: int64

In [10]:
series.dtype

dtype('int64')

# Pandas Data Types

Pandas data types mostly expand on their base Python and NumPy equivalents

1. Numeric
  - NumPy
    - bool
    - int64
    - float64
  - Pandas
    - boolean
    - int64
    - float64

2. Object/Text
  - NumPy
    - object
  - Pandas
    - string
    - category
    
3. Time Series
  - NumPy
    - datetime64
    - timedelta (duration between two dates or times)
    - period (a span of time)

In [11]:
pd.Series(range(5))

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [12]:
pd.Series(range(5)).astype('float')

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [13]:
pd.Series(range(5)).astype('bool').sum()

4

In [14]:
pd.Series(range(5)).astype('string')

0    0
1    1
2    2
3    3
4    4
dtype: string

In [15]:
pd.Series(['a', 'b', 'c']).astype('object')

0    a
1    b
2    c
dtype: object

# Series Indexing

- Index lets you easily access "rows" in a Pandas Series or DataFrame
- you can index and slice Series like other sequence data types
- There are cases where it's applicable to use a customer index for accessing rows

In [27]:
sales

sales_series = pd.Series(sales, name='Sales')

sales_series

0       0
1       5
2     155
3       0
4     518
5       0
6    1827
7     616
8     317
9     325
Name: Sales, dtype: int64

In [28]:
sales_series[2]

155

In [29]:
sales_series[2:4]

2    155
3      0
Name: Sales, dtype: int64

# Custom Indices

In [33]:
sales_1 = sales[0:5]

In [36]:
sales_1

items = ['coffee', 'bananas', 'tea', 'coconut', 'sugar']

# Custom inddices can be assigned when creating the series or by assignment
sales_series = pd.Series(sales_1, index=items, name='Sales')

sales_series

coffee       0
bananas      5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [37]:
sales_series['tea']

155

In [38]:
# Slicing custom indices makes the stop point inclusive
sales_series['bananas':'coconut']

bananas      5
tea        155
coconut      0
Name: Sales, dtype: int64

In [39]:
my_series = pd.Series(range(5))

my_series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [40]:
my_series[3]

3

In [42]:
my_series[1::2]

1    1
3    3
dtype: int64

In [44]:
my_series = pd.Series(range(5), index=['Day 0', 'Day 1', 'Day 2', 'Day 3', 'Day 4'])

my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [45]:
my_series['Day 1']

1

In [46]:
my_series['Day 1':'Day 4']

Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

# The iloc Method

- .iloc[] method is the preferred way to access values by their positional index
    - This method works even when Series have a custom, non-integer index
    - It is more efficient than slicing and is recommended by Pandas' creators

- df.iloc[row position, column position]

- row position for the value you want to access, example:
    - 0 single row
    - [5,9] multiple rows
    - [0:11] range of rows
    
- column position(s) for the value(s) you want to access

In [47]:
sales_series

coffee       0
bananas      5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [48]:
sales_series.iloc[2]

155

In [49]:
sales_series.iloc[2:4]

tea        155
coconut      0
Name: Sales, dtype: int64

In [58]:
my_series.iloc[0:4]

Day 0    0
Day 1    1
Day 2    2
Day 3    3
dtype: int64

# The .loc Method

.loc[] method is the preferred way to access values by their customer labels

df.loc[row label, column label]

row label = 
   * 'pizza' -> single row
   * ['mike', 'ike']
   * ['jan':'dec']

In [60]:
sales_series

coffee       0
bananas      5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [61]:
sales_series.loc['tea']

155

In [62]:
# inclusive range
sales_series.loc['bananas':'coconut']

bananas      5
tea        155
coconut      0
Name: Sales, dtype: int64

In [63]:
my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [64]:
my_series.loc['Day 2':'Day4']

Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [65]:
my_series.index = [0, 2, 3, 100, 5]
my_series

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [68]:
my_series.loc[0:3]

0    0
2    1
3    2
dtype: int64

In [69]:
my_series.reset_index(drop=True).loc[:3]

0    0
1    1
2    2
3    3
dtype: int64

In [70]:
my_series[my_series != 2]

0      0
2      1
100    3
5      4
dtype: int64

# Duplicate Index Values

It is possible to have duplicate index values in a Pandas Series or DataFrame
- accessing these indicies by their label using .loc[] returns all corresponding rows


# Resetting the index

- reset the index in a Pandas Series back to the default range of integers using reset_index() method

- reset_index(drop=True)

In [71]:
my_series.reset_index(drop=True)

0    0
1    1
2    2
3    3
4    4
dtype: int64

# Assignment 1: Series Basics

The code has been previded to create an array, `oil_array` from a dataframe column. 

* Convert `oil_array` into a Pandas Series, called `oil_series`. Give it a name!
* Return the name, dtype, size, and index of `oil_series`.

Take the mean of the values array. 

Then, convert the series to integer datatype and recalculate the mean. 


In [73]:
# create a DataFrame from the oil file, drop missing values
oil = pd.read_csv("../retail/oil.csv").dropna()

# Grab 100 rows of oil prices
oil_array = np.array(oil["dcoilwtico"].iloc[1000:1100])

oil_array

array([52.22, 51.44, 51.98, 52.01, 52.82, 54.01, 53.8 , 53.75, 52.36,
       53.26, 53.77, 53.98, 51.95, 50.82, 52.19, 53.01, 52.36, 52.45,
       51.12, 51.39, 52.33, 52.77, 52.38, 52.14, 53.24, 53.18, 52.63,
       52.75, 53.9 , 53.55, 53.81, 53.01, 52.19, 52.37, 52.99, 53.84,
       52.96, 53.21, 53.11, 53.41, 53.41, 54.02, 53.61, 54.48, 53.99,
       54.04, 54.  , 53.82, 52.63, 53.33, 53.19, 52.68, 49.83, 48.75,
       48.05, 47.95, 47.24, 48.34, 48.3 , 48.34, 47.79, 47.02, 47.29,
       47.  , 47.3 , 47.02, 48.36, 49.47, 50.3 , 50.54, 50.25, 50.99,
       51.14, 51.69, 52.25, 53.06, 53.38, 53.12, 53.19, 52.62, 52.46,
       50.49, 50.26, 49.64, 48.9 , 49.22, 49.22, 48.96, 49.31, 48.83,
       47.65, 47.79, 45.55, 46.23, 46.46, 45.84, 47.28, 47.81, 47.83,
       48.86])

In [74]:
oil_series = pd.Series(oil_array, name = 'oil prices')

oil_series

0     52.22
1     51.44
2     51.98
3     52.01
4     52.82
      ...  
95    45.84
96    47.28
97    47.81
98    47.83
99    48.86
Name: oil prices, Length: 100, dtype: float64

In [75]:
oil_series.info()

<class 'pandas.core.series.Series'>
RangeIndex: 100 entries, 0 to 99
Series name: oil prices
Non-Null Count  Dtype  
--------------  -----  
100 non-null    float64
dtypes: float64(1)
memory usage: 928.0 bytes


In [76]:
oil_series.name

'oil prices'

In [77]:
oil_series.size

100

In [78]:
oil_series.index

RangeIndex(start=0, stop=100, step=1)

In [79]:
oil_series.dtype

dtype('float64')

In [80]:
oil_series.values

array([52.22, 51.44, 51.98, 52.01, 52.82, 54.01, 53.8 , 53.75, 52.36,
       53.26, 53.77, 53.98, 51.95, 50.82, 52.19, 53.01, 52.36, 52.45,
       51.12, 51.39, 52.33, 52.77, 52.38, 52.14, 53.24, 53.18, 52.63,
       52.75, 53.9 , 53.55, 53.81, 53.01, 52.19, 52.37, 52.99, 53.84,
       52.96, 53.21, 53.11, 53.41, 53.41, 54.02, 53.61, 54.48, 53.99,
       54.04, 54.  , 53.82, 52.63, 53.33, 53.19, 52.68, 49.83, 48.75,
       48.05, 47.95, 47.24, 48.34, 48.3 , 48.34, 47.79, 47.02, 47.29,
       47.  , 47.3 , 47.02, 48.36, 49.47, 50.3 , 50.54, 50.25, 50.99,
       51.14, 51.69, 52.25, 53.06, 53.38, 53.12, 53.19, 52.62, 52.46,
       50.49, 50.26, 49.64, 48.9 , 49.22, 49.22, 48.96, 49.31, 48.83,
       47.65, 47.79, 45.55, 46.23, 46.46, 45.84, 47.28, 47.81, 47.83,
       48.86])

In [81]:
round(oil_series.mean(), 2)

51.13

In [82]:
oil_series.astype('int').mean()

50.66

# Assignment 2:  Accessing Series Data

* Set the date series, which has been created below, to be the index of the oil price series created in assignment 1.


* Then, take the mean of the first 10 and last 10 prices of the series.


* Finally, grab all oil prices from January 1st, 2017 - January 7th, 2017 (inclusive) and set the index to the default integer index.

In [83]:
# extract date column from oil DataFrame and grab first 100 rows

dates = pd.Series(oil["date"]).iloc[1000:1100]

In [86]:
oil_series.index = dates

In [87]:
oil_series

date
2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: oil prices, Length: 100, dtype: float64

In [99]:
first_10 = oil_series.iloc[:10].mean()

first_10

52.765

In [103]:
oil_series.iloc[-10:].mean()

47.129999999999995

In [111]:
oil_series.loc['2017-01-01':'2017-01-07'].reset_index(drop=True)

0    52.36
1    53.26
2    53.77
3    53.98
Name: oil prices, dtype: float64

# Filtering Series

Filter a Series by passing a logical test into the .loc[] accessor (like arrays)

- Can use operators & methods to create Boolean filters for logical tests. python operators vs pandas method
    - equal == / .eq()
    - not equal != / .ne()
    - less than or equal <= / .le()
    - less than < / .lt()
    - greater than or equal >= / .ge()
    - greater than > / .gt()
    - membership test: in / .isin()
    - inverse membership test: not in / ~isin()
    
    
 membership test 'in' and 'not in' wont work for many operations, so the pandas method must be used.

In [113]:
sales_series

coffee       0
bananas      5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [114]:
sales_series.loc[sales_series > 0]

bananas      5
tea        155
sugar      518
Name: Sales, dtype: int64

In [116]:
mask = (sales_series > 0) & (sales_series.index == 'coffee')

sales_series.loc[mask]

Series([], Name: Sales, dtype: int64)

In [118]:
# is in list
sales_series.index.isin(['coffee', 'tea'])

array([ True, False,  True, False, False])

In [119]:
# not in list
~sales_series.index.isin(['coffee', 'tea'])

array([False,  True, False,  True,  True])

In [121]:
my_series == 1

0      False
2       True
3      False
100    False
5      False
dtype: bool

In [122]:
my_series.loc[my_series != 2]

0      0
2      1
100    3
5      4
dtype: int64

In [123]:
my_series.loc[my_series.isin([1, 2])]

2    1
3    2
dtype: int64

In [124]:
# inverse condition must use paranthesis
my_series.loc[~(my_series > 2)]

0    0
2    1
3    2
dtype: int64

In [127]:
my_series

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [126]:
mask = (my_series.isin([1, 2])) | (my_series > 2)

my_series.loc[mask]

2      1
3      2
100    3
5      4
dtype: int64

# Sorting Series

You can sort Series by their values or their index.

1. the sort_values() method sorts a Series by its values in ascending order

In [129]:
# descending
sales_series.sort_values(ascending=False)

sugar      518
tea        155
bananas      5
coffee       0
coconut      0
Name: Sales, dtype: int64

2. the sort_index() method sorts a Series by its index in ascending order

In [132]:
sales_series.sort_index(ascending=False)

tea        155
sugar      518
coffee       0
coconut      0
bananas      5
Name: Sales, dtype: int64

In [133]:
my_series

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [134]:
# sort by values
my_series.sort_values()

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [135]:
my_series.sort_values(ascending=False)

5      4
100    3
3      2
2      1
0      0
dtype: int64

In [139]:
# permanently sort series using inplace or
my_series.sort_values(ascending=False, inplace=True)

my_series

5      4
100    3
3      2
2      1
0      0
dtype: int64

In [142]:
# permanently sort series using sort
my_series2 = my_series.sort_values(ascending=False)

my_series2

5      4
100    3
3      2
2      1
0      0
dtype: int64

# Assignment 3: Sorting and Filtering Series

* First, get the 10 lowest prices from the data. 
* Sort the 10 lowest prices by date, starting with the most recent and ending with the oldest price.

* Finally, use the list of provided dates. Select only rows with these dates that had a price of less than 50 dollars per barrel.

In [143]:
# list of dates to be used to solve bullet 3

dates = [
    "2016-12-22",
    "2017-05-03",
    "2017-01-06",
    "2017-03-05",
    "2017-02-12",
    "2017-03-21",
    "2017-04-14",
    "2017-04-15",
]

In [144]:
oil_series

date
2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: oil prices, Length: 100, dtype: float64

In [156]:
oil_series.sort_values(ascending=True, inplace=True)

oil_series

date
2017-05-04    45.55
2017-05-09    45.84
2017-05-05    46.23
2017-05-08    46.46
2017-03-23    47.00
              ...  
2017-02-28    54.00
2016-12-28    54.01
2017-02-21    54.02
2017-02-27    54.04
2017-02-23    54.48
Name: oil prices, Length: 100, dtype: float64

In [157]:
lowest_price = oil_series[:9]

lowest_price

date
2017-05-04    45.55
2017-05-09    45.84
2017-05-05    46.23
2017-05-08    46.46
2017-03-23    47.00
2017-03-27    47.02
2017-03-21    47.02
2017-03-14    47.24
2017-05-10    47.28
Name: oil prices, dtype: float64

In [160]:
lowest_price.sort_index(ascending=False, inplace=True)

lowest_price

date
2017-05-10    47.28
2017-05-09    45.84
2017-05-08    46.46
2017-05-05    46.23
2017-05-04    45.55
2017-03-27    47.02
2017-03-23    47.00
2017-03-21    47.02
2017-03-14    47.24
Name: oil prices, dtype: float64

In [174]:
# solution
oil_series.sort_values().iloc[:10].sort_index(ascending=False)

date
2017-05-10    47.28
2017-05-09    45.84
2017-05-08    46.46
2017-05-05    46.23
2017-05-04    45.55
2017-03-27    47.02
2017-03-23    47.00
2017-03-22    47.29
2017-03-21    47.02
2017-03-14    47.24
Name: oil prices, dtype: float64

In [175]:
# Dates in list AND price is less than or equal to 50
mask = (oil_series.index.isin(dates)) & (oil_series <= 50)
oil_series.loc[mask]

date
2017-03-21    47.02
2017-05-03    47.79
Name: oil prices, dtype: float64

# Arithmetic Operators & Methods

Python operators or pandas methods
- addition: + / .add()
- subtraction: - / .sub(), .subtract()
- multiplication: * / .mul(), .multiply()
- division: / or .div(), .truediv(), .divide()
- floor division: // or .floordiv()
- modulo: % / .mod()
- exponentiation: ** / .pow()

In [196]:
my_series = pd.Series([1, np.NaN, 2, 3, 4], index=['day 0', 'day 1', 'day 2', 'day 3', 'day 4'])

my_series                     

day 0    1.0
day 1    NaN
day 2    2.0
day 3    3.0
day 4    4.0
dtype: float64

In [197]:
my_series + 1

day 0    2.0
day 1    NaN
day 2    3.0
day 3    4.0
day 4    5.0
dtype: float64

In [201]:
my_series2 = my_series.add(1, fill_value=0).astype('int')

my_series2

day 0    2
day 1    1
day 2    3
day 3    4
day 4    5
dtype: int64

In [203]:
my_series2 / 2

day 0    1.0
day 1    0.5
day 2    1.5
day 3    2.0
day 4    2.5
dtype: float64

In [204]:
my_series + my_series2

day 0    3.0
day 1    NaN
day 2    5.0
day 3    7.0
day 4    9.0
dtype: float64

In [None]:
# handle missing value
my_series.add(1, fill_value

In [206]:
(my_series + my_series2 * 2) / 4

day 0    1.25
day 1     NaN
day 2    2.00
day 3    2.75
day 4    3.50
dtype: float64

# String Methods
The Pandas str accessor lets you access many strong methods
- these methods all return a Series (split retuns multiple series)

String Method

- .strip(): removes all leading and/or trailing characters (spaces by default)
- .upper().lower(): converts all characters to upper or lower case
- .slice(start:stop:step): applies a slice to the strings in a Series
- .count('string'): counts all instances of a given string
- .contains('string'): returns True if a given string is found; False if not
- .replace('a', 'b'): replaces instances of string 'a' with string 'b'
- .split('delimiter', expand=True): splits strings based on a given delimiter string, and returns a DataFrame with a Series for each split
- .len(): returns the length of each string in a Series
- .startswith('string'), .endswith('string): returns True if a string starts or ends with a given string; False if not.

In [176]:
string_series = pd.Series(['day 0', 'day 1', 'day 2', 'day 3', 'day 4'])

string_series

0    day 0
1    day 1
2    day 2
3    day 3
4    day 4
dtype: object

In [177]:
string_series.str.upper()

0    DAY 0
1    DAY 1
2    DAY 2
3    DAY 3
4    DAY 4
dtype: object

In [179]:
string_series.str.upper().str.contains('DAY 1')

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [180]:
# remove day suffix
string_series.str.strip('day ').astype('int')

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [184]:
# access position of character
string_series.str[1:3]

0    ay
1    ay
2    ay
3    ay
4    ay
dtype: object

In [186]:
# split by delimiter and expand to own columns
string_series.str.split(' ', expand=True)

Unnamed: 0,0,1
0,day,0
1,day,1
2,day,2
3,day,3
4,day,4



# Assignment 4: Series Operations

* Increase the prices in the oil series by 10%, and add an additional 2 dollars per barrel on top of that.

* Then, create a series that represents the difference between each price and max price.

* Finally, extract the month from the string dates in the index and store them as an integer in their own series.

In [221]:
oil_series

date
2017-05-04    45.55
2017-05-09    45.84
2017-05-05    46.23
2017-05-08    46.46
2017-03-23    47.00
              ...  
2017-02-28    54.00
2016-12-28    54.01
2017-02-21    54.02
2017-02-27    54.04
2017-02-23    54.48
Name: oil prices, Length: 100, dtype: float64

In [210]:
oil_increase = (oil_series * 1.1) + 2

oil_increase

date
2017-05-04    52.105
2017-05-09    52.424
2017-05-05    52.853
2017-05-08    53.106
2017-03-23    53.700
               ...  
2017-02-28    61.400
2016-12-28    61.411
2017-02-21    61.422
2017-02-27    61.444
2017-02-23    61.928
Name: oil prices, Length: 100, dtype: float64

In [215]:
max_price = oil_series.max()

max_price

54.48

In [223]:
diff_series = (oil_series - max_price) / max_price

diff_series

date
2017-05-04   -0.163913
2017-05-09   -0.158590
2017-05-05   -0.151432
2017-05-08   -0.147210
2017-03-23   -0.137298
                ...   
2017-02-28   -0.008811
2016-12-28   -0.008627
2017-02-21   -0.008443
2017-02-27   -0.008076
2017-02-23    0.000000
Name: oil prices, Length: 100, dtype: float64

In [240]:
month = s.astype('int')

month_series = pd.Series(month)

month_series

0      5
1      5
2      5
3      5
4      3
      ..
95     2
96    12
97     2
98     2
99     2
Name: date, Length: 100, dtype: int64

In [224]:
oil_series.index.str.split('-', expand=True)

MultiIndex([('2017', '05', '04'),
            ('2017', '05', '09'),
            ('2017', '05', '05'),
            ('2017', '05', '08'),
            ('2017', '03', '23'),
            ('2017', '03', '27'),
            ('2017', '03', '21'),
            ('2017', '03', '14'),
            ('2017', '05', '10'),
            ('2017', '03', '22'),
            ('2017', '03', '24'),
            ('2017', '05', '02'),
            ('2017', '03', '20'),
            ('2017', '05', '03'),
            ('2017', '05', '11'),
            ('2017', '05', '12'),
            ('2017', '03', '13'),
            ('2017', '03', '10'),
            ('2017', '03', '16'),
            ('2017', '03', '17'),
            ('2017', '03', '15'),
            ('2017', '03', '28'),
            ('2017', '03', '09'),
            ('2017', '05', '01'),
            ('2017', '05', '15'),
            ('2017', '04', '24'),
            ('2017', '04', '27'),
            ('2017', '04', '26'),
            ('2017', '04', '25'),
            ('

# Numeric Series Aggregations

Methods to aggregate numerical Series
- .count(): returns the number of items
- .first(), .last(): returns first or last item
- .mean(), .median(): calculates the mean or median
- .min(), max(): smallest or largest value
- .argmax(), argmin(): returns the index for the smallest or largest values
- .std(), .var(): calculates the standard deviation or variance
- .mad(): calculates the mean of absolute deviation
- .prod(): calculates the product of all the items
- .sum(): calculates sum of all items
- .quantile(): returns a specified percentile, or list of percentiles

In [243]:
sales_series

coffee       0
bananas      5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [242]:
sales_series.sum()

678

In [245]:
transactions = pd.read_csv('../retail/transactions.csv')

transactions_series = pd.Series(transactions['transactions'])

transactions_series.iloc[:5]

0     770
1    2111
2    2358
3    3487
4    1922
Name: transactions, dtype: int64

In [246]:
transactions_series.mean()

1694.6021583940208

In [247]:
transactions_series.quantile([.5])

0.5    1393.0
Name: transactions, dtype: float64

In [249]:
transactions_series.iloc[:5].quantile([0.5])

0.5    2111.0
Name: transactions, dtype: float64

In [252]:
# small series need to specify interpolation
transactions_series.iloc[:5].quantile([0.4], interpolation='nearest')

0.4    2111
Name: transactions, dtype: int64

# Categorical Series Aggregation

Methods to aggregate and analyze categorical Series

- .unique(): returns an array of unique items in a Series
- .nunique(): returns the number of unqiue items in a Series
- .value_counts(): returns a Series of unqiue items and thier frequency

In [255]:
string_series = pd.Series(['day 0', 'day 0', 'day 2', 'day 2', 'day 4'])

string_series

0    day 0
1    day 0
2    day 2
3    day 2
4    day 4
dtype: object

In [256]:
string_series.nunique()

3

In [257]:
string_series.unique()

array(['day 0', 'day 2', 'day 4'], dtype=object)

In [258]:
# normalize values to a percentage
string_series.value_counts(normalize=True)

day 0    0.4
day 2    0.4
day 4    0.2
dtype: float64

In [259]:
string_series.value_counts()

day 0    2
day 2    2
day 4    1
dtype: int64

# Assignment 5: Series Aggregations

* Calculate the sum and mean of prices in the month of March. 

* Next, calculate how many prices were recorded in January and February.

* Then, calculate the 10th and 90th percentiles across all data.

* Finally, how often did integer dollar value (e.g. 51, 52) occur in the data? Normalize this to a percentage.   

In [267]:
# sum
oil_series[oil_series.index.str[5:7] == '03'].sum().round(2)

1134.54

In [276]:
# mean
oil_series[oil_series.index.str[5:7] == '03'].mean()

49.32782608695652

In [281]:
# number of prices in jan and feb
mask = (oil_series.index.str[5:7] == '01')  | (oil_series.index.str[5:7] == '02')
oil_series[mask].count()

39

In [279]:
# number of prices in jan and feb - method 2
oil_series[oil_series.index.str[5:7].isin(['01','02'])].count()

39

In [285]:
# 10th/90th percentiles
oil_series.quantile([0.1, 0.9])

0.1    47.299
0.9    53.811
Name: oil prices, dtype: float64

In [288]:
# percentage distribution
oil_series.astype('int').value_counts(normalize=True)

53    0.26
52    0.22
47    0.13
48    0.10
50    0.07
51    0.07
49    0.06
54    0.05
45    0.02
46    0.02
Name: oil prices, dtype: float64

In [289]:
oil_series.astype('int').value_counts()

53    26
52    22
47    13
48    10
50     7
51     7
49     6
54     5
45     2
46     2
Name: oil prices, dtype: int64

# Missing Data

Missing data in Pandas is often represented by NumPy "NaN" values
- this is more efficent that python's "none" data type
- pandas treats NaN values as a float, which allows them to be used in vectorized operations

    - np.nan - creates a NaN value
    - arithmetic operations performed on NaN values will return NaN
    - most operation methods include a 'fill_value' argument that lets you pass a value instead of NaN

Pandas released its own missing data type, NA, in dec 2020
 - this allos missing values to be stored as intergers, instead of needing to convert to float
 - this is still a new feature, but most bugs end up converting the data to NumPy's NaN
     - pd.NA creates a NA value * if dtype='int16' wasn't specified, the values would be stored as objects
     
     ** neither np.NaN nor pd.NA are perfect, but pd.NA functionality should continue to improve and having a nullable integer isusually worth it.

# Identifying Missing Data

The .is.na() and .value_counts() methods let you identify missing data in a Series.
- the is.na() method returns True if a value is missing, and False otherwise
    - can use this as a Boolean mask
    - isna().sum() returns the count of NaN values

- the .value_counts() method returns unique values and their frequency
    - most methods ignore NaN values, so you need to specify dropna=False to return the count of NaN values

# Handling Missing Data

The .dropna() and .fillna() methods let you handle missing data in a Series
- the .dropna() method removes NaN values from your Series or DataFrame
- the index has gaps, so you can use .reset_index() to restore the range of integers

The .fillna(value) method replaces NaN values with a specified value

Important to be thoughtful and deliberate in how you handle missing data.
- keep? 
- remove? if small percentage
- replace them with zeros? (if no sales)
- impute them with the mean? 

- operations can dramatically impact the results of a analysis, so make sure you understand these impacts and talk to a data SME to understand why data is missing.

In [290]:
my_series = pd.Series([np.NaN] * 5)

my_series

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64

In [292]:
my_series.isna().mean()

1.0

In [294]:
# cannot convert NaN to integers
# my_series.astype('int')

In [295]:
# can convert np.NA to integers
my_series = pd.Series([pd.NA] * 5)

my_series

0    <NA>
1    <NA>
2    <NA>
3    <NA>
4    <NA>
dtype: object

In [299]:
my_series.astype('Int64')

0    <NA>
1    <NA>
2    <NA>
3    <NA>
4    <NA>
dtype: Int64

In [300]:
my_series = pd.Series(range(5))
my_series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [304]:
my_series.loc[1:2] = pd.NA

In [306]:
my_series.isna()

0    False
1     True
2     True
3    False
4    False
dtype: bool

In [317]:
# count number of NAs, need to use dropna=False to include NAs, excludes by default
my_series.value_counts(dropna=False)

<NA>    2
0       1
3       1
4       1
dtype: int64

In [310]:
# fill missing values
my_series.fillna(0)

0    0
1    0
2    0
3    3
4    4
dtype: int64

In [311]:
my_series.fillna(my_series.mean())

0    0.000000
1    2.333333
2    2.333333
3    3.000000
4    4.000000
dtype: float64

In [312]:
my_series.dropna()

0    0
3    3
4    4
dtype: object

In [313]:
# reset index after dropping NA, drop=True to return series instead of dataframe by default
my_series.dropna().reset_index(drop=True)

0    0
1    3
2    4
dtype: object

* Important: understand data and determine best practice to handle missing values

# Assignment 6: Missing Data

There were some erroneous prices in our data, so they were filled in with missing values.

Can you confirm the number of missing values in the price column? 

Once you’ve done that, fill the prices in with the median of the oil price series.


In [319]:
# Fill in two values with missing data

oil_series = oil_series.where(~oil_series.isin([51.44, 47.83]), pd.NA)

In [322]:
# Number of missing values
oil_series.isna().sum()

2

In [327]:
# Location of missing values
oil_series.loc[oil_series.isna()]

date
2017-05-12   NaN
2016-12-21   NaN
Name: oil prices, dtype: float64

In [323]:
# Find median
oil_series.median()

52.205

In [331]:
oil_series.fillna(oil_series.median()).sort_index()

date
2016-12-20    52.220
2016-12-21    52.205
2016-12-22    51.980
2016-12-23    52.010
2016-12-27    52.820
               ...  
2017-05-09    45.840
2017-05-10    47.280
2017-05-11    47.810
2017-05-12    52.205
2017-05-15    48.860
Name: oil prices, Length: 100, dtype: float64

# The Apply Method

The .apply() method lets you apply custom functions to Pandas Series
- this function will not be vectorized, so it's not as efficient as native functions

In [334]:
# Function applies 10% discount to prices over 20
def discount(price):
    if price > 20:
        return round(price * 0.9, 2)
    return price

In [336]:
clean_wholesale = pd.Series([3.99, 5.99, 22.99, 7.99, 33.99])

In [337]:
clean_wholesale.apply(discount)

0     3.99
1     5.99
2    20.69
3     7.99
4    30.59
dtype: float64

In [338]:
# Apply lambda for one off tasks

clean_wholesale.apply(lambda x: round(x * 0.9, 2) if x > 20 else x)

0     3.99
1     5.99
2    20.69
3     7.99
4    30.59
dtype: float64

Heavy duty is better to use builtin Pandas methods and functions

In [339]:
string_series = pd.Series(['day 0', 'day 0', 'day 2', 'day 2', 'day 4'])

string_series

0    day 0
1    day 0
2    day 2
3    day 2
4    day 4
dtype: object

In [340]:
# Return last element of string
string_series.apply(lambda x: x[-1])

0    0
1    0
2    2
3    2
4    4
dtype: object

In [341]:
# Built in string fucntion
string_series.str[-1]

0    0
1    0
2    2
3    2
4    4
dtype: object

In [342]:
def search(string, looking_for):
    if looking_for in string:
        return "Found it"
    return "Nope"

In [343]:
# Find '2' in each string
string_series.apply(search, args='2')

0        Nope
1        Nope
2    Found it
3    Found it
4        Nope
dtype: object

Only use apply method when there is no built in function in Pandas

# The Where Method

Pandas' .where() method returns series values based on a logical condition

- df.where(logical test, value if False, inplace=False)
    - df = series or DF
    - logical test = a logical expression that evaluates to True of False
    - value if False = value to return when expression is False
    - inplace=False = whether to perform the operation in place (default is Fase)
    
- Different from NumPy's where function, where you can specify True value

In [344]:
clean_wholesale

0     3.99
1     5.99
2    22.99
3     7.99
4    33.99
dtype: float64

In [345]:
clean_wholesale.where(clean_wholesale <= 20, round(clean_wholesale * 0.9, 2))

0     3.99
1     5.99
2    20.69
3     7.99
4    30.59
dtype: float64

In [347]:
# Use tilde ~ to invert Boolean values and turn this into a value is True logical test
clean_wholesale.where(~(clean_wholesale > 20), round(clean_wholesale * 0.9, 2))

0     3.99
1     5.99
2    20.69
3     7.99
4    30.59
dtype: float64

# NumPy vs Pandas Where

In [348]:
# NumPy array, then convert bye to pandas series
np.where(clean_wholesale > 20, "Discounted", "Normal Price")

array(['Normal Price', 'Normal Price', 'Discounted', 'Normal Price',
       'Discounted'], dtype='<U12')

In [349]:
string_series.apply(search, args='2')

0        Nope
1        Nope
2    Found it
3    Found it
4        Nope
dtype: object

In [350]:
string_series

0    day 0
1    day 0
2    day 2
3    day 2
4    day 4
dtype: object

In [353]:
# Pandas where method, specifies False value only
string_series.where(string_series.str.contains('2'), "Nope!")

0    Nope!
1    Nope!
2    day 2
3    day 2
4    Nope!
dtype: object

In [359]:
# Pandas where, use inverse to specify True value
string_series.where(
    string_series.str.contains('2'), "Nope!").where(
    ~string_series.str.contains("2"), "Found it!")

0        Nope!
1        Nope!
2    Found it!
3    Found it!
4        Nope!
dtype: object

In [360]:
# Use numpy array and reconvert back to series
pd.Series(np.where(string_series.str.contains('2'), "found it", "nope"))

0        nope
1        nope
2    found it
3    found it
4        nope
dtype: object

# Exercise 7: Apply and Where

Write a function that outputs ‘buy’ if price is less than the 90th percentile and ‘wait’ if it’s not. Apply it to the oil series.

Then, create a series that multiplies price by .9 if the date is ‘2016-12-23’ or ‘2017-05-10’, and 1.1 for all other dates. 

In [387]:
def buy_bool(price, percentile):
    if price < percentile:
        return 'Buy'
    return 'Wait'

In [388]:
oil_series.apply(buy_bool, args=(oil_series.quantile(0.9),))

date
2017-05-04     Buy
2017-05-09     Buy
2017-05-05     Buy
2017-05-08     Buy
2017-03-23     Buy
              ... 
2017-02-28    Wait
2016-12-28    Wait
2017-02-21    Wait
2017-02-27    Wait
2017-02-23    Wait
Name: oil prices, Length: 100, dtype: object

In [389]:
oil_series.apply(lambda x: 'Buy' if x < oil_series.quantile(0.9) else 'Wait')

date
2017-05-04     Buy
2017-05-09     Buy
2017-05-05     Buy
2017-05-08     Buy
2017-03-23     Buy
              ... 
2017-02-28    Wait
2016-12-28    Wait
2017-02-21    Wait
2017-02-27    Wait
2017-02-23    Wait
Name: oil prices, Length: 100, dtype: object

In [363]:
oil_series.isna().sum()

2

In [386]:
oil_series.quantile(0.9)

53.813

In [378]:
pd.Series(np.where(
    oil_series.index.isin(["2016-12-23", "2017-05-10"]), 
    oil_series * 0.9,
    oil_series * 1.1))

0     50.105
1     50.424
2     50.853
3     51.106
4     51.700
       ...  
95    59.400
96    59.411
97    59.422
98    59.444
99    59.928
Length: 100, dtype: float64

In [371]:
oil_series[oil_series.index.isin(["2016-12-23", "2017-05-10"])]

date
2017-05-10    47.28
2016-12-23    52.01
Name: oil prices, dtype: float64

In [393]:
oil_series

date
2017-05-04    45.55
2017-05-09    45.84
2017-05-05    46.23
2017-05-08    46.46
2017-03-23    47.00
              ...  
2017-02-28    54.00
2016-12-28    54.01
2017-02-21    54.02
2017-02-27    54.04
2017-02-23    54.48
Name: oil prices, Length: 100, dtype: float64

In [401]:
oil_series #.reset_index()

date
2017-05-04    45.55
2017-05-09    45.84
2017-05-05    46.23
2017-05-08    46.46
2017-03-23    47.00
              ...  
2017-02-28    54.00
2016-12-28    54.01
2017-02-21    54.02
2017-02-27    54.04
2017-02-23    54.48
Name: oil prices, Length: 100, dtype: float64

In [409]:
# Convert to data frame
df = pd.DataFrame(oil_series).sort_values(by='date').reset_index()

df

Unnamed: 0,date,oil prices
0,2016-12-20,52.22
1,2016-12-21,
2,2016-12-22,51.98
3,2016-12-23,52.01
4,2016-12-27,52.82
...,...,...
95,2017-05-09,45.84
96,2017-05-10,47.28
97,2017-05-11,47.81
98,2017-05-12,


In [413]:
df["new_price"] = pd.Series(np.where(
    oil_series.index.isin(['2016-12-20', '2017-05-10']), 
    oil_series * 0.9,
    oil_series * 1.1))
   
df

Unnamed: 0,date,oil prices,new_price
0,2016-12-20,52.22,50.105
1,2016-12-21,,50.424
2,2016-12-22,51.98,50.853
3,2016-12-23,52.01,51.106
4,2016-12-27,52.82,51.700
...,...,...,...
95,2017-05-09,45.84,59.400
96,2017-05-10,47.28,59.411
97,2017-05-11,47.81,59.422
98,2017-05-12,,59.444


# Key Takeaways

1. Pandas Series add an index & title to NumPy arrays
    - Pandas Series form the columns for DataFrames
2. The .loc() & .iloc() methods are key in working with Pandas data structures
    - These methods allow you to access rows in Series (DataFrames), either by their positional index or by their labels
3. Pandas & NumPy have similar operations for filtering, sorting & aggregating
    - Use built-in Pandas and NumPy functions and methods to take advantage of vectorization, which is much more efficient than writing for loops in base Python
4. Pandas lets you easily handle missing data
    - It's important to understand the impact of dropping or imputing might have on your analysis, so make sure you consult a SME (subject matter expert) about the root cause of the missing data