In [1]:
import numpy as np
import pandas as pd

# Series Methods

- Importing CSV datasets
- Sorting Series values in ascending and descending order
- Retrieving the largest and smallest values in a Series
- Mutating a Series inplace
- Counting occurrences of unique values in a Series
- Applying an operation to every value in a Series

### First Import the Datasets that will be used:

- pokemon.csv
- google_stocks.csv
- revolutionary_war.csv

In [2]:
#All three CSV files sit in the same repository folder,
#so each download url is built from that shared prefix
_DATA_ROOT = "https://raw.githubusercontent.com/paskhaver/pandas-in-action/master/chapter_03_series_methods"

pokemon_url = f"{_DATA_ROOT}/pokemon.csv"
google_stocks_url = f"{_DATA_ROOT}/google_stocks.csv"
rev_war_url = f"{_DATA_ROOT}/revolutionary_war.csv"

#### Assign Pokemon dataset to a DataFrame object

In [3]:
#Read the Pokemon CSV with read_csv's default settings and display the result
pokemon_df = pd.read_csv(pokemon_url)
pokemon_df

Unnamed: 0,Pokemon,Type
0,Bulbasaur,Grass / Poison
1,Ivysaur,Grass / Poison
2,Venusaur,Grass / Poison
3,Charmander,Fire
4,Charmeleon,Fire
...,...,...
804,Stakataka,Rock / Steel
805,Blacephalon,Fire / Ghost
806,Zeraora,Electric
807,Meltan,Steel


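To load it as a Series object instead, pass the `Pokemon` column name to the `index_col` parameter so that the Pokemon names become the index, and squeeze the remaining single-column DataFrame into a Series (the `squeeze=True` option of `read_csv` in older pandas versions, or `.squeeze("columns")` on the result in pandas 2.0 and later).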

In [4]:
#Use the "Pokemon" column as the index, which leaves a single data column,
#then collapse that one-column DataFrame into a Series.
#read_csv's squeeze=True keyword was deprecated in pandas 1.4 and removed in 2.0;
#calling .squeeze("columns") on the result works on both old and new versions.
pokemon_series = pd.read_csv(pokemon_url, index_col="Pokemon").squeeze("columns")
pokemon_series

Pokemon
Bulbasaur      Grass / Poison
Ivysaur        Grass / Poison
Venusaur       Grass / Poison
Charmander               Fire
Charmeleon               Fire
                    ...      
Stakataka        Rock / Steel
Blacephalon      Fire / Ghost
Zeraora              Electric
Meltan                  Steel
Melmetal                Steel
Name: Type, Length: 809, dtype: object

The Google stocks dataset contains only dates and numeric values. Dates are read in as strings by default, so pass the column name to the `parse_dates` parameter to convert them to datetimes.


In [5]:
#Parse the "Date" column as datetimes and use it as the index, then collapse the
#remaining single column into a Series.
#.squeeze("columns") replaces read_csv's squeeze=True keyword, which was
#deprecated in pandas 1.4 and removed in pandas 2.0.
google_stocks = pd.read_csv(
    google_stocks_url,
    index_col="Date",
    parse_dates=["Date"],
).squeeze("columns")
google_stocks.head()

Date
2004-08-19    49.98
2004-08-20    53.95
2004-08-23    54.50
2004-08-24    52.24
2004-08-25    52.80
Name: Close, dtype: float64

The Revolutionary War dataset contains 3 columns, so to extract a Series we use one column as the index and another as the values. Here the parsed `Start Date` values become the index and the `State` column supplies the values: the `usecols` parameter selects just those 2 columns, and squeezing the resulting single-column DataFrame converts it to a Series.

In [6]:
#Keep only the "State" and "Start Date" columns, index by the parsed start date,
#and collapse the remaining "State" column into a Series.
#.squeeze("columns") replaces read_csv's squeeze=True keyword, which was
#deprecated in pandas 1.4 and removed in pandas 2.0.
rev_wars = pd.read_csv(
    rev_war_url,
    index_col="Start Date",
    parse_dates=["Start Date"],
    usecols=["State", "Start Date"],
).squeeze("columns")
 

### There are 2 main methods of sorting a series. 
You can use
- `sort_values()`, which orders the Series by its values, or
- `sort_index()`, which orders the Series by its index labels.

Both methods sort in ascending order by default; pass `ascending=False` to sort in descending order instead.

In [7]:
#Order the Series by its values; ascending order is the default, spelled out here
pokemon_series.sort_values(ascending=True)

Pokemon
Illumise                Bug
Silcoon                 Bug
Pinsir                  Bug
Burmy                   Bug
Wurmple                 Bug
                  ...      
Tirtouga       Water / Rock
Relicanth      Water / Rock
Corsola        Water / Rock
Carracosta     Water / Rock
Empoleon      Water / Steel
Name: Type, Length: 809, dtype: object

### Can extract the smallest or largest values
You can use
- `.nsmallest(n)` to extract the `n` smallest values, or
- `.nlargest(n)` to extract the `n` largest values.

The default value of `n` is 5.

In [8]:
#The ten highest closing prices, largest first
google_stocks.nlargest(n=10)

Date
2019-04-29    1287.58
2019-04-26    1272.18
2018-07-26    1268.33
2019-10-25    1265.13
2019-04-23    1264.55
2018-07-25    1263.70
2019-04-25    1263.45
2019-10-24    1260.99
2019-10-23    1259.13
2019-04-24    1256.00
Name: Close, dtype: float64

### Counting Values

In [9]:
#Count how many times each distinct value occurs in the Series (most frequent first)
pokemon_series.value_counts()

Normal               65
Water                61
Grass                38
Psychic              35
Fire                 30
                     ..
Dragon / Electric     1
Fire / Dark           1
Ground / Electric     1
Fighting / Flying     1
Fire / Psychic        1
Name: Type, Length: 159, dtype: int64

In [10]:
#normalize=True returns each value's share of the total instead of its raw count;
#multiply by 100 and round to 2 decimal places to express the share as a percentage
pokemon_series.value_counts(normalize=True).mul(100).round(2)

Normal               8.03
Water                7.54
Grass                4.70
Psychic              4.33
Fire                 3.71
                     ... 
Dragon / Electric    0.12
Fire / Dark          0.12
Ground / Electric    0.12
Fighting / Flying    0.12
Fire / Psychic       0.12
Name: Type, Length: 159, dtype: float64

In [11]:
#Group the stock prices into buckets of fixed width, which makes their spread
#easier to read and to visualize.
#The bucket edges run 0, 200, ..., 1400; sort=False keeps the buckets in that
#order instead of ordering them by their counts.
bins = np.arange(start=0, stop=1600, step=200)
google_stocks.value_counts(bins=bins, sort=False)

(-0.001, 200.0]      595
(200.0, 400.0]      1568
(400.0, 600.0]       575
(600.0, 800.0]       380
(800.0, 1000.0]      207
(1000.0, 1200.0]     406
(1200.0, 1400.0]      93
Name: Close, dtype: int64

In [12]:
#Can also pass just the number of bins and let pandas work out the edges,
#rather than specifying the ranges beforehand.
#The bins come out with (nearly) equal widths spanning the data's min to max,
#so the disadvantage is that their edges are not round numbers (e.g. 256.113)
google_stocks.value_counts(bins=6, sort=False)

(48.581, 256.113]      1204
(256.113, 462.407]     1104
(462.407, 668.7]        507
(668.7, 874.993]        380
(874.993, 1081.287]     292
(1081.287, 1287.58]     337
Name: Close, dtype: int64

### Functions on Series

In [13]:
#.apply() calls the given function once per value; here Python's built-in
#round turns every closing price into the nearest whole number
google_stocks.apply(round)

Date
2004-08-19      50
2004-08-20      54
2004-08-23      54
2004-08-24      52
2004-08-25      53
              ... 
2019-10-21    1246
2019-10-22    1243
2019-10-23    1259
2019-10-24    1261
2019-10-25    1265
Name: Close, Length: 3824, dtype: int64

In [14]:
#Can also pass in a custom function
#Build a function that tells whether a pokemon has one type or two

def single_or_multi(types):
  """Classify a Pokemon type string as "Single" or "Multi".

  A "/" separates two types (e.g. "Grass / Poison"), so a string containing
  one is reported as "Multi"; anything else is "Single".
  """
  return "Multi" if "/" in types else "Single"

#Label every pokemon by checking its type for a "/", the marker of multiple types
pokemon_series.apply(func=single_or_multi)

Pokemon
Bulbasaur       Multi
Ivysaur         Multi
Venusaur        Multi
Charmander     Single
Charmeleon     Single
                ...  
Stakataka       Multi
Blacephalon     Multi
Zeraora        Single
Meltan         Single
Melmetal       Single
Name: Type, Length: 809, dtype: object

In [15]:
#Chain .value_counts() onto the labelled Series to total each label
(
    pokemon_series
    .apply(single_or_multi)
    .value_counts()
)

Multi     405
Single    404
Name: Type, dtype: int64

In [17]:
#Challenge: find out which day of the week had the most battles in Rev_War
#
#Hint - strftime('%A') gives the weekday name of a datetime:
#
#    from datetime import datetime
#    today = datetime.now()
#    today.strftime('%A')
#
#(The hint is written as comments because a bare triple-quoted string on the
#last line of a cell would be echoed as that cell's output.)

#Preview the Series loaded earlier: start dates as the index, states as the values
rev_wars.head()

Start Date
1774-09-01    Massachusetts
1774-12-14    New Hampshire
1775-04-19    Massachusetts
1775-04-19    Massachusetts
1775-04-20         Virginia
Name: State, dtype: object

In [22]:
#First reassign rev_wars, this time importing only the "Start Date" column
#(the State values are left out), so it holds just the parsed dates.
#.squeeze("columns") collapses the one-column DataFrame into a Series; it replaces
#read_csv's squeeze=True keyword, which was deprecated in pandas 1.4 and removed in 2.0.
rev_wars = pd.read_csv(
    rev_war_url,
    parse_dates=["Start Date"],
    usecols=["Start Date"],
).squeeze("columns")

rev_wars.head()

0   1774-09-01
1   1774-12-14
2   1775-04-19
3   1775-04-19
4   1775-04-20
Name: Start Date, dtype: datetime64[ns]

In [24]:
#then use above hint to extract specific days of the week
#create a function to apply to each date
def day_of_week(date):
  """Return the weekday name of ``date``, as given by its ``strftime('%A')``."""
  weekday_name = date.strftime("%A")
  return weekday_name

#Drop the missing dates first so day_of_week is only ever called on real dates
(
    rev_wars
    .dropna()
    .apply(day_of_week)
)

0       Thursday
1      Wednesday
2      Wednesday
3      Wednesday
4       Thursday
         ...    
227    Wednesday
228       Friday
229       Friday
230       Friday
231    Wednesday
Name: Start Date, Length: 228, dtype: object

In [25]:
#Finish the chain with .value_counts() to total the battles per weekday
(
    rev_wars
    .dropna()
    .apply(day_of_week)
    .value_counts()
)


Saturday     39
Friday       39
Wednesday    32
Thursday     31
Sunday       31
Tuesday      29
Monday       27
Name: Start Date, dtype: int64