### Working with Series

In [3]:
import pandas as pd

### Series
- Represents a one dimensional labeled array or single column of data.
- Values should share a consistent data type, since a Series is essentially a single column.
- To create a Series, call the Series constructor through the pandas namespace using dot notation: pd.Series(...).

#### Creating a Series object from a list.

In [4]:
# Create a Series from a list of strings
ice_cream = ['chocolate', 'vanilla', 'strawberry', 'rum raisin']

pd.Series(ice_cream)

0     chocolate
1       vanilla
2    strawberry
3    rum raisin
dtype: object

In [5]:
# Create another Series, this time from a list of ints, using the same constructor
# (booleans - True/False - can be used as well)
lottery = [1, 5, 19, 51, 7, 43, 12]

pd.Series(lottery)

0     1
1     5
2    19
3    51
4     7
5    43
6    12
dtype: int64

#### Creating a Series object from a dictionary.
- Pandas uses the dictionary keys as the index labels and the dictionary values as the data.

In [6]:
# The dictionary keys become the index labels and the values become the data
webster = {'Ardvark': 'Animal',
           'Banana': 'Fruit',
           'Cyan': 'Color'}

pd.Series(webster)

Ardvark    Animal
Banana      Fruit
Cyan        Color
dtype: object

#### Attributes on a Series object
- Objects in Python have attributes and methods.
- A Series is an object and therefore has both: attributes provide information about the object without modifying it, whereas methods perform a calculation or a task, such as manipulating a string.

In [7]:
# Create a Series and store it in a variable so it can be reused
s_char = ['Tall', 'Smart', 'Charming', 'Humble', 'Social']

# Convert the list to a Series
s = pd.Series(s_char)
s

0        Tall
1       Smart
2    Charming
3      Humble
4      Social
dtype: object

#### Access the attributes - press tab after the dot to see the methods and attributes
- Attributes do NOT require ( )
- Methods do require ( )
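- .index: shows the index labels (a Series has no .columns attribute - .columns belongs to a DataFrame)
- .shape: shows the dimensions as a tuple - row and column counts for a DataFrame, just the row count for a Series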

In [8]:
# Values - returns an array
s.values

array(['Tall', 'Smart', 'Charming', 'Humble', 'Social'], dtype=object)

In [9]:
# Index - for the default index this returns a RangeIndex showing start, stop and step
# (step is the increment between labels; it is 1 here)
s.index

RangeIndex(start=0, stop=5, step=1)

In [10]:
# returns O for object
s.dtype

dtype('O')

#### Methods on a Series object
- Pressing tab after the dot will reveal a list of methods and attributes.

In [11]:
# Create new series
prices = [2.99, 4.45, 3.36]

p = pd.Series(prices)

p

0    2.99
1    4.45
2    3.36
dtype: float64

In [12]:
# Call sum method on p series - adds all values of the prices
p.sum()

10.8

In [13]:
# Call the product method on the p Series - multiplies all of the values together
p.product()

44.706480000000006

In [14]:
# Call the mean method for an average over the series which is the sum divided by the length
p.mean()

3.6

In [15]:
# Call the median
p.median()

3.36

In [16]:
# Standard deviation
p.std()

0.759012516365837

#### Parameters and arguments
- for methods
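- The parameter is the name a method gives to one of its inputs, and the argument is the value we pass for it. Example: .sort_values(ascending=False) - ascending is the parameter and False is the argument. (On a DataFrame, sort_values also takes by="colname"; a Series has no by parameter.)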
- Indexes don't have to be unique.

In [17]:
# Create 2 lists and use weekdays as the index by passing it as the second positional argument
fruits = ['apple', 'orange', 'grape','plum', 'blueberry' ]

weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']

pd.Series(fruits, weekdays)

# Alternatively, write out the parameter names; keyword arguments can be passed in any order.
# Only this last expression is displayed as the cell's output.

pd.Series(data=fruits, index=weekdays)

Monday           apple
Tuesday         orange
Wednesday        grape
Thursday          plum
Friday       blueberry
dtype: object

#### Create a Series from a dataset with pd.read_csv( ) function

In [18]:
# import the data
p = pd.read_csv('../datasets/pokemon.csv')

p.head()

Unnamed: 0,Pokemon,Type
0,Bulbasaur,Grass
1,Ivysaur,Grass
2,Venusaur,Grass
3,Charmander,Fire
4,Charmeleon,Fire


In [19]:
# Pull one column by passing its name to the usecols parameter.
# read_csv returns a one-column DataFrame; .squeeze("columns") collapses it into a Series.
# (The squeeze=True keyword of read_csv was deprecated in pandas 1.4 and removed in 2.0.)
p = pd.read_csv('../datasets/pokemon.csv', usecols=['Pokemon']).squeeze("columns")

p.head()

0     Bulbasaur
1       Ivysaur
2      Venusaur
3    Charmander
4    Charmeleon
Name: Pokemon, dtype: object

In [20]:
# shape returns a tuple of dimension sizes; a Series is one-dimensional,
# so the tuple holds only the row count (721) and no column count
p.shape

(721,)

#### Head and Tail methods
- head( ) - returns the first 5 rows by default
- tail( ) - returns the last 5 rows by default

In [21]:
# Load only the 'Stock Price' column and collapse the one-column DataFrame into a Series.
# (.squeeze("columns") replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0.)
stock = pd.read_csv('../datasets/google_stock_price.csv', usecols=['Stock Price']).squeeze("columns")

# preview the first 5 rows - change the row count by passing a number inside the ( )
stock.head()

0    50.12
1    54.10
2    54.65
3    52.38
4    52.95
Name: Stock Price, dtype: float64

In [22]:
# preview the last 5 rows
stock.tail()

3007    772.88
3008    771.07
3009    773.18
3010    771.61
3011    782.22
Name: Stock Price, dtype: float64

In [23]:
# get the average
stock.mean()

334.31009296148744

#### Passing objects into built in functions
- use Pokemon and Stock series to demonstrate functions.

In [24]:
# get the number of rows from pokemon and stock with the built-in len()
# only the last expression in a cell is displayed, so the output shows len(stock)
len(p)
len(stock)

3012

In [25]:
# get the type
type(stock)

pandas.core.series.Series

In [26]:
# sorted() returns a new Python list in ascending order
# (alphabetical for strings; it works the same way for numeric values)
sortp = sorted(p)

# sortp

In [27]:
# sortp is already a list (sorted() returned one), so list() only makes a copy here;
# show the first 5 items by using an index slice
list(sortp)[0:5]

['Abomasnow', 'Abra', 'Absol', 'Accelgor', 'Aegislash']

In [28]:
# use the dict function to turn google stock into a dict, index are keys
s = dict(stock)

# s

In [29]:
# get the min/max from google - min returns the smallest value, max returns the largest value
# only the last expression (max) is displayed as the cell's output
min(stock)

max(stock)

782.22

#### Sort_values( )
- Default is ascending order
- To change default, use ascending = False
- A Series or DataFrame isn't modified permanently unless you reassign the result or use the parameter inplace=True.

In [30]:
# Re-import both Series.
# .squeeze("columns") turns each one-column DataFrame into a Series
# (it replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0).
stock = pd.read_csv('../datasets/google_stock_price.csv', usecols=['Stock Price']).squeeze("columns")
p = pd.read_csv('../datasets/pokemon.csv', usecols=['Pokemon']).squeeze("columns")

In [31]:
# sort the values in ascending order - strings will be alphabetical
# we can method chain here (this result is not displayed; only the cell's last expression is)
p.sort_values().head()

# make descending order
p.sort_values(ascending=False).head()

717     Zygarde
633    Zweilous
40        Zubat
569       Zorua
570     Zoroark
Name: Pokemon, dtype: object

In [32]:
# sort on a numeric series and get the top 3
stock.sort_values(ascending=False).head(3)

3011    782.22
2859    776.60
3009    773.18
Name: Stock Price, dtype: float64

In [33]:
# stock is still in its original order here: the sort above returned a new Series,
# and that result was neither reassigned nor applied with inplace=True
stock

0        50.12
1        54.10
2        54.65
3        52.38
4        52.95
         ...  
3007    772.88
3008    771.07
3009    773.18
3010    771.61
3011    782.22
Name: Stock Price, Length: 3012, dtype: float64

In [34]:
# modify to be in descending order permanently
stock.sort_values(ascending=False, inplace=True)

In [35]:
# now test it, it in descending order now.
stock

3011    782.22
2859    776.60
3009    773.18
3007    772.88
3010    771.61
         ...  
12       50.74
10       50.70
0        50.12
9        50.07
11       49.95
Name: Stock Price, Length: 3012, dtype: float64

#### Sort_index

In [36]:
# .squeeze("columns") turns the one-column DataFrame into a Series
# (it replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0)
p = pd.read_csv('../datasets/pokemon.csv', usecols=['Pokemon']).squeeze("columns")

In [37]:
# sorting in place will shuffle the index out of order - look at the 1st position, its 717!
p.sort_values(ascending=False, inplace=True)

p.head()

717     Zygarde
633    Zweilous
40        Zubat
569       Zorua
570     Zoroark
Name: Pokemon, dtype: object

In [38]:
# sort by the index labels to put the rows back in their original order
# (this reorders the rows by label; it does not create a new index)
p.sort_index(inplace=True)
p.head()

0     Bulbasaur
1       Ivysaur
2      Venusaur
3    Charmander
4    Charmeleon
Name: Pokemon, dtype: object

#### The "in" keyword
- used to check if a value is in a series or index

In [39]:
# Re-import both Series.
# .squeeze("columns") turns each one-column DataFrame into a Series
# (it replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0).
stock = pd.read_csv('../datasets/google_stock_price.csv', usecols=['Stock Price']).squeeze("columns")
p = pd.read_csv('../datasets/pokemon.csv', usecols=['Pokemon']).squeeze("columns")

In [40]:
# the in keyword returns True or False depending on whether the value exists within a list
# Example
nums = [1,2,3,4,5,6]

2 in nums

True

In [41]:
# you can search by using values as well
'Charmander' in p.values

True

#### Extract by index position 
- Extracting by index position uses square brackets [ ] containing the position, e.g. [0]; positions start at 0.

In [42]:
# Re-import the pokemon csv as a Series.
# .squeeze("columns") turns the one-column DataFrame into a Series
# (it replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0).
p = pd.read_csv('../datasets/pokemon.csv', usecols=['Pokemon']).squeeze("columns")
p.head(2)

0    Bulbasaur
1      Ivysaur
Name: Pokemon, dtype: object

In [51]:
# Get the 1st row - index starts at 0
p[0]

'Bulbasaur'

In [52]:
# how about the 200th value? positions start at 0, so the 200th value is at index 199
# (index 200 would return the 201st value)
p[199]

'Misdreavus'

In [54]:
# get multiple positions using multiple index positions 100,200,300 th values
p[[99,199,299]]

99        Voltorb
199    Misdreavus
299        Skitty
Name: Pokemon, dtype: object

In [56]:
# several ways to extract by index position (only the last expression is displayed)

# the first 20 values (positions 0 through 19)
p[:20]

# everything except the last 10 values
p[:-10]

# extract positions 50 through 59 - the stop position (60) is not included
p[50:60]

50      Dugtrio
51       Meowth
52      Persian
53      Psyduck
54      Golduck
55       Mankey
56     Primeape
57    Growlithe
58     Arcanine
59      Poliwag
Name: Pokemon, dtype: object

#### Extract by label
- Rather than extract by position, use the label. 
- For this example, we will change the index to be the Pokemon column.
- Pandas will raise an error if the label does not exist.

In [61]:
# Re-import the data set with the Pokemon column as the index.
# One data column (Type) remains, so .squeeze("columns") collapses the DataFrame into a Series
# (it replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0).
p = pd.read_csv('../datasets/pokemon.csv', index_col=['Pokemon']).squeeze("columns")
p.head(10)

Pokemon
Bulbasaur     Grass
Ivysaur       Grass
Venusaur      Grass
Charmander     Fire
Charmeleon     Fire
Charizard      Fire
Squirtle      Water
Wartortle     Water
Blastoise     Water
Caterpie        Bug
Name: Type, dtype: object

In [63]:
# We can still extract by index position, but on a label index use .iloc to say so explicitly.
# Plain [] with integer keys on a string-labelled index relies on a positional fallback
# that newer pandas versions deprecate.
p.iloc[[100,200]]

# or use the Pokemon label (only this last expression is displayed)
p['Charizard']

'Fire'

In [62]:
# get multiple values using a list of labels
p[['Squirtle', 'Caterpie']]

Pokemon
Squirtle    Water
Caterpie      Bug
Name: Type, dtype: object

In [64]:
# you can still use a between range with labels-with labels the last position is included (not so when using the index position)
p['Charmander':'Wartortle']

Pokemon
Charmander     Fire
Charmeleon     Fire
Charizard      Fire
Squirtle      Water
Wartortle     Water
Name: Type, dtype: object

#### The .get( ) method

In [65]:
# you can get with the index positions or label
# sort first by index
p.sort_index(inplace=True)

In [72]:
p.head(6)

Pokemon
Abomasnow       Grass
Abra          Psychic
Absol            Dark
Accelgor          Bug
Aegislash       Steel
Aerodactyl       Rock
Name: Type, dtype: object

In [70]:
# get the first value by position (0) or get a value by its label
# NOTE(review): integer keys on a string-labelled index rely on positional fallback -
# confirm the installed pandas version still supports it
# only the last expression (the label lookup) is displayed
p.get(0)

p.get('Absol')

'Dark'

In [74]:
# get several values at once by passing a list of index positions or a list of labels to the method
# (only the last expression is displayed)
p.get([0,3,5])

p.get(['Accelgor', 'Aegislash'])

Pokemon
Accelgor       Bug
Aegislash    Steel
Name: Type, dtype: object

In [78]:
# when a key is missing, get() does not raise an error - it returns the default value instead
# the default is None unless you supply your own, e.g. default='Does not exist'
# if a list contains even one key that doesn't exist, only the default is returned
p.get(key=[0,1,2,1000], default = 'Does not exist')

'Does not exist'

### Math methods

In [43]:
# Re-import the data set as a Series.
# .squeeze("columns") turns the one-column DataFrame into a Series
# (it replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0).
stock = pd.read_csv('../datasets/google_stock_price.csv', usecols=['Stock Price']).squeeze("columns")
stock.head(3)

0    50.12
1    54.10
2    54.65
Name: Stock Price, dtype: float64

In [44]:
# let's count the valid values in the series - count() excludes null values whereas len() does not
stock.count()

3012

In [45]:
# sum the values - this is all the prices over the years summed up
stock.sum()

1006942.0

In [46]:
# the average - this is the same as taking stock.sum()/stock.count()
stock.mean()

334.31009296148744

In [47]:
# standard deviation - a measure of how spread out the values are around the mean
stock.std()

173.18720477113106

In [48]:
# largest close value, then smallest close value
# only the last expression (min) is displayed as the cell's output
stock.max()
stock.min()

49.95

In [49]:
# mode - value that occurs most frequently
stock.mode()

0    291.21
dtype: float64

In [50]:
# summary stats - gives all of the above in one command + percentiles
stock.describe()

count    3012.000000
mean      334.310093
std       173.187205
min        49.950000
25%       218.045000
50%       283.315000
75%       443.000000
max       782.220000
Name: Stock Price, dtype: float64

#### Value_counts method
- Similar to a pivot summary
- Default for order is ascending = False so it provides a descending count. Change with ascending=True.

In [79]:
# get a count using the above import with Pokemon as the index
# this shows us how many times something occurs
p.value_counts()

Water       105
Normal       93
Grass        66
Bug          63
Psychic      47
Fire         47
Rock         41
Electric     36
Ground       30
Poison       28
Dark         28
Fighting     25
Dragon       24
Ice          23
Ghost        23
Steel        22
Fairy        17
Flying        3
Name: Type, dtype: int64

In [80]:
# using sum with value counts - this is the total count or length of the series
p.value_counts().sum()

721

In [81]:
# get the top 3 and the bottom 3
# only the last expression (the bottom 3) is displayed as the cell's output
p.value_counts().head(3)

p.value_counts().tail(3)

Steel     22
Fairy     17
Flying     3
Name: Type, dtype: int64

#### Using apply( ) to invoke a function on every Series value

In [85]:
# Re-import the google stock prices as a Series.
# .squeeze("columns") collapses a one-column DataFrame into a Series
# (it replaces read_csv's squeeze=True keyword, which was removed in pandas 2.0).
google = pd.read_csv('../datasets/google_stock_price.csv').squeeze("columns")
google.head(6)

0    50.12
1    54.10
2    54.65
3    52.38
4    52.95
5    53.90
Name: Stock Price, dtype: float64

In [87]:
# call a custom function on each value and apply it to the series values
def classify_performance(num):
    """Label a single price with a performance tier.

    Parameters
    ----------
    num : number
        The value to classify.

    Returns
    -------
    str
        'Average Performance' when num is below 300,
        'Good Performance' when num is at least 300 and below 650,
        'Outstanding Performance' otherwise.
    """
    # Guard clauses: test the tiers from lowest to highest and return on the first match.
    if num < 300:
        return 'Average Performance'
    if 300 <= num < 650:
        return 'Good Performance'
    return 'Outstanding Performance'
    

In [88]:
# apply the function now
google.apply(classify_performance)

0           Average Performance
1           Average Performance
2           Average Performance
3           Average Performance
4           Average Performance
                 ...           
3007    Outstanding Performance
3008    Outstanding Performance
3009    Outstanding Performance
3010    Outstanding Performance
3011    Outstanding Performance
Name: Stock Price, Length: 3012, dtype: object

#### Map method