In [1]:
%pwd

'/media/vivek/Everything/personal_projects/habits'

In [3]:
import pandas as pd


# Series

This notebook explores all functionalities related to a Series datatype in pandas i.e. **pd.Series**

In [4]:
pd.Series?

In [5]:
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'])

In [6]:
s

0                7
1       Heisenberg
2             3.14
3      -1789710578
4    Happy Eating!
dtype: object

In [33]:
# now let's create a series with an index
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'], index=['A', 'Z', 'C', 'Y', 'E'])

**NOTE** Series don't have a `columns` property so doing

```
s.columns
```

will throw an error

In [34]:
s

A                7
Z       Heisenberg
C             3.14
Y      -1789710578
E    Happy Eating!
dtype: object

In [37]:
s.map(lambda a: type(a))      # will save it to the same column

# or

s.apply(lambda a: type(a))    # will return as a new column

A      <class 'int'>
Z      <class 'str'>
C    <class 'float'>
Y      <class 'int'>
E      <class 'str'>
dtype: object

In [12]:
labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(np.random.randn(5), index=labels)

In [13]:
s

a   -0.388755
b   -0.366503
c   -1.752408
d    0.079943
e    0.108645
dtype: float64

In [14]:
'b' in s

True

invoke a value by it's index

In [27]:
s['b']

# or

s.loc['b']

-0.36650285436318358

In [16]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

Convert series with an index into a dict

In [17]:
s_dict = s.to_dict()
s_dict

{'a': -0.38875505849394393,
 'b': -0.3665028543631836,
 'c': -1.7524082546256938,
 'd': 0.07994275562643542,
 'e': 0.1086447793929061}

totaling the values of a series

In [18]:
s.sum()

-2.3190786324634796

In [20]:
s.cumsum()

a   -0.388755
b   -0.755258
c   -2.507666
d   -2.427723
e   -2.319079
dtype: float64

In [21]:
s.diff()

a         NaN
b    0.022252
c   -1.385905
d    1.832351
e    0.028702
dtype: float64

In [46]:
s = pd.Series(s_dict, index=['b', 'e', 'a', 'd', 'f'])
s

b    1.199811
e   -0.165838
a   -0.435313
d   -0.412888
f         NaN
dtype: float64

In [49]:
# Check to see if there are null values
pd.isnull(s)

b    False
e    False
a    False
d    False
f     True
dtype: bool

In [51]:
# If you want to the rows in which there are/aren't null values, do 
s[pd.notnull(s)]

b    1.199811
e   -0.165838
a   -0.435313
d   -0.412888
dtype: float64

In [53]:
# You can also do a dropna on the Series and it will drop the rows with null values
s.dropna()

b    1.199811
e   -0.165838
a   -0.435313
d   -0.412888
dtype: float64

In [54]:
s * 2

b    2.399621
e   -0.331677
a   -0.870626
d   -0.825776
f         NaN
dtype: float64

In [55]:
s[:3]

b    1.199811
e   -0.165838
a   -0.435313
dtype: float64

In [64]:
s.ix[3, 'd']

-0.41288804595034079

In [63]:
s.get_value(3, 'b')

-0.41288804595034079

## Indexing

In [89]:
s.index

Index(['A', 'Z', 'C', 'Y', 'E'], dtype='object')

In [90]:
# you can also use the set_index method to set a new index like so.
s.set_index = ['A', 'B', 'C', 'D', 'E']

In [91]:
# which means, you can access the value of the Series using the index like so...
s['A']

7

In [92]:
# You can also convert a dictionary into a Series like so...
d = {'Chicago': 1000, 'New York': 1300, 'Portland': 900, 'San Francisco': 1100, 'Austin': 450, 'Boston': None}

In [93]:
# This will automatically convert the 'keys' of the dictionary into the index of the Series
cities = pd.Series(d)

In [94]:
cities.index

Index(['Austin', 'Boston', 'Chicago', 'New York', 'Portland', 'San Francisco'], dtype='object')

In [95]:
cities['Chicago']

1000.0

## Selection

In [96]:
# Selecting multiple keys within a Series
cities[['Chicago', 'Austin', 'New York']]

Chicago     1000.0
Austin       450.0
New York    1300.0
dtype: float64

In [97]:
# You can also have a conditional selection on a Series like so...
cities[cities < 1000]
# which means - select all cities where values < 1000. Here the key inside the [] automatically assumes, it is
#  being applied to the Seires value.

Austin      450.0
Portland    900.0
dtype: float64

In [98]:
# you can also split the condition saperately like so...
condition = cities > 800
# and then apply the condition to the Series
cities[condition]

Chicago          1000.0
New York         1300.0
Portland          900.0
San Francisco    1100.0
dtype: float64

In [99]:
# you can also apply conditional selection on the index
'Seattle' in cities or 'San Francisco' in cities
# checking if 'Seattle' or 'SF' is present as index within the Series

True

In [100]:
# or, select 'Chicago' if present in cities, and show the value
cities['Chicago' in cities]

nan

## Update

In [101]:
# you can update the Series values on the fly. Let us change all values less than 1000 to 750 by default
cities[cities < 1000] = 750

In [102]:
cities

Austin            750.0
Boston              NaN
Chicago          1000.0
New York         1300.0
Portland          750.0
San Francisco    1100.0
dtype: float64

In [114]:
# NaN will be shown against all the values which is has a null value
cities.isnull()
# this is the condition against the Series

Austin           False
Boston            True
Chicago          False
New York         False
Portland         False
San Francisco    False
dtype: bool

In [120]:
# This line will get you all the rows/keys which has "null" or NaN or nan as values
cities[cities.isnull()].fillna('UNDEFINED')

Boston    UNDEFINED
dtype: object

In [121]:
cities

Austin            750.0
Boston              NaN
Chicago          1000.0
New York         1300.0
Portland          750.0
San Francisco    1100.0
dtype: float64

In [118]:
# If you want to check if there are "any" null values within a series or even a dataframe, use the following cmd:
cities.isnull().values.any()

True

In [39]:
# Create another condition that we want to update
is_null_condition = cities.isnull()

In [43]:
# now update all the conditional values in the Series with a default value of 100
cities[is_null_condition] = 100

In [44]:
cities

Austin            750.0
Boston            100.0
Chicago          1000.0
New York         1300.0
Portland          750.0
San Francisco    1100.0
dtype: float64

## Operations

In [54]:
cities / 3
# will automatically give respected values divided by 3

Austin           250.000000
Boston            33.333333
Chicago          333.333333
New York         433.333333
Portland         250.000000
San Francisco    366.666667
dtype: float64

In [60]:
# a bit complex conditional query
cities[cities > 1000] / 100
# divide all values of cities which have values greater than 1000

New York         13.0
San Francisco    11.0
dtype: float64

In [66]:
np.square(cities[cities > 1000])
# square all values for cities whose value is greater than 1000

New York         1690000.0
San Francisco    1210000.0
dtype: float64

**Here you need to remember that these operations don't automatically update the original series, it merely returns a new one** 

In [69]:
# Try adding two Series with a few common keys and a few new ones, see what happens
cities[['New York', 'San Francisco', 'Chicago']] + cities[['Chicago', 'Austin', 'New York']]
# pandas automatically identifies the values with the same keys and adds them and for the ones which are new,
#  it adds an 'NaN' value by default. In order to combine two Series, both the Series should have all the same keys

Austin              NaN
Chicago          2000.0
New York         2600.0
San Francisco       NaN
dtype: float64