In [5]:
import numpy as np
import pandas as pd

In [6]:
### A Series is pandas' counterpart to a one-dimensional NumPy array, but it is
# more general and flexible: every value is paired with an explicit index label.

data = pd.Series([1, 2, 3, 4, 5], dtype='int8')
data

0    1
1    2
2    3
3    4
4    5
dtype: int8

In [7]:
# .values returns a familiar numpy array of the series values
data.values

array([1, 2, 3, 4, 5], dtype=int8)

In [8]:
# whereas .index returns the index labels of the series
data.index

RangeIndex(start=0, stop=5, step=1)

In [9]:
# Can slice data as usual to access elements
data[1:3]

1    2
2    3
dtype: int8

In [10]:
# However, because pandas has an explicitly defined index (compared to
# numpy's implicitly defined index) we can also set our index to other
# formats, such as strings
data = pd.Series([1, 2, 3, 4, 5], 
                 index=['a', 'b', 'c', 'd', 'e'])
data

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [11]:
data['b']

2

In [12]:
data['b':'d']

b    2
c    3
d    4
dtype: int64

In [13]:
# Index labels do not have to be sequential, sorted, or even unique.
# Use .loc so the lookup is explicitly label-based (never positional).
data = pd.Series([1, 2, 3, 4, 5, 6],
                 index=[3, 1, 4, 1, 5, 9])
print(data.loc[9])  # a unique label returns a scalar
print(data.loc[1])  # a duplicated label returns a Series with every match

6
1    2
1    4
dtype: int64


In [14]:
# A Series can be thought of as an ordered dictionary that maps index
# labels to values, and it can be constructed directly from a dict.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
# Sort by label explicitly instead of relying on how the constructor orders
# dict keys (that ordering differs between pandas releases). The label
# slices used further down assume alphabetical order.
population = pd.Series(population_dict).sort_index()
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [15]:
population['California']

38332521

In [16]:
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

In [17]:
# A DataFrame generalizes a 2D NumPy array: it is a collection of aligned
# Series objects, where "aligned" means every column shares the same row
# index. Build a second Series keyed by the same state labels as population.

area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
# Sort by label explicitly instead of relying on how the constructor orders
# dict keys (that ordering differs between pandas releases).
area = pd.Series(area_dict).sort_index()
print(area)

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64


In [18]:
states = pd.DataFrame({'population': population, 'area': area})
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [19]:
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [20]:
states.columns

Index(['area', 'population'], dtype='object')

In [21]:
# Note that in 2D numpy arrays, data[0] will return the first row,
# whereas in dataframes, data['column0'] will return the first col
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [22]:
# Make a dataframe from a single series
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [23]:
# from a list of dicts, where each dict is a row of data
data = [{'a': i, 'b': 2*i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [24]:
# If some keys are missing, Pandas will fill them with NaNs
pd.DataFrame([{'a':1, 'b':2}, {'b':3, 'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [25]:
# And DFs can be created from series objects as well
pd.DataFrame({'population': population,
              'area': area})


Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [27]:
# DFs can also be created from a 2D numpy array
pd.DataFrame(np.random.random((3, 2)), 
             columns=['a', 'b'], index=[0, 1, 2])

Unnamed: 0,a,b
0,0.554197,0.292097
1,0.362426,0.747606
2,0.300048,0.199283


In [28]:
# Or from a numpy structured array
struct = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
pd.DataFrame(struct)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [41]:
# Dataframe index objects can be sliced and indexed like a
# numpy array, the only difference is that an index is immutable

indA = pd.Index([1, 3, 5, 7, 9])
# indA[0] = 0   FAILS! Index is immutable
print(indA[2:4])
print(indA[::2])

Int64Index([5, 7], dtype='int64')
Int64Index([1, 5, 9], dtype='int64')


In [44]:
# Index objects support set-style operations. Use the named methods rather
# than the &, | and ^ operators: on recent pandas releases those operators
# act element-wise (as they do on a Series) instead of as set operations.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

ind_both = indA.intersection(indB)              # elements in both indices
ind_either = indA.union(indB)                   # elements in at least one index
ind_one_only = indA.symmetric_difference(indB)  # union minus intersection

print(ind_both)
print(ind_either)
print(ind_one_only)

Int64Index([3, 5, 7], dtype='int64')
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Int64Index([1, 2, 9, 11], dtype='int64')


In [48]:
# Indexing a Series can be confusing when it has an explicit
# (non-default) index: integer slices and label slices behave differently

a = pd.Series(np.arange(4), index=['a','b','c','d'])
print(a[0:2])
print(a['a':'c'])

a    0
b    1
dtype: int64
a    0
b    1
c    2
dtype: int64


In [58]:
# The two outputs above differ because a label slice includes its end
# label, whereas a positional slice excludes its end position.

# Integer labels that are not in the default 0..n-1 order are another source of confusion
a = pd.Series(np.arange(4), index=[3,1,0,2])
print(a[:2])      # Slicing with [] is positional here: it returns the first
                  # two elements in stored order, not the labels up to 2
print(a.iloc[:2]) # iloc indexes by integer position (end-exclusive)
print(a.loc[:2])  # loc indexes by label: every element from the start
                  # up to and including label 2!

3    0
1    1
dtype: int64
3    0
1    1
dtype: int64
3    0
1    1
0    2
2    3
dtype: int64


In [59]:
# Always use loc and iloc to be explicit about whether an access is
# label-based or position-based.

# Build a dataframe from two series that share the same state labels
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
# Sort the rows by label explicitly: the label slices used later
# (e.g. .loc[:'Illinois']) assume alphabetical order, and the constructor's
# ordering of dict keys differs between pandas releases.
data = pd.DataFrame({'area': area, 'pop': pop}).sort_index()
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [62]:
# Two different styles of data access for column label 'area'
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [67]:
# This is only possible for string-like column labels that are
# also not names of dataframe methods (like pop)
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [68]:
# Both access styles return the same column. Compare the contents with
# .equals(): an identity check (`is`) only passes when pandas happens to hand
# back the very same cached Series object, which is an implementation detail
# and is not guaranteed (e.g. under copy-on-write).
data.area.equals(data['area'])

True

In [70]:
# Not equivalent because pop() is a method of dataframes
data['pop'] is data.pop

False

In [72]:
# Can use usual dictionary assignments to create new columns
data['density'] = data['pop']/data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [73]:
# Returns 2d numpy array
data.values

array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [75]:
# Many numpy array methods are available to the dataframe
data.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
pop,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


In [76]:
# Access the raw values
data.values[0]

array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [91]:
# access data as series
data.iloc[0]

area       4.239670e+05
pop        3.833252e+07
density    9.041393e+01
Name: California, dtype: float64

In [94]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [95]:
data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [116]:
## Fancy indexing, mask for choosing rows
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
Florida,19552860,120.0
New York,19651127,139.076746


In [117]:
# Altering values in the dataframe
data.iloc[0, 2] = 90
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Florida,170312,19552860,120.0
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [118]:
data.loc['Florida', 'density'] = 120
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Florida,170312,19552860,120.0
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [119]:
# .loc with a single row label and a single column label returns a plain
# scalar value, not a reference into the dataframe, so rebinding x afterwards
# leaves the dataframe unchanged
x = data.loc['Texas', 'density']
x

38.018740422791531

In [120]:
x = 50
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Florida,170312,19552860,120.0
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [126]:
# Furthermore, slicing and masks operate on rows
data[1:3]

Unnamed: 0,area,pop,density
Florida,170312,19552860,120.0
Illinois,149995,12882135,85.883763


In [127]:
data[data.density>100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,120.0
New York,141297,19651127,139.076746


In [135]:
### NumPy ufuncs also work with series and dataframes
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [136]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [137]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                  columns = ['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [138]:
np.sin(df * np.pi/4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [140]:
## Binary operations between two Series (or DataFrames) first align the
## operands on their index labels, which is convenient when the data is
## incomplete.

area_by_state = {
    'Alaska': 1723337,
    'Texas': 695662,
    'California': 423967,
}
population_by_state = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
}
area = pd.Series(area_by_state, name='area')
population = pd.Series(population_by_state, name='population')
population.div(area)

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [143]:
# The result is indexed by the union of the two indices; any label that is
# missing from either series yields NaN. This is not a division artifact
# (such as dividing by zero) -- addition behaves the same way:
(population + area).values

array([       nan,  38756488.,        nan,  27143855.])

In [145]:
# The method form of an operator accepts a fill_value that stands in for a
# label missing from one of the operands. With fill_value=0 the missing
# entry is treated as 0 during the addition, so the result for that label
# is simply the value from the series that does have it.
population.add(area, fill_value=0)

Alaska         1723337.0
California    38756488.0
New York      19651127.0
Texas         27143855.0
dtype: float64

In [149]:
# Similar alignment occurs for both columns and indices when performing
# operations with a dataframe
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,9,15
1,14,14


In [151]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns = list('BAC'))
B

Unnamed: 0,B,A,C
0,8,6,1
1,3,8,1
2,9,8,9


In [152]:
A + B

Unnamed: 0,A,B,C
0,15.0,23.0,
1,22.0,17.0,
2,,,


In [159]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,15.0,23.0,14.0
1,22.0,17.0,14.0
2,21.0,22.0,22.0


In [163]:
## With broadcasting, subtracting a series from a dataframe acts on rows

df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [168]:
df - pd.Series([2,1,1,1], index=list('ABCD'))

Unnamed: 0,A,B,C,D
0,4,8,1,5
1,5,3,2,6
2,5,1,4,3


In [167]:
# Unless the axis is specified
df.sub(pd.Series([2,1,1], index=[0,1,2]), axis=0)

Unnamed: 0,A,B,C,D
0,4,7,0,4
1,6,3,2,6
2,6,1,4,3
