In [1]:
import numpy as np
import pandas as pd

In [2]:
# there are three pandas data structures

# 1. Series
# 2. DataFrame
# 3. Index

In [31]:
# Series is a 1d array with a built-in index starting at 0

data = pd.Series([1,2,3,4,5])
data

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
# We can access just the values by using .values

data.values

array([1, 2, 3, 4, 5])

In [5]:
# the index is an object

data.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# slicing and selection is still valid as before

data[1]

2

In [7]:
data[1:3]

1    2
2    3
dtype: int64

In [8]:
# Pandas Series are *explicitly* defined, which means that we can use
# strings to call an index instead of integers

data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data['a']

0.25

In [9]:
# Series can take Python Dictionaries and easily convert them

population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [10]:
population['California']

38332521

In [12]:
# We can even slice the string indices

population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [13]:
# data can be a scalar in Series, repeated o fill the specifid index

pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [14]:
# interestingly, we can set the index to not even include some of the values

pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

In [16]:
# Next is DataFrames, which can be thought of as a generalization of a Numpy array.
# lets add some area data for each of the states we put in a Series above

area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [17]:
# combining these two series into a DataFrame

states = pd.DataFrame({'population': population,
                        'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [19]:
# We can also access the index and column info

print(states.index)
print(states.columns)

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')


In [20]:
# DataFrames can be constructed directly from Numpy structured arrays

A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [22]:
# notice that column formatting is already correct

A = pd.DataFrame(A)
A

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [32]:
# Index also follow the same set arithmetic as Python's built in sets
# For some reason, i'm not able to grasp what's going on here.

indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

print(indA & indB) # intersection
print(indA | indB) # union
print(indA ^ indB) # symmetric difference

Index([0, 3, 5, 7, 9], dtype='int64')
Index([3, 3, 5, 7, 11], dtype='int64')
Index([3, 0, 0, 0, 2], dtype='int64')


In [33]:
###