In [1]:
import numpy as np
import pandas as pd

In [13]:
# A Series is pandas' one-dimensional counterpart to a NumPy array, but more
# general and flexible: every value is paired with an explicit index label.

data = pd.Series(data=list(range(1, 6)), dtype=np.int8)
data

0    1
1    2
2    3
3    4
4    5
dtype: int8

In [14]:
# .values returns the Series' values as a familiar NumPy array; the array
# carries the same dtype as the Series (int8 here).
data.values

array([1, 2, 3, 4, 5], dtype=int8)

In [15]:
# whereas .index returns the index labels of the series. This Series was
# built without an explicit index, so the result is a default RangeIndex.
data.index

RangeIndex(start=0, stop=5, step=1)

In [16]:
# Integer slices work as they do on NumPy arrays and Python lists: the stop
# position is excluded, so 1:3 selects the second and third elements.
data[1:3]

1    2
2    3
dtype: int8

In [20]:
# Unlike NumPy, whose index is implicit (just the position), pandas stores the
# index explicitly, so the labels need not be integers -- strings work too.
data = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
data

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [21]:
# Look up a single value by its index label, dictionary-style.
data['b']

2

In [22]:
# Slicing with labels also works; note that, unlike a positional slice,
# the end label ('d') is included in the result.
data['b':'d']

b    2
c    3
d    4
dtype: int64

In [27]:
# Index labels need not be sequential, or even unique. Looking up a label
# that occurs more than once (1 here) returns every matching entry as a
# Series, while a unique label (9) returns a single value.
data = pd.Series(
    list(range(1, 7)),
    index=[3, 1, 4, 1, 5, 9],
)
print(data[9], data[1], sep='\n')

6
1    2
1    4
dtype: int64


In [28]:
# A Series can really be thought of as a specialised dictionary that maps
# index labels to values, and it can be constructed directly from a dict.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
# Sort by state name so that the display order -- and the label slice
# 'California':'Illinois' taken later in this notebook -- does not depend on
# the ordering of the dict keys.
population = pd.Series(population_dict).sort_index()
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [29]:
# Dictionary-style lookup: the state name acts as the key.
population['California']

38332521

In [30]:
# Unlike a dict, a Series also supports array-style slicing by label.
# The slice follows the order of the index and includes the end label.
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

In [33]:
# A DataFrame generalises a 2D NumPy array: it is a collection of aligned
# Series objects, one per column. "Aligned" means that every column shares
# the same row index. Let's build a second Series on the same state index
# that population uses.

area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
# Sort by state name so the printed order does not depend on the ordering
# of the dict keys.
area = pd.Series(area_dict).sort_index()
print(area)

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64


In [36]:
# Combine the two Series into one DataFrame; each dict key becomes a column
# and the rows are aligned on the Series' state index. The column order is
# given explicitly so it does not depend on the ordering of the dict keys.
states = pd.DataFrame({'population': population, 'area': area},
                      columns=['area', 'population'])
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [37]:
# The row labels of the DataFrame: the state names.
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [38]:
# The column labels: one per Series passed to the DataFrame constructor.
states.columns

Index(['area', 'population'], dtype='object')

In [42]:
# Note that indexing differs from NumPy: on a 2D array, data[0] returns the
# first row, whereas on a DataFrame, data['column0'] returns the column with
# that label, as a Series named after the column.
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [44]:
# Make a single-column DataFrame from one Series: `columns` supplies the
# column label, and the Series' index becomes the row index.
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [45]:
# A list of dicts also works: each dict becomes one row, and the dict keys
# become the column labels.
data = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4
