# Pandas fundamental data structures
## 1. Series Object
## 2. DataFrame Object
## 3. Index Object

In [4]:
import numpy as np
import pandas as pd

## 1. The Pandas Series object

In [7]:
# A Series built from a plain list receives a default integer index.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

# The index and the underlying values are both available as attributes.
data.index
data.values

# Square brackets retrieve a single element or a slice.
data[1:3]
data[1]

## series as a generalized numpy array
# The index can be supplied explicitly and does not have to be made of integers.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=['a', 'b', 'c', 'd'],
)
data
data['b']

## non-contiguous or non-sequential indices:
# Integer labels are allowed in any order; lookup is by label.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=[2, 5, 3, 7],
)
data
data[5]

## series as a specialized dictionary
# A dict maps directly onto a Series: keys become the index, values the data.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
population
population['California']
# Unlike a dict, a Series also supports slicing between two labels.
population['California':'Illinois']

## summary: constructing series objects
# General form:
#     pd.Series(data, index=index)
# `index` is optional; `data` may be any of the following.

# A list or NumPy array: the index defaults to an integer sequence.
pd.Series(data=[2, 4, 6])
# A scalar: the value is repeated for every entry of the given index.
pd.Series(data=5, index=[100, 200, 300])
# A dictionary: the keys are used as the index.
pd.Series(data={2: 'a', 1: 'b', 3: 'c'})
# A dictionary plus an explicit index: only the listed keys are kept,
# in the order given.
pd.Series(data={2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])

3    c
2    a
dtype: object

# 2. The Pandas DataFrame object

In [9]:
# The Pandas DataFrame object
#
# If a Series is an analog of a one-dimensional array with flexible indices,
# a DataFrame is an analog of a two-dimensional array with both flexible row
# indices and flexible column names.
# Think of a DataFrame as a sequence of aligned Series objects; here,
# "aligned" means that they share the same index.

## DataFrame as a generalized numpy array
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

# Combine the two Series (keyed by the same state names) into one table.
states = pd.DataFrame({'population': population,
                       'area': area})
states

# The index attribute holds the row labels.
states.index
# Additionally, the DataFrame has a columns attribute, which is an Index
# object holding the column labels:
states.columns


## DataFrame as a specialized dictionary
# Similarly, we can also think of a DataFrame as a specialization of a
# dictionary. Where a dictionary maps a key to a value, a DataFrame maps a
# column name to a Series of column data.
states['area']

## Constructing DataFrame objects
# From a single Series object
pd.DataFrame(population, columns=['population'])

# From a list of dicts
data = [{'a': i, 'b': 2 * i}
        for i in range(3)]
pd.DataFrame(data)
# If some keys in the dictionaries are missing, Pandas will fill them in
# with NaN (i.e., "not a number") values
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

### from a dictionary of series objects
pd.DataFrame({'population': population,
              'area': area})

### from a 2-D numpy array
# A seeded generator is used so that re-running the notebook reproduces
# the same values.
pd.DataFrame(np.random.default_rng(0).random((3, 2)),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

### from a numpy structured array
# The field names of the structured dtype ('A', 'B') become the column names.
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A
pd.DataFrame(A)


Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# 3. The Pandas Index Object

In [12]:
"""We have seen here that both the Series and DataFrame objects contain an explicit index 
that lets you reference and modify data. This Index object is an interesting structure in 
itself, and it can be thought of either as an immutable array or as an ordered set 
(technically a multi-set, as Index objects may contain repeated values)"""

ind = pd.Index([2, 3, 5, 7, 11])
ind

## index as an immutable array
# An Index supports the usual read-only indexing and slicing notation.
ind[1]
ind[::2]
print(ind.size, ind.shape, ind.ndim, ind.dtype) # attributes of index object

# One difference between Index objects and NumPy arrays is that indices are
# immutable: item assignment raises a TypeError. The error is caught and
# printed so the demonstration does not stop the rest of the notebook.
try:
    ind[1] = 0
except TypeError as err:
    print("TypeError:", err)
# This immutability makes it safer to share indices between multiple
# DataFrames and arrays, without the potential for side effects from
# inadvertent index modification.

## Index as ordered set
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

# Use the explicit set methods: since pandas 2.0 the &, | and ^ operators on
# Index objects are element-wise logical operations, not set operations.
ind_intersection = indA.intersection(indB)      # intersection
ind_union = indA.union(indB)                    # union
ind_symdiff = indA.symmetric_difference(indB)   # symmetric difference
ind_symdiff



5 (5,) 1 int64


Int64Index([1, 2, 9, 11], dtype='int64')