<title>pandas Data Structures</title>

# Matering pandas

- by Femi Anthony
- Published by Packt Publishing, 2015

# ch03. pandas Data Structures



### Notes by Jesse
-  2017/08/16

### Concept

- Tour of numpy.ndarray data structure.
- The pandas.Series 1-dimensional (1D) pandas data structure
- The pandas.DatcaFrame 2-dimensional (2D) pandas tabular data structure
- The pandas.Panel 3-dimensional (3D) pandas data structure

## NumPy array creation via numpy.array
- The type numpy.ndarray, a homogenous multidimensional array
- Access to numerous mathematical functions – linear algebra, statistics, and so on
- Ability to integrate C, C++, and Fortran code

In [None]:
import numpy as np
import pandas as pd

from IPython.display import *

In [None]:
# NUMPY ARRAYS VIA NUMPY.ARRAY

ar1 = np.array([0,1,2,3]) # 1 dimensional array
ar1

In [None]:
ar2=np.array([[0,3,5],[2,8,7]]) # 2D array
display(ar2)

<b>Shape of the array</b>

In [None]:
ar2.shape

<b>Number of dimensions</b>

In [None]:
ar2.ndim

<h2>NumPy array creation via numpy.arange</h2>

In [None]:
ar3=np.arange(12);ar3

In [None]:
# start, end (exclusive)
ar4=np.arange(3,10,3);ar4

<h2>NumPy array creation via numpy.linspace</h2>

In [None]:
# args - start element,end element, number of elements
ar5=np.linspace(0,2.0/3,4); ar5

<h2>NumPy array via various other functions</h2>

<h3>numpy.ones</h3>

In [None]:
# Produces 2x3x2 array of 1's.
ar7=np.ones((2,3,2)); ar7

<h3>numpy.zeros</h3>

In [None]:
# Produce 4x2 array of zeros.
ar8=np.zeros((4,2));ar8

<h3>numpy.eye</h3>

In [None]:
# Produces identity matrix
ar9 = np.eye(3);ar9

In [None]:
f_ar = np.array([3,-2,8.18])
f_ar

<h3>numpy.diag</h3>

In [None]:
# Create diagonal array
ar10=np.diag((2,1,4,6));ar10

<h3>numpy.random.rand</h3>

In [None]:
# Using the rand, randn functions
# rand(m) produces uniformly distributed random numbers with range 0 to m
np.random.seed(100) # Set seed
ar11=np.random.rand(3); ar11

In [None]:
# randn(m) produces m normally distributed (Gaussian) random numbers
ar12=np.random.randn(5); ar12

<h3>numpy.empty</h3>

In [None]:
ar13=np.empty((3,2)); ar13

<h3>numpy.tile</h3>

In [None]:
np.array([[1,2],[6,7]])

In [None]:
np.tile(np.array([[1,2],[6,7]]),3)

In [None]:
np.tile(np.array([[1,2],[6,7]]),(2,2))

<h2>NumPy datatypes</h2>

In [None]:
ar=np.array([2,-1,6,3],dtype='float'); ar

In [None]:
ar.dtype

In [None]:
ar=np.array([2,4,6,8]); ar.dtype

In [None]:
ar=np.array([2.,4,6,8]); ar.dtype

In [None]:
sar=np.array(['Goodbye','Welcome','Tata','Goodnight']); sar.dtype

In [None]:
bar=np.array([True, False, True]); bar.dtype

In [None]:
f_ar = np.array([3,-2,8.18])
f_ar

In [None]:
f_ar.dtype

In [None]:
g_ar=f_ar.astype(int)

In [None]:
print(f_ar)
print(g_ar)

In [None]:
print(f_ar.dtype)
print(g_ar)

<h2>NumPy indexing and slicing</h2>

In [None]:
# print entire array, element 0, element 1, last element.
ar = np.arange(5); print (ar); ar[0], ar[1], ar[-1]

In [None]:
# 2nd, last and 1st elements
ar=np.arange(5); ar[1], ar[-1], ar[0]

In [None]:
# Reverse array using ::-1 idiom
ar=np.arange(5); ar[::-1]

In [None]:
# Index multi-dimensional array

In [None]:
ar = np.array([[2,3,4],[9,8,7],[11,12,13]]); ar

In [None]:
ar[1,1]

In [None]:
ar[1,1]=5; ar

In [None]:
ar[2]

In [None]:
ar[2,:]

In [None]:
ar[:,1]

In [None]:
ar = np.array([0,1,2])

In [None]:
ar[5]

<h2>Array slicing</h2>

In [None]:
ar=2*np.arange(6); ar

In [None]:
ar[1:5:2]

In [None]:
ar[1:6:2]

In [None]:
ar[:4]

In [None]:
ar[4:]

In [None]:
ar[::3]

In [None]:
ar

In [None]:
ar[:3]=1;ar

In [None]:
ar[2:]=np.ones(4);ar

<h2>Array masking</h2>

In [None]:
np.random.seed(10)
# ar=np.random.random_integers(0,25,10); ar
ar=np.random.randint(0, 25, 10); ar

In [None]:
np.random.randint?

In [None]:
evenMask=(ar % 2==0); evenMask

In [None]:
evenNums=ar[evenMask]; evenNums

In [None]:
ar=np.array(['Hungary','Nigeria',
'Guatemala','','Poland','','Japan']); ar

In [None]:
ar[ar=='']='USA'; ar

In [None]:
ar=11*np.arange(0,10); ar

In [None]:
# ***, vs. next
ar[[1,3,4,2,7]]

In [None]:
ar[1,3,4,2,7]

In [None]:
ar[1]

In [None]:
ar[[1,3]]=50; ar

<h2>Complex indexing</h2>

In [None]:
ar=np.arange(15); ar

In [None]:
ar2=np.arange(0,-10,-1)[::-1]; ar2

In [None]:
ar2=np.arange(-9,1,1);ar2

In [None]:
ar[:10]=ar2; ar

<h2>Copies and views</h2>

<b>Modifying view modifies original array</b>

In [None]:
ar1=np.arange(12); ar1

In [None]:
ar2=ar1[::2]; ar2

In [None]:
ar2[1]=-1; ar1

<b>Use np.copy to force a copy</b>

In [None]:
ar=np.arange(8);ar

In [None]:
arc=ar[:3].copy(); arc

In [None]:
arc[0]=-1; arc

In [None]:
ar

<h1>Operations</h1>
<h2>Basic Operations</h2>

<b>Element-wise</b>

In [98]:
ar=np.arange(0,7)*5; ar

array([ 0,  5, 10, 15, 20, 25, 30])

In [99]:
ar=np.arange(5) ** 4 ; ar

array([  0,   1,  16,  81, 256], dtype=int32)

In [100]:
ar ** 0.5

array([  0.,   1.,   4.,   9.,  16.])

In [107]:
ar=3+np.arange(0, 30,3); ar

array([ 3,  6,  9, 12, 15, 18, 21, 24, 27, 30])

In [108]:
ar2=np.arange(1,11); ar2

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [109]:
ar-ar2

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20])

In [110]:
ar/ar2

array([ 3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.])

In [111]:
ar*ar2

array([  3,  12,  27,  48,  75, 108, 147, 192, 243, 300])

<b>NumPy faster for this than Python</b>

In [112]:
ar=np.arange(1000)
%timeit ar**3

10000 loops, best of 3: 53.8 µs per loop


In [113]:
ar=range(1000)
%timeit [ar[i]**3 for i in ar]

The slowest run took 6.64 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 538 µs per loop


<b>Array multiplication is element wise</b>

In [114]:
ar=np.array([[1,1],[1,1]]); ar

array([[1, 1],
       [1, 1]])

In [115]:
ar2=np.array([[2,2],[2,2]]); ar2

array([[2, 2],
       [2, 2]])

In [116]:
ar.dot(ar2)

array([[4, 4],
       [4, 4]])

In [117]:
ar*ar2

array([[2, 2],
       [2, 2]])

<b>Comparison and logical operations are also elememt-wise</b>

In [118]:
ar=np.arange(1,5); ar

array([1, 2, 3, 4])

In [119]:
ar2=np.arange(5,1,-1);ar2

array([5, 4, 3, 2])

In [120]:
ar < ar2

array([ True,  True, False, False], dtype=bool)

In [121]:
l1 = np.array([True,False,True,False])
l2 = np.array([False,False,True, False])
np.logical_and(l1,l2)

array([False, False,  True, False], dtype=bool)

<b>Other operations are also element-wise</b>

In [122]:
ar=np.array([np.pi, np.pi/2]); np.sin(ar)

array([  1.22464680e-16,   1.00000000e+00])

<b>For element-wise operations, the 2 arrays must be the same shape else an error results</b>

In [123]:
ar=np.arange(0,6); ar

array([0, 1, 2, 3, 4, 5])

In [124]:
ar2=np.arange(0,8); ar2

array([0, 1, 2, 3, 4, 5, 6, 7])

In [125]:
ar*ar2

ValueError: operands could not be broadcast together with shapes (6,) (8,) 

In [129]:
ar.shape

(6,)

<b>NumPy arrays can be transposed</b>

In [130]:
ar=np.array([[1,2,3],[4,5,6]]); ar

array([[1, 2, 3],
       [4, 5, 6]])

In [131]:
ar.T

array([[1, 4],
       [2, 5],
       [3, 6]])

In [132]:
np.transpose(ar)

array([[1, 4],
       [2, 5],
       [3, 6]])

In [134]:
%timeit ar.T
%timeit np.transpose(ar)

The slowest run took 46.31 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 177 ns per loop
The slowest run took 16.34 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 427 ns per loop


<b>Compare arrays not element-wise but array-wise</b>

In [135]:
ar=np.arange(0,6)
ar2=np.array([0,1,2,3,4,5])
np.array_equal(ar, ar2)

True

In [136]:
np.all(ar==ar2)

True

<h1>Reduction Operations</h1>

In [144]:
ar=np.arange(1,5)
ar.prod()

24

In [145]:
np.prod?

In [146]:
np.prod(ar)

24

In [147]:
%timeit ar.prod()
%timeit np.prod(ar)

The slowest run took 17.03 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 1.61 µs per loop
The slowest run took 10.10 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.6 µs per loop


In [148]:
ar=np.array([np.arange(1,6),np.arange(1,6)]);ar

array([[1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5]])

In [149]:
#Columns
np.prod(ar,axis=0)

array([ 1,  4,  9, 16, 25])

In [150]:
#Rows
np.prod(ar,axis=1)

array([120, 120])

In [151]:
ar=np.array([[2,3,4],[5,6,7],[8,9,10]]); ar.sum()

54

In [152]:
ar.mean()

6.0

In [153]:
np.median(ar)

6.0

<h1>Statistical operators</h1>

In [154]:
np.random.seed(10)
ar=np.random.randint(0,10, size=(4,5));ar

array([[9, 4, 0, 1, 9],
       [0, 1, 8, 9, 0],
       [8, 6, 4, 3, 0],
       [4, 6, 8, 1, 8]])

In [155]:
ar.mean()

4.4500000000000002

In [156]:
ar.std()

3.4274626183227732

In [157]:
ar.var(axis=0) #across rows

array([ 12.6875,   4.1875,  11.    ,  10.75  ,  18.1875])

In [158]:
ar.cumsum()

array([ 9, 13, 13, 14, 23, 23, 24, 32, 41, 41, 49, 55, 59, 62, 62, 66, 72,
       80, 81, 89], dtype=int32)

<h1>Logical operators</h1>

In [159]:
np.random.seed(100)
ar=np.random.randint(1,10, size=(4,4));ar

array([[9, 9, 4, 8],
       [8, 1, 5, 3],
       [6, 3, 3, 3],
       [2, 1, 9, 5]])

In [160]:
np.any((ar%7)==0)

False

In [161]:
np.all(ar<11)

True

<h1>Broadcasting</h1>

In [162]:
ar=np.ones([3,2]); ar

array([[ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.]])

In [163]:
ar2=np.array([2,3]); ar2

array([2, 3])

In [164]:
ar+ar2

array([[ 3.,  4.],
       [ 3.,  4.],
       [ 3.,  4.]])

<b>Broadcasting works across dimensions</b>

In [165]:
ar=np.array([[23,24,25]]); ar

array([[23, 24, 25]])

In [166]:
ar.T

array([[23],
       [24],
       [25]])

In [167]:
ar.T+ar

array([[46, 47, 48],
       [47, 48, 49],
       [48, 49, 50]])

<h1>Array shape manipulation<h1>
<h2>Flattening a multi-dimensional array<h2>

In [168]:
ar=np.array([np.arange(1,6), np.arange(10,15)]); ar

array([[ 1,  2,  3,  4,  5],
       [10, 11, 12, 13, 14]])

In [169]:
ar.ravel()

array([ 1,  2,  3,  4,  5, 10, 11, 12, 13, 14])

In [171]:
ar

array([[ 1,  2,  3,  4,  5],
       [10, 11, 12, 13, 14]])

In [170]:
ar.T.ravel()

array([ 1, 10,  2, 11,  3, 12,  4, 13,  5, 14])

<h2>Reshaping<h2>

In [172]:
ar=np.arange(1,16);ar

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [173]:
ar.reshape(3,5)

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

<h2>Resizing</h2>

In [174]:
ar=np.arange(5); ar.resize((8,));ar

array([0, 1, 2, 3, 4, 0, 0, 0])

In [175]:
ar=np.arange(5);
ar

array([0, 1, 2, 3, 4])

<b>resize() only works if no otehr references to array, else error</b>

In [176]:
ar2=ar

In [177]:
ar.resize((8,));

ValueError: cannot resize an array that references or is referenced
by another array in this way.  Use the resize function

<b>Workaround is to use numpy.ndarray.resize() instead</b>

In [178]:
np.resize(ar,(8,))

array([0, 1, 2, 3, 4, 0, 1, 2])

<h2>Adding a dimension</h2>

In [179]:
ar=np.array([14,15,16]); ar.shape

(3,)

In [180]:
ar

array([14, 15, 16])

In [181]:
ar=ar[:, np.newaxis]; ar.shape

(3, 1)

In [182]:
ar

array([[14],
       [15],
       [16]])

In [183]:
ar=ar[:, np.newaxis]; ar.shape

(3, 1, 1)

<h1>Array sorting</h1>

<b>Along y-axis<b>

In [184]:
ar=np.array([[3,2],[10,-1]])
ar

array([[ 3,  2],
       [10, -1]])

In [185]:
ar.sort(axis=1)
ar

array([[ 2,  3],
       [-1, 10]])

<b>Along x-axis</b>

In [186]:
ar=np.array([[3,2],[10,-1]])
ar

array([[ 3,  2],
       [10, -1]])

In [187]:
ar.sort(axis=0)
ar

array([[ 3, -1],
       [10,  2]])

<h1>Data structures in pandas<h1>

<h1>Series</h1>

<h2>Series creation</h2>

<h2>Using numpy.ndarray</h2>

In [188]:
import numpy as np
np.random.seed(100)
ser=pd.Series(np.random.rand(7)); ser

0    0.543405
1    0.278369
2    0.424518
3    0.844776
4    0.004719
5    0.121569
6    0.670749
dtype: float64

In [189]:
import calendar as cal
monthNames=[cal.month_name[i] for i in np.arange(1,6)]
months=pd.Series(np.arange(1,6),index=monthNames);months

January     1
February    2
March       3
April       4
May         5
dtype: int32

In [190]:
months.index

Index(['January', 'February', 'March', 'April', 'May'], dtype='object')

<h2>Using Python dictionary</h2>

In [191]:
currDict={'US' : 'dollar', 'UK' : 'pound',
'Germany': 'euro', 'Mexico':'peso',
'Nigeria':'naira',
'China':'yuan', 'Japan':'yen'}
currSeries=pd.Series(currDict); currSeries

China        yuan
Germany      euro
Japan         yen
Mexico       peso
Nigeria     naira
UK          pound
US         dollar
dtype: object

In [193]:
stockPrices = {'GOOG':1180.97,'FB':62.57,
'TWTR': 64.50, 'AMZN':358.69,
'AAPL':500.6}
stockPriceSeries=pd.Series(stockPrices,
index=['GOOG','FB','YHOO',
'TWTR','AMZN','AAPL'],
name='stockPrices')
stockPriceSeries

GOOG    1180.97
FB        62.57
YHOO        NaN
TWTR      64.50
AMZN     358.69
AAPL     500.60
Name: stockPrices, dtype: float64

<h2>Using scalar values</h2>

In [194]:
dogSeries=pd.Series('chihuahua',
index=['breed','countryOfOrigin',
'name', 'gender'])
dogSeries

breed              chihuahua
countryOfOrigin    chihuahua
name               chihuahua
gender             chihuahua
dtype: object

In [195]:
dogSeries=pd.Series('pekingese'); dogSeries

0    pekingese
dtype: object

In [196]:
type(dogSeries)

pandas.core.series.Series

<h1>Operations on Series</h1>

<h2>Assignment</h2>

In [197]:
currDict['China']

'yuan'

In [198]:
stockPriceSeries['GOOG']=1200.0
stockPriceSeries

GOOG    1200.00
FB        62.57
YHOO        NaN
TWTR      64.50
AMZN     358.69
AAPL     500.60
Name: stockPrices, dtype: float64

<b>KeyError is raised if you try to retrieve a missing label</b>

In [None]:
# stockPriceSeries['MSFT']

<b>Use get() to avoid this</b>

In [200]:
stockPriceSeries.get('MSFT',np.NaN)

nan

<h2>Slicing</h2>

In [201]:
stockPriceSeries[:4]

GOOG    1200.00
FB        62.57
YHOO        NaN
TWTR      64.50
Name: stockPrices, dtype: float64

In [202]:
stockPriceSeries[stockPriceSeries > 100]



GOOG    1200.00
AMZN     358.69
AAPL     500.60
Name: stockPrices, dtype: float64

<h2>Other operations</h2>

In [203]:
np.mean(stockPriceSeries)

437.27200000000005

In [204]:
np.std(stockPriceSeries)

417.4446361087899

<b>Element-wise operations</b>

In [205]:
ser

0    0.543405
1    0.278369
2    0.424518
3    0.844776
4    0.004719
5    0.121569
6    0.670749
dtype: float64

In [206]:
ser*ser

0    0.295289
1    0.077490
2    0.180215
3    0.713647
4    0.000022
5    0.014779
6    0.449904
dtype: float64

In [207]:
np.sqrt(ser)

0    0.737160
1    0.527607
2    0.651550
3    0.919117
4    0.068694
5    0.348668
6    0.818993
dtype: float64

<b>Data ia automatically aligned on basis of the label</b>

In [208]:
ser[1:]

1    0.278369
2    0.424518
3    0.844776
4    0.004719
5    0.121569
6    0.670749
dtype: float64

In [209]:
ser[1:] + ser[:-2]

0         NaN
1    0.556739
2    0.849035
3    1.689552
4    0.009438
5         NaN
6         NaN
dtype: float64

<h1>DataFrame</h1>


<h1>DataFrame Creation</h1>


<h2>Using dictionaries of Series</h2>

In [214]:
import pprint as pp

In [217]:
stockSummaries={'AMZN': pd.Series([346.15,0.59,459,0.52,589.8,158.88],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'Beta', 'P/E','Market Cap(B)']),
                'GOOG': pd.Series([1133.43,36.05,335.83,0.87,31.44,380.64],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'Beta','P/E','Market Cap(B)']),
                  'FB': pd.Series([61.48,0.59,2450,104.93,150.92],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'P/E', 'Market Cap(B)']),
                'YHOO': pd.Series([34.90,1.27,1010,27.48,0.66,35.36],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'P/E','Beta', 'Market Cap(B)']),
               'TWTR':pd.Series([65.25,-0.3,555.2,36.23],
                                  index=['Closing price','EPS','Shares Outstanding(M)',
                                         'Market Cap(B)']),
               'AAPL':pd.Series([501.53,40.32,892.45,12.44,447.59,0.84],
                                  index=['Closing price','EPS','Shares Outstanding(M)','P/E',
                                         'Market Cap(B)','Beta'])}

print(stockSummaries)

{'AAPL': Closing price            501.53
EPS                       40.32
Shares Outstanding(M)    892.45
P/E                       12.44
Market Cap(B)            447.59
Beta                       0.84
dtype: float64, 'TWTR': Closing price             65.25
EPS                       -0.30
Shares Outstanding(M)    555.20
Market Cap(B)             36.23
dtype: float64, 'GOOG': Closing price            1133.43
EPS                        36.05
Shares Outstanding(M)     335.83
Beta                        0.87
P/E                        31.44
Market Cap(B)             380.64
dtype: float64, 'AMZN': Closing price            346.15
EPS                        0.59
Shares Outstanding(M)    459.00
Beta                       0.52
P/E                      589.80
Market Cap(B)            158.88
dtype: float64, 'YHOO': Closing price              34.90
EPS                         1.27
Shares Outstanding(M)    1010.00
P/E                        27.48
Beta                        0.66
Market Cap(B)       

In [211]:
stockDF=pd.DataFrame(stockSummaries); stockDF


Unnamed: 0,AAPL,AMZN,FB,GOOG,TWTR,YHOO
Beta,0.84,0.52,,0.87,,0.66
Closing price,501.53,346.15,61.48,1133.43,65.25,34.9
EPS,40.32,0.59,0.59,36.05,-0.3,1.27
Market Cap(B),447.59,158.88,150.92,380.64,36.23,35.36
P/E,12.44,589.8,104.93,31.44,,27.48
Shares Outstanding(M),892.45,459.0,2450.0,335.83,555.2,1010.0


In [212]:
stockDF=pd.DataFrame(stockSummaries,
index=['Closing price','EPS',
       'Shares Outstanding(M)',
    'P/E', 'Market Cap(B)','Beta']);stockDF

Unnamed: 0,AAPL,AMZN,FB,GOOG,TWTR,YHOO
Closing price,501.53,346.15,61.48,1133.43,65.25,34.9
EPS,40.32,0.59,0.59,36.05,-0.3,1.27
Shares Outstanding(M),892.45,459.0,2450.0,335.83,555.2,1010.0
P/E,12.44,589.8,104.93,31.44,,27.48
Market Cap(B),447.59,158.88,150.92,380.64,36.23,35.36
Beta,0.84,0.52,,0.87,,0.66


In [218]:
stockDF=pd.DataFrame(stockSummaries,
        index=['Closing price','EPS',
               'Shares Outstanding(M)',
               'P/E', 'Market Cap(B)','Beta'],
        columns=['FB','TWTR','SCNW'])
stockDF

Unnamed: 0,FB,TWTR,SCNW
Closing price,61.48,65.25,
EPS,0.59,-0.3,
Shares Outstanding(M),2450.0,555.2,
P/E,104.93,,
Market Cap(B),150.92,36.23,
Beta,,,


In [219]:
stockDF.index

Index(['Closing price', 'EPS', 'Shares Outstanding(M)', 'P/E', 'Market Cap(B)',
       'Beta'],
      dtype='object')

In [220]:
stockDF.columns

Index(['FB', 'TWTR', 'SCNW'], dtype='object')

<h2>Using a dictionary of ndarrays/lists</h2>

In [221]:
algos={'search':['DFS','BFS','Binary Search','Linear','ShortestPath (Djikstra)'],
      'sorting': ['Quicksort','Mergesort', 'Heapsort','Bubble Sort', 'Insertion Sort'],
      'machine learning':['RandomForest','K Nearest Neighbor','Logistic Regression',
                          'K-Means Clustering','Linear Regression']}
algoDF=pd.DataFrame(algos); algoDF

Unnamed: 0,machine learning,search,sorting
0,RandomForest,DFS,Quicksort
1,K Nearest Neighbor,BFS,Mergesort
2,Logistic Regression,Binary Search,Heapsort
3,K-Means Clustering,Linear,Bubble Sort
4,Linear Regression,ShortestPath (Djikstra),Insertion Sort


In [222]:
pd.DataFrame(algos,index=['algo_1','algo_2','algo_3','algo_4','algo_5'])

Unnamed: 0,machine learning,search,sorting
algo_1,RandomForest,DFS,Quicksort
algo_2,K Nearest Neighbor,BFS,Mergesort
algo_3,Logistic Regression,Binary Search,Heapsort
algo_4,K-Means Clustering,Linear,Bubble Sort
algo_5,Linear Regression,ShortestPath (Djikstra),Insertion Sort


<h2>Using a structured array</h2>

In [223]:
memberData = np.zeros((4,),
             dtype=[('Name','a15'),
                    ('Age','i4'),
                   ('Weight','f4')])
memberData[:] = [('Sanjeev',37,162.4),
                 ('Yingluck',45,137.8),
                 ('Emeka',28,153.2),
                 ('Amy',67,101.3)]
memberDF=pd.DataFrame(memberData);memberDF


Unnamed: 0,Name,Age,Weight
0,b'Sanjeev',37,162.399994
1,b'Yingluck',45,137.800003
2,b'Emeka',28,153.199997
3,b'Amy',67,101.300003


In [224]:
pd.DataFrame(memberData, index=['a','b','c','d'])

Unnamed: 0,Name,Age,Weight
a,b'Sanjeev',37,162.399994
b,b'Yingluck',45,137.800003
c,b'Emeka',28,153.199997
d,b'Amy',67,101.300003


<h2>Using a Series structure</h2>

In [225]:
currSeries.name='currency'
pd.DataFrame(currSeries)

Unnamed: 0,currency
China,yuan
Germany,euro
Japan,yen
Mexico,peso
Nigeria,naira
UK,pound
US,dollar


<h1>DataFrame Operations</h1>

<h2>Selection</h2>

In [226]:
memberDF['Name']

0     b'Sanjeev'
1    b'Yingluck'
2       b'Emeka'
3         b'Amy'
Name: Name, dtype: object

<h2>Assignment</h2>

In [227]:
memberDF['Height']=60;memberDF

Unnamed: 0,Name,Age,Weight,Height
0,b'Sanjeev',37,162.399994,60
1,b'Yingluck',45,137.800003,60
2,b'Emeka',28,153.199997,60
3,b'Amy',67,101.300003,60


<h2>Deletion</h2>

In [228]:
del memberDF['Height']; memberDF

Unnamed: 0,Name,Age,Weight
0,b'Sanjeev',37,162.399994
1,b'Yingluck',45,137.800003
2,b'Emeka',28,153.199997
3,b'Amy',67,101.300003


In [229]:
memberDF['BloodType']='O'
bloodType=memberDF.pop('BloodType'); bloodType

0    O
1    O
2    O
3    O
Name: BloodType, dtype: object

<h2>Insertion</h2>

In [230]:
memberDF.insert(2,'isSenior',memberDF['Age']>60);
memberDF

Unnamed: 0,Name,Age,isSenior,Weight
0,b'Sanjeev',37,False,162.399994
1,b'Yingluck',45,False,137.800003
2,b'Emeka',28,False,153.199997
3,b'Amy',67,True,101.300003


<h2>Alignment</h2>

In [231]:
ore1DF=pd.DataFrame(np.array([[20,35,25,20],
                              [11,28,32,29]]),
                    columns=['iron','magnesium',
                             'copper','silver'])
ore2DF=pd.DataFrame(np.array([[14,34,26,26],
                              [33,19,25,23]]),
                    columns=['iron','magnesium',
                            'gold','silver'])
ore1DF+ore2DF

Unnamed: 0,copper,gold,iron,magnesium,silver
0,,,34,69,46
1,,,44,47,52


In [232]:
ore1DF + pd.Series([25,25,25,25],
index=['iron','magnesium','copper','silver'])

Unnamed: 0,iron,magnesium,copper,silver
0,45,60,50,45
1,36,53,57,54


<h2>Other mathematical operations</h2>

In [233]:
np.sqrt(ore1DF)

Unnamed: 0,iron,magnesium,copper,silver
0,4.472136,5.91608,5.0,4.472136
1,3.316625,5.291503,5.656854,5.385165


<h1>Panel</h1>
<h1>Panel Creation</h1>

<h2>Using 3D NumPy array with axis labels</h2>

In [234]:
stockData=np.array([[[63.03,61.48,75],
                     [62.05,62.75,46],
                     [62.74,62.19,53]],
                   [[411.90, 404.38, 2.9],
                    [405.45, 405.91, 2.6],
                    [403.15, 404.42, 2.4]]])
stockData

array([[[  63.03,   61.48,   75.  ],
        [  62.05,   62.75,   46.  ],
        [  62.74,   62.19,   53.  ]],

       [[ 411.9 ,  404.38,    2.9 ],
        [ 405.45,  405.91,    2.6 ],
        [ 403.15,  404.42,    2.4 ]]])

In [235]:
stockHistoricalPrices = pd.Panel(stockData,
                                 items=['FB', 'NFLX'],
                                 major_axis=pd.date_range('2/3/2014',periods=3),
                                 minor_axis=['open price', 'closing price', 'volume'])
stockHistoricalPrices


<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 3 (major_axis) x 3 (minor_axis)
Items axis: FB to NFLX
Major_axis axis: 2014-02-03 00:00:00 to 2014-02-05 00:00:00
Minor_axis axis: open price to volume

<h2>Using a Python dictionary of DataFrame objects</h2>

In [236]:
USData=pd.DataFrame(np.array([[249.62 , 8900],
                              [282.16,12680],
                              [309.35,14940]]),
                    columns=['Population(M)','GDP($B)'],
                    index=[1990,2000,2010])
USData

Unnamed: 0,Population(M),GDP($B)
1990,249.62,8900.0
2000,282.16,12680.0
2010,309.35,14940.0


In [237]:
ChinaData=pd.DataFrame(np.array([[1133.68, 390.28],
                                 [1266.83,1198.48],
                                 [1339.72, 6988.47]]),
                       columns=['Population(M)','GDP($B)'],
                       index=[1990,2000,2010])
ChinaData

Unnamed: 0,Population(M),GDP($B)
1990,1133.68,390.28
2000,1266.83,1198.48
2010,1339.72,6988.47


In [238]:
US_ChinaData={'US' : USData,
              'China': ChinaData}
pd.Panel(US_ChinaData)

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 3 (major_axis) x 2 (minor_axis)
Items axis: China to US
Major_axis axis: 1990 to 2010
Minor_axis axis: Population(M) to GDP($B)

<h2>Using the DataFrame.to_panel method</h2>

In [239]:
mIdx = pd.MultiIndex(levels=[['US', 'China'],
                             [1990,2000, 2010]],
                     labels=[[1,1,1,0,0,0],[0,1,2,0,1,2]])
mIdx

MultiIndex(levels=[['US', 'China'], [1990, 2000, 2010]],
           labels=[[1, 1, 1, 0, 0, 0], [0, 1, 2, 0, 1, 2]])

In [240]:
ChinaUSDF = pd.DataFrame({'Population(M)' : [1133.68, 1266.83, 1339.72, 
                                                        249.62, 282.16,309.35], 
                                     'GDB($B)': [390.28, 1198.48, 6988.47, 8900,12680,14940]}, 
                          index=mIdx)
ChinaUSDF


Unnamed: 0,Unnamed: 1,GDB($B),Population(M)
China,1990,390.28,1133.68
China,2000,1198.48,1266.83
China,2010,6988.47,1339.72
US,1990,8900.0,249.62
US,2000,12680.0,282.16
US,2010,14940.0,309.35


In [241]:
ChinaUSDF.to_panel()


<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 2 (major_axis) x 3 (minor_axis)
Items axis: GDB($B) to Population(M)
Major_axis axis: US to China
Minor_axis axis: 1990 to 2010