# Python for Data Analysis
# Chapter 5: Getting started with pandas

Series

In [1]:
# !pip3 install pandas
import numpy as np
import pandas as pd
import string
# Two primary data types: Series, Dataframe
from pandas import Series, DataFrame

In [2]:
# A Series pairs a 1-D array of values with an index of labels, so it can be
# addressed much like a dict. With no index given, a default RangeIndex is used.
obj = pd.Series([1, 2, 3, 4, -6, -10])
print(obj, obj.values, obj.index, sep='\n')
print()
# An explicit index supplies custom labels; a list of labels selects several
# entries at once, in the order requested
ob2 = pd.Series([-3, 4, 52, -3], index=['a', 'b', 'c', 'd'])
print(ob2)
print(ob2.loc[['d', 'b']])


0     1
1     2
2     3
3     4
4    -6
5   -10
dtype: int64
[  1   2   3   4  -6 -10]
RangeIndex(start=0, stop=6, step=1)

a    -3
b     4
c    52
d    -3
dtype: int64
d   -3
b    4
dtype: int64


In [3]:
# Boolean masks and arithmetic work element-wise and keep the index labels

print(obj.loc[obj % 2 == 0])
new_obj = obj.mul(2)
print(new_obj)

# NumPy ufuncs accept a Series directly and hand back a Series with the same
# index. logical_not is all False here because obj contains no zeros.
print(np.ceil(obj), np.cos(obj), np.square(obj), np.logical_not(obj), sep='\n')

1     2
3     4
4    -6
5   -10
dtype: int64
0     2
1     4
2     6
3     8
4   -12
5   -20
dtype: int64
0     1.0
1     2.0
2     3.0
3     4.0
4    -6.0
5   -10.0
dtype: float64
0    0.540302
1   -0.416147
2   -0.989992
3   -0.653644
4    0.960170
5   -0.839072
dtype: float64
0      1
1      4
2      9
3     16
4     36
5    100
dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool


In [4]:
# Building a Series from a dict: an explicit index picks and orders the keys.
# A label that is not a key of the dict ('alf') comes back as NaN, which also
# turns the values into floats.

data = {'cat': 1, 'dog': 2, 'moon': -5, 'bar': 5}

series = pd.Series(data, index=['dog', 'cat', 'alf'])
print(series)
# `in` checks the index labels, so 'moon' (filtered out above) is not found
print('dog' in series, 'moon' in series, sep='\n')
print(series.isna())
print()

# Both the Series and its index can carry a name
series.index.name = 'index'
series.name = 'test'
print(series)

dog    2.0
cat    1.0
alf    NaN
dtype: float64
True
False
dog    False
cat    False
alf     True
dtype: bool

index
dog    2.0
cat    1.0
alf    NaN
Name: test, dtype: float64


DataFrame

In [5]:
# A dict of equal-length lists becomes a DataFrame, one column per key
data = {
    'Name': ['Bob', 'Tess', 'Zoe', 'James'],
    'Age': [35, 28, 34, 30],
    'Town': ['Brattleboro', 'West Ches', 'Brattleboro', 'BF'],
}
df = pd.DataFrame(data, index=[10, 11, 12, 34])
print(df)
print()
# NOTE: assigning to .columns only RELABELS the columns in place; the data
# does not move. From here on 'Town' holds the names, 'Name' holds the ages
# and 'Age' holds the towns. To reorder the columns together with their data,
# select them instead: df[['Town', 'Name', 'Age']].
df.columns = ['Town', 'Name', 'Age']
print(df)
# Indexing with a single column label returns that column as a Series
towns = df['Town']
print(towns)


     Name  Age         Town
10    Bob   35  Brattleboro
11   Tess   28    West Ches
12    Zoe   34  Brattleboro
34  James   30           BF

     Town  Name          Age
10    Bob    35  Brattleboro
11   Tess    28    West Ches
12    Zoe    34  Brattleboro
34  James    30           BF
10      Bob
11     Tess
12      Zoe
34    James
Name: Town, dtype: object


In [6]:
# Assigning to a new label adds a column; a scalar is broadcast to every row
df['None'] = None
# The column labelled 'Name' holds the ages at this point (the labels were
# reassigned when df was built), which is why comparing it with 30 works
df['gt_30'] = df['Name'].gt(30)
# One random integer in [0, 10) per row; no seed is set in this cell, so the
# values are not pinned between runs
df['random'] = np.random.randint(0, 10, size=len(df))


In [7]:
# The row index is a set-like array of labels, so `in` tests membership
print(df.index)
# Transposing swaps the axes: the column labels become the row index
print(df.transpose().index)
# 'Names' is not one of those labels (the column is called 'Name'), so this
# evaluates to False
'Names' in df.transpose().index

Int64Index([10, 11, 12, 34], dtype='int64')
Index(['Town', 'Name', 'Age', 'None', 'gt_30', 'random'], dtype='object')


False

In [8]:
# 4x4 block of standard-normal samples, labelled with out-of-order row labels
array = np.random.randn(4, 4)

# reindex() rearranges the rows into the requested label order; a label that
# does not exist yet ('e') is added as a row of NaN
dataf = pd.DataFrame(
    array,
    index=['b', 'c', 'a', 'd'],
    columns=['this', 'is', 'a', 'thing'],
).reindex(['a', 'b', 'c', 'd', 'e'])
dataf

Unnamed: 0,this,is,a,thing
a,0.834525,-0.294727,0.535929,-0.873831
b,0.966441,-0.107338,-1.026345,-0.438825
c,-0.392954,0.671513,0.652234,-0.183253
d,-0.050676,1.105939,0.999473,0.591251
e,,,,


In [9]:
# drop() with axis='columns' removes the COLUMN labelled 'a' (row 'a' is kept)
# and the result is displayed without being assigned back to dataf.
# The in-place row drop is left disabled:
# dataf.drop('e', inplace=True)
dataf.drop(labels=['a'], axis='columns')

Unnamed: 0,this,is,thing
a,0.834525,-0.294727,-0.873831
b,0.966441,-0.107338,-0.438825
c,-0.392954,0.671513,-0.183253
d,-0.050676,1.105939,0.591251
e,,,


In [10]:
# Boolean-mask filtering and assignment on a small integer frame
d1 = pd.DataFrame(np.arange(8).reshape(2, 4),
                  columns=['I', 'like', 'to', 'do'],
                  index=[2, 3])

# Indexing with a boolean frame keeps the matching cells and shows NaN elsewhere
print(d1[d1 % 2 == 0])
print(d1)
# A single label selects one column as a Series
print(d1['like'])
# Assigning a scalar to a column broadcasts it to every row
d1['like'] = 5
# A list of labels selects several columns as a DataFrame
print(d1[['I', 'to']])
print()

# Comparisons give a boolean frame, which can drive conditional updates
below_five = d1 < 5
print(below_five)
# Writing a string into integer columns relies on silent upcasting, which
# pandas deprecated in 2.1 (FutureWarning, later an error). Casting the frame
# to object first makes the masked assignment dtype-compatible on old and new
# pandas alike; the values and the printed result are unchanged.
d1 = d1.astype(object)
d1[below_five] = 'booger'
print(d1)

   I  like  to  do
2  0   NaN   2 NaN
3  4   NaN   6 NaN
   I  like  to  do
2  0     1   2   3
3  4     5   6   7
2    1
3    5
Name: like, dtype: int64
   I  to
2  0   2
3  4   6

      I   like     to     do
2  True  False   True   True
3  True  False  False  False
        I  like      to      do
2  booger     5  booger  booger
3  booger     5       6       7


Loc and iloc

In [22]:
# Axis labels for the .loc/.iloc demo. They were first drawn at random with
#   col = np.random.choice(list(string.printable), 4)
#   row = np.random.choice(list(string.printable), 3)
# but the cells below index by these exact labels ('n', 'B', 'p', 'Q', '#',
# '.'), so they are pinned here. The previous `col, row = col, row` raised
# NameError on a fresh kernel because nothing defined the names.
col = ['n', 'B', 'p', '?']
row = ['Q', '#', '.']

df2 = pd.DataFrame(np.arange(12).reshape(3, 4),
                   columns=col,
                   index=row)
print(df2)
print()
# A boolean Series built from one column filters whole rows
print(df2[df2['p'] > 2], end='\n\n')
print(df2, end='\n\n')



   n  B   p   ?
Q  0  1   2   3
#  4  5   6   7
.  8  9  10  11

   n  B   p   ?
#  4  5   6   7
.  8  9  10  11

   n  B   p   ?
Q  0  1   2   3
#  4  5   6   7
.  8  9  10  11



In [40]:
# .loc selects by axis label (.iloc, by contrast, selects by integer position)
print(df2.loc[:, 'n'])      # one column, as a Series
print(df2.loc['Q', :])      # one row, as a Series
print(df2.loc['Q':'#', :])  # label slice of rows; the end label is included
print()
print(df2.loc['Q', 'p'])    # row label + column label -> single value

print(df2)


Q    0
#    4
.    8
Name: n, dtype: int64
n    0
B    1
p    2
?    3
Name: Q, dtype: int64
   n  B  p  ?
Q  0  1  2  3
#  4  5  6  7

2
   n  B   p   ?
Q  0  1   2   3
#  4  5   6   7
.  8  9  10  11


In [52]:
# .iloc selects by integer position, whatever the labels are
print(df2.iloc[1, :])  # second row
print(df2.iloc[:, 1])  # second column

# Scalar accessors: .at takes labels (like .loc), .iat takes integer
# positions (like .iloc)
print(df2.at['.', 'B'], df2.iat[1, 2], sep='\n')


n    4
B    5
p    6
?    7
Name: #, dtype: int64
Q    1
#    5
.    9
Name: B, dtype: int64
9
6


AttributeError: '_iLocIndexer' object has no attribute 'getter'