# Basic Data Processing with Pandas

## The Series Data Structure

In [2]:
import pandas as pd
# IPython help syntax: the trailing `?` shows the docstring for pd.Series.
pd.Series?

In [3]:
# A list of strings becomes a Series with a 0..2 integer index and dtype object.
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)

0    Tiger
1     Bear
2    Moose
dtype: object

In [4]:
# A list of integers produces an int64 Series.
numbers = [1, 2, 3]
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [5]:
# In a list of strings, None is kept as None and the dtype stays object.
animals = ['Tiger', 'Bear', None]
pd.Series(animals)

0    Tiger
1     Bear
2     None
dtype: object

In [6]:
# In a list of numbers, None is shown as NaN and the values become float64.
numbers = [1, 2, None]
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
import numpy as np
# NaN does not compare equal to None.
np.nan == None

False

In [8]:
# NaN does not compare equal to itself.
np.nan == np.nan

False

In [9]:
# np.isnan() is the way to test for NaN, since == comparisons with NaN give False.
np.isnan(np.nan)

True

In [10]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
sports = {'Archery': 'Bhutan',
         'Golf': 'Scotland',
         'Sumo': 'Japan',
         'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [11]:
# The index holds the labels, here the keys of the dict the Series was built from.
s.index

Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [12]:
# index= assigns explicit labels to the values, in the same order as the data list.
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s

India      Tiger
America     Bear
Canada     Moose
dtype: object

## Querying a Series

In [13]:
# Rebuild the sport -> country Series; the following cells query it.
sports = {'Archery': 'Bhutan',
         'Golf': 'Scotland',
         'Sumo': 'Japan',
         'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [17]:
# iloc selects by integer position; position 3 is the fourth entry.
s.iloc[3]

'South Korea'

In [18]:
# loc selects by index label.
s.loc['Golf']

'Scotland'

In [19]:
# Position-based lookup goes through iloc. Plain s[3] on a string-labelled
# Series relied on an integer-as-position fallback that pandas has deprecated.
s.iloc[3]

'South Korea'

In [20]:
# Indexing directly with a label gives the same value as s.loc['Golf'].
s['Golf']

'Scotland'

In [21]:
# A small Series of floats; the following cells total it in two different ways.
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s

0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [22]:
# Total the values with an explicit Python loop over the Series.
total = 0
for item in s:
    total += item
print(total)

324.0


In [23]:
import numpy as np

# The same total, computed with np.sum instead of a Python loop.
total = np.sum(s)
print(total)

324.0


In [24]:
# A larger Series of 10,000 random integers; head() previews the first five.
# No seed is set, so the values differ from run to run.
s = pd.Series(np.random.randint(0, 1000, 10000))
s.head()

0     96
1    569
2    739
3    677
4    220
dtype: int64

In [25]:
# Confirm the Series holds 10,000 elements.
len(s)

10000

In [26]:
%%timeit -n 100
# Timed: sum the Series one element at a time in a Python loop.
summary = 0
for item in s:
    summary += item

100 loops, best of 3: 1.03 ms per loop


In [27]:
%%timeit -n 100
# Timed: sum the same Series with a single np.sum call.
summary = np.sum(s)

100 loops, best of 3: 27.9 µs per loop


In [28]:
# += 2 adds 2 to every element of the Series in one operation.
s += 2
s.head()

0     98
1    571
2    741
3    679
4    222
dtype: int64

In [29]:
# Add 2 to every element one label at a time (the slow, element-wise way).
# Series.iteritems() and Series.set_value() no longer exist in current pandas;
# Series.items() and the .at accessor are their replacements.
for label, value in s.items():
    s.at[label] = value + 2
s.head()

0    100
1    573
2    743
3    681
4    224
dtype: int64

In [31]:
%%timeit -n 10
# Timed: build a fresh 10,000-element Series and add 2 to each value through
# a per-label .loc assignment.
# Series.iteritems() was removed in pandas 2.0; Series.items() is the equivalent.
s = pd.Series(np.random.randint(0, 1000, 10000))
for label, value in s.items():
    s.loc[label] = value + 2

10 loops, best of 3: 719 ms per loop


In [32]:
%%timeit -n 10
# Timed: build a fresh 10,000-element Series and add 2 with one whole-Series operation.
s = pd.Series(np.random.randint(0, 1000, 10000))
s += 2

10 loops, best of 3: 231 µs per loop


In [33]:
# Assigning to a label that is not in the index adds a new entry.
# Mixing the integer values with a string makes the dtype object.
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s

0             1
1             2
2             3
Animal    Bears
dtype: object

In [34]:
# Sports keyed by name, each mapped to a single country.
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})

# Four countries that all share the same index label, 'Cricket'.
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                     index=['Cricket',
                                            'Cricket',
                                            'Cricket',
                                            'Cricket'])

# Series.append() was removed in pandas 2.0; pd.concat() is the replacement.
# It returns a new Series and leaves both inputs unchanged.
all_countries = pd.concat([original_sports, cricket_loving_countries])

In [35]:
# original_sports still has only its four entries; building all_countries did not modify it.
original_sports

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [36]:
# Four values that all share the index label 'Cricket'.
cricket_loving_countries

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [37]:
# Index labels need not be unique: selecting 'Cricket' returns every matching entry as a Series.
all_countries['Cricket']

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

## The DataFrame Data Structure

In [3]:
import pandas as pd

# Three purchase records, each a Series keyed by field name.
purchase_1 = pd.Series({'Name': 'Chris', 'Items_Purchased': 'Dog Food', 'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn', 'Items_Purchased': 'Kitty Litter', 'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod', 'Items_Purchased': 'Bird Seed', 'Cost': 5.00})

# Each Series becomes one row of the frame; the row labels name the store
# and are allowed to repeat.
df = pd.DataFrame(
    data=[purchase_1, purchase_2, purchase_3],
    index=['Store 1', 'Store 1', 'Store 2'],
)

df

Unnamed: 0,Cost,Items_Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [4]:
# 'Store 2' labels exactly one row, so loc returns that row as a Series.
df.loc['Store 2']

Cost                       5
Items_Purchased    Bird Seed
Name                   Vinod
Name: Store 2, dtype: object

In [5]:
# Confirm that the single-row selection is a Series.
type(df.loc['Store 2'])

pandas.core.series.Series

In [6]:
# 'Store 1' labels two rows, so loc returns both of them.
df.loc['Store 1']

Unnamed: 0,Cost,Items_Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn


In [7]:
# loc also accepts a column: the 'Cost' values for the 'Store 1' rows.
df.loc['Store 1', 'Cost']

Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [8]:
# T transposes the frame: row labels become columns and column names become rows.
df.T

Unnamed: 0,Store 1,Store 1.1,Store 2
Cost,22.5,2.5,5
Items_Purchased,Dog Food,Kitty Litter,Bird Seed
Name,Chris,Kevyn,Vinod


In [9]:
# After transposing, 'Cost' is a row label that loc can select.
# Note the result's dtype is object rather than float64.
df.T.loc['Cost']

Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object

In [10]:
# Indexing the DataFrame directly with a column name selects that column.
df['Cost']

Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [12]:
# The selected column is a Series, so Series methods such as head() can be chained on it.
df['Cost'].head()

Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [13]:
# ':' selects every row; the list picks the 'Name' and 'Cost' columns, in that order.
df.loc[:,['Name', 'Cost']]

Unnamed: 0,Name,Cost
Store 1,Chris,22.5
Store 1,Kevyn,2.5
Store 2,Vinod,5.0


In [15]:
# Indexing with a list of column names gives the same result as df.loc[:, ['Name', 'Cost']].
df[['Name', 'Cost']]

Unnamed: 0,Name,Cost
Store 1,Chris,22.5
Store 1,Kevyn,2.5
Store 2,Vinod,5.0


In [16]:
# drop returns the frame without the 'Store 1' rows; the result is only displayed, not assigned.
df.drop('Store 1')


Unnamed: 0,Cost,Items_Purchased,Name
Store 2,5.0,Bird Seed,Vinod


In [17]:
# df still contains all three rows.
df

Unnamed: 0,Cost,Items_Purchased,Name
Store 1,22.5,Dog Food,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Bird Seed,Vinod


In [22]:
# Work on a copy of df; inplace=True makes drop modify copy_df itself
# instead of returning a new frame.
copy_df = df.copy()
copy_df.drop('Store 1',inplace=True)
copy_df

Unnamed: 0,Cost,Items_Purchased,Name
Store 2,5.0,Bird Seed,Vinod


In [23]:
# del removes the 'Name' column from copy_df.
del copy_df['Name']
copy_df

Unnamed: 0,Cost,Items_Purchased
Store 2,5.0,Bird Seed


In [24]:
# Assigning a scalar to a new column name adds that column, with the value (None here) in every row.
df['Location'] = None
df

Unnamed: 0,Cost,Items_Purchased,Name,Location
Store 1,22.5,Dog Food,Chris,
Store 1,2.5,Kitty Litter,Kevyn,
Store 2,5.0,Bird Seed,Vinod,
