# Combining, Relating, and Reshaping data

In [None]:
# import numpy and pandas
import numpy as np
import pandas as pd

## Concatenating data

In [None]:
# build the two Series that the following cells concatenate:
# s1 holds 0, 1, 2 and s2 holds 5, 6, 7 (both get the default 0..2 index)
s1, s2 = (pd.Series(np.arange(first, first + 3)) for first in (0, 5))
s1

In [None]:
# display the second Series (values 5, 6, 7)
s2

In [None]:
# concatenate the two Series end to end; each keeps its own
# labels, so 0, 1, 2 appear twice in the result's index
pd.concat([s1, s2])

In [None]:
# build two 3x3 DataFrames that share index labels (0..2) and
# column names (a, b, c) but hold different values:
# df1 holds 0..8 and df2 holds 9..17
df1, df2 = (
    pd.DataFrame(np.arange(first, first + 9).reshape(3, 3),
                 columns=list('abc'))
    for first in (0, 9)
)
df1

In [None]:
# display the second DataFrame (values 9..17, columns a, b, c)
df2

In [None]:
# concatenate the two DataFrames row-wise (the default axis=0)
# each keeps its row labels, so 0, 1, 2 appear twice in the result
pd.concat([df1, df2])

In [None]:
# rebuild the two 3x3 DataFrames, this time with different
# column sets: df1 has a, b, c (values 0..8) and df2 has
# a, c, d (values 9..17)
df1, df2 = (
    pd.DataFrame(np.arange(first, first + 9).reshape(3, 3),
                 columns=list(names))
    for first, names in ((0, 'abc'), (9, 'acd'))
)
df1

In [None]:
# display the second DataFrame (columns a, c, d)
df2

In [None]:
# concatenate row-wise; the result has the union of the columns
# rows from df1 get NaN in column d and rows from df2 get NaN
# in column b, since that column is missing from their source
pd.concat([df1, df2])

In [None]:
# concatenate row-wise and add an outer index level built from
# the given keys: 'df1' labels the rows that came from df1 and
# 'df2' labels the rows that came from df2
c = pd.concat([df1, df2], keys=['df1', 'df2'])
# each row is now labelled by (key, original row label)
c

In [None]:
# the outer index level holds the keys, so selecting 'df2'
# returns only the rows that came from the second DataFrame
c.loc['df2']

### Switching axes of alignment

In [None]:
# place df2's columns to the right of df1's, matching rows by
# their labels; a and c exist in both sources, so those column
# names appear twice in the result
pd.concat([df1, df2], axis='columns')

In [None]:
# a third DataFrame to concatenate with df1
# it shares a single row label (2) and a single column (a)
# with df1; row labels 3 and 4 and column d exist only here,
# while b and c exist only in df1
df3 = pd.DataFrame(np.arange(20, 26).reshape(3, 2),
                   index=[2, 3, 4],
                   columns=['a', 'd'])
df3

In [None]:
# column-wise concat of df1 and df3, aligned on row labels
# the result lists df1's columns first, then df3's (a shows up
# twice), over the union of the row labels; a cell is NaN when
# its source has no row with that label
pd.concat([df1, df3], axis='columns')

### Specifying join type

In [None]:
# keep only the row labels present in both sources (inner join)
# instead of their union; df1 and df3 share just label 2, so
# the result has a single row
pd.concat([df1, df3], join='inner', axis='columns')

In [None]:
# column-wise concat with keys: each key becomes the outer
# level of the column index for the columns from that source
df = pd.concat([df1, df2], keys=['df1', 'df2'], axis='columns')
df

In [None]:
# the keys form the outer level of the column index, so this
# selects only the columns that came from the DataFrame
# concatenated under key 'df2'
df.loc[:, 'df2']

### Appending versus concatenation

In [None]:
# DataFrame.append was removed in pandas 2.0; it was a
# concatenation along axis=0, which pd.concat performs directly
# duplicate row index labels can result
pd.concat([df1, df2])

In [None]:
# same row-wise concatenation, but discard the source labels
# and renumber the result from 0 so no label is duplicated
# (pd.concat is used because DataFrame.append was removed in
# pandas 2.0)
pd.concat([df1, df2], ignore_index=True)

## An overview of merges

In [None]:
# our customers: one row per customer, identified by CustomerID
customers = pd.DataFrame({
    'CustomerID': [10, 11],
    'Name': ['Mike', 'Marcia'],
    'Address': ['Address for Mike', 'Address for Marcia'],
})
customers

In [None]:
# and these are the orders made by our customers
# they are related to customers by CustomerID
# date is imported here because the notebook's import cell
# only loads numpy and pandas
from datetime import date

orders = pd.DataFrame({'CustomerID': [10, 11, 10],
                       'OrderDate': [date(2014, 12, 1),
                                     date(2014, 12, 1),
                                     date(2014, 12, 1)]})
orders

In [None]:
# pair every order with its customer's details so the items
# can be shipped; with no columns named, the merge uses the
# one column the two frames share, CustomerID
pd.merge(customers, orders)

In [None]:
# sample frames for the merge and join examples in this section
# they agree on key1 in every row but on key2 only in the a and
# c rows, and their row labels overlap on 1 and 2 only
left_data = dict(key1=['a', 'b', 'c'],
                 key2=['x', 'y', 'z'],
                 lval1=[0, 1, 2])
right_data = dict(key1=['a', 'b', 'c'],
                  key2=['x', 'a', 'z'],
                  rval1=[6, 7, 8])
left = pd.DataFrame(data=left_data, index=[0, 1, 2])
right = pd.DataFrame(data=right_data, index=[1, 2, 3])
left

In [None]:
# display the right-hand DataFrame (row labels 1, 2, 3)
right

In [None]:
# merge with no columns named: pandas uses every column the
# two frames have in common (key1 and key2 here) as the key
pd.merge(left, right)

In [None]:
# merge on one explicitly named column
# a column passed to on= must exist in both DataFrame objects
pd.merge(left, right, on='key1')

In [None]:
# name both key columns explicitly; a row pair matches only
# when key1 and key2 are both equal
pd.merge(left, right, on=['key1', 'key2'])

In [None]:
# match rows by the row index labels of both DataFrame objects
# rather than by column values
left.merge(right, left_index=True, right_index=True)

### Specifying the join semantics of a merge operation

In [None]:
# outer join: keep matched rows plus every unmatched row from
# either side, with NaN in the columns the other side would
# have supplied
pd.merge(left, right, how='outer')

In [None]:
# left join: keep every row of left; rval1 is NaN where right
# has no row with the same key1 and key2
# result rows 0 and 2 matched on both keys
# result row 1 (b, y) comes from left only
pd.merge(left, right, how='left')

In [None]:
# right join: keep every row of right; lval1 is NaN where left
# has no row with the same key1 and key2
# result rows 0 and 2 matched on both keys
# result row 1 (b, a) comes from right only
pd.merge(left, right, how='right')

In [None]:
# join left with right on their row index labels
# DataFrame.join defaults to a left join, spelled out here
# with how='left'
# both frames have key1 and key2 columns, so lsuffix and
# rsuffix are needed to tell the two copies apart
left.join(right, how='left', lsuffix='_left', rsuffix='_right')

In [None]:
# inner join on the row index labels: only labels present in
# both frames (1 and 2) survive
left.join(right, how='inner', lsuffix='_left', rsuffix='_right')

## Pivoting

In [None]:
# read in accelerometer data
# (later cells use its interval, axis, and reading columns)
sensor_readings = pd.read_csv("data/accel.csv")
sensor_readings

In [None]:
# keep only the rows whose axis column equals 'Y'
sensor_readings.loc[sensor_readings['axis'] == 'Y']

In [None]:
# pivot the long-format readings into a wide table: one row per
# interval, one column per distinct axis value, with the
# reading column supplying the cell values
sensor_readings.pivot(index='interval', 
                     columns='axis', 
                     values='reading')

## Stacking using non-hierarchical indexes

In [None]:
# simple DataFrame with one column
# the row labels are given as a list: a set has no defined
# order, so it would pair 'one' and 'two' with the values
# arbitrarily from run to run
df = pd.DataFrame({'a': [1, 2]}, index=['one', 'two'])
df

In [None]:
# move the column labels into an inner level of the row index
# the result is a Series whose values are addressed by a
# two-level (row label, column label) index
stacked1 = df.stack()
stacked1

In [None]:
# look up row 'one', column 'a' with a single
# (row label, column label) tuple
stacked1[('one', 'a')]

In [None]:
# DataFrame with two columns
# the row labels are given as a list: a set has no defined
# order, so it would pair 'one' and 'two' with the rows
# arbitrarily from run to run
df = pd.DataFrame({'a': [1, 2],
                   'b': [3, 4]},
                  index=['one', 'two'])
df

In [None]:
# stack both columns into one inner index level: each
# (row label, column label) pair becomes one entry of the
# resulting Series
stacked2 = df.stack()
stacked2

In [None]:
# look up the value for row 'one', column 'b' with a single tuple
stacked2[('one', 'b')]

## Unstacking using hierarchical indexes

In [None]:
# one independent copy of the sensor data per user
user1, user2 = sensor_readings.copy(), sensor_readings.copy()
# tag each copy with its user's name in a new 'who' column
user1['who'], user2['who'] = 'Mike', 'Mikael'
# for demonstration, make user2's readings 100 times larger
user2['reading'] *= 100
# combine both users' rows and index them hierarchically by
# who, then interval, then axis
multi_user_sensor_data = (
    pd.concat([user1, user2])
    .set_index(['who', 'interval', 'axis'])
)
multi_user_sensor_data

In [None]:
# 'who' is the outermost index level, so a single label
# selects every row belonging to Mike
multi_user_sensor_data.loc['Mike']

In [None]:
# cross-section on the 'interval' level: readings for all
# users and axes at interval 1
multi_user_sensor_data.xs(1, level='interval')

In [None]:
# unstack() with no argument moves the innermost index level
# into the columns; the index was built as who, interval, axis,
# so this unstacks the axis level
multi_user_sensor_data.unstack()

In [None]:
# unstack at level=0, the outermost index level (who), so each
# user gets their own column
multi_user_sensor_data.unstack(level=0)

In [None]:
# unstack the who and axis levels together, leaving interval
# as the only row index level
unstacked = multi_user_sensor_data.unstack(['who', 'axis'])
unstacked

In [None]:
# and we can of course stack what we have unstacked
# this moves the who level from the columns back into the
# row index
unstacked.stack(level='who')

## Melting

In [None]:
# wide-format sample for the melt example: one row per person,
# one column per measurement
data = pd.DataFrame(dict(Name=['Mike', 'Mikael'],
                         Height=[6.1, 6.0],
                         Weight=[220, 185]))
data

In [None]:
# melt to long format: Name identifies each row, and every
# Height and Weight cell becomes its own (variable, value) row
data.melt(id_vars=['Name'], value_vars=['Height', 'Weight'])

## Performance benefits of stacked data

In [None]:
# compare the cost of scalar access through the stacked Series
# with scalar access through the DataFrame

# time the different methods
import timeit
# NOTE(review): this Timer is built but never run in this cell;
# the measurements below all come from timeit.timeit
t = timeit.Timer("stacked1[('one', 'a')]", 
                 "from __main__ import stacked1, df")
# r1: tuple lookup on the multi-indexed Series
r1 = timeit.timeit(lambda: stacked1.loc[('one', 'a')], 
                   number=10000)
# r2: chained label lookup, row 'one' and then column 'a'
r2 = timeit.timeit(lambda: df.loc['one']['a'], 
                   number=10000)
# r3: positional lookup
# NOTE(review): this reads row position 1, which is not
# guaranteed to be the row labelled 'one' that r1 and r2 read
r3 = timeit.timeit(lambda: df.iloc[1, 0], 
                   number=10000)

# total time for 10000 calls of each, in the order
# stacked lookup, chained .loc lookup, positional .iloc lookup
r1, r2, r3