In [3]:
# Chapter 8: Data Wrangling - Join, Combine, and Reshape
import pandas as pd
import numpy as np

In [4]:
# Hierarchical Indexing
# enables to have multiple index levels on an axis
# provides a way to work with higher dimensional data in a lower dimensional form
data = pd.Series(np.random.uniform(size=9),
                 index=[["a","a","a","b","b","c","c","d","d"],
                        [1,2,3,1,3,1,2,2,3]])
data

a  1    0.707968
   2    0.454253
   3    0.270795
b  1    0.807907
   3    0.699294
c  1    0.921825
   2    0.021760
d  2    0.120563
   3    0.507892
dtype: float64

In [5]:
# Series with a Multiindex as its index, gap in the index means use the label directly above
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [6]:
# with hierarchically indexed object, partial indexing is possible, enabling to concisely select subsets of data
data["b"]

1    0.807907
3    0.699294
dtype: float64

In [8]:
data["b":"c"]

b  1    0.807907
   3    0.699294
c  1    0.921825
   2    0.021760
dtype: float64

In [9]:
data.loc[["b","d"]]

b  1    0.807907
   3    0.699294
d  2    0.120563
   3    0.507892
dtype: float64

In [11]:
# selection is even possible from an "inner" level; selecting all of the values having the value 2 from the second index level
data.loc[:,2]

a    0.454253
c    0.021760
d    0.120563
dtype: float64

In [13]:
# hierarchical indexing plays an important role in reshaping data and in group-based operations like pivot table
# rearranging data into DataFrame using .unstack()
data.unstack()

Unnamed: 0,1,2,3
a,0.707968,0.454253,0.270795
b,0.807907,,0.699294
c,0.921825,0.02176,
d,,0.120563,0.507892


In [14]:
# the inverse operation of .unstack() is .stack()
data.unstack().stack()

a  1    0.707968
   2    0.454253
   3    0.270795
b  1    0.807907
   3    0.699294
c  1    0.921825
   2    0.021760
d  2    0.120563
   3    0.507892
dtype: float64

In [15]:
# with a DataFrame, either axis can have a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4,3)),
                     index=[["a","a","b","b"],[1,2,1,2,]],
                     columns=[["Ohio","Ohio","Colorado"],
                              ["Green","Red","Green"]])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
# the hierarchical levels can have names (as strings or any Python objects); if so, these will show up in the console output
frame.index.names = ["key1","key2"]
frame.columns.names = ["state","color"]
frame

# these names supersede the name attribute, which is used only with single-level indexes; index names are not part of the row labels

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [17]:
# you can see how many levels an index has by accessing its .nlevels attribute:
frame.index.nlevels

2

In [19]:
# with partial column indexing you can similarly select groups of columns:
frame["Ohio"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [20]:
# a MultiIndex can be created by itself and then reused; 
# the columns in the preceding DataFrame with level names could also be created like below:
pd.MultiIndex.from_arrays([["Ohio","Ohio","Colorado"],
                           ["Green","Red","Green"]],
                          names=["state","color"])

MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])

In [21]:
# Reordering and Sorting levels
# rearrange the order of the levels on an axis or sort the data by the valeus in one specific level
# .swaplevel() takes two level numvers or names and returns a new object with the levels interchanged (but data is otherwise unaltered)
frame.swaplevel("key1","key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [28]:
# .sort_index() by default sorts the data using all the index levels; 
# but can choose to use only a single level or a subset of levels to sort by passing the level argument
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [29]:
frame.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [30]:
# data selection performance is much better on heirarchically indexed objects if the index is sorted starting with the outermost level
# that is, the result of calling .sort_index(level=0) or .sort_index()

In [31]:
# Summary Statistics by Level
# many descriptive and summary statistics on DataFrame and Series have a level option in which you can specify the level you want to aggregate by on a particular axis
# can aggregate by level on either the rows or columns:
frame.groupby(level="key2").sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [32]:
frame.groupby(level="color",axis="columns").sum()

  frame.groupby(level="color",axis="columns").sum()


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [33]:
# Indexing with a DataFrames's columns
# moving columns from a DataFrame as the row index; alternatively, moving row index into DataFrame's columns
frame = pd.DataFrame({"a":range(7),"b":range(7,0,-1),
                      "c":["one","one","one","two","two","two","two"],
                      "d":[0,1,2,0,1,2,3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [35]:
# DataFrame's .set_index() will create a new DataFrame using one or more of its columns as the index:
frame2 = frame.set_index(["c","d"])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [36]:
# by default,the columns are removed from the DataFrame, though you can leave them in by passing drop=False to .set_index():
frame.set_index(["c","d"],drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [37]:
# on the other hand, .reset_index() does the opposite;
# the hierarchical index levels are moved into the columns
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [38]:
# Combining and Merging Datasets

In [39]:
# methods of combining data objects in pandas

# 1. pandas.merge : Connect rows in DataFrames baed on one or more keys; SQL JOIN method
# 2. pandas.concat : Cancatenate or "stack" objects together along an axis
# 3. combine_first : Splice together overlapping dat to fill in missing values in one object with value from another

In [40]:
# Database-Style DatFrame Joins
# pd.merge() is the main entry point
df1 = pd.DataFrame({"key":["b","b","a","c","a","a","b"],
                    "data1":pd.Series(range(7),dtype="Int64")})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [41]:
df2 = pd.DataFrame({"key":["a","b","d"],
                    "data2":pd.Series(range(3),dtype="Int64")})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [42]:
# using Int64 extention type for nullable integers
# example of "many-to-one-join" : df1's multiple rows of "a" and "b" whereas df2 has only one corresponding row for each
pd.merge(df1,df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,b,6,1


In [43]:
# note that for above example, didn't specify which column to join on
# if not specified, pd.merge() uses the overlapping column names as keys - it's a good practice to specify explicitly
pd.merge(df1,df2,on="key")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,b,6,1


In [44]:
# in general, the order of column output is unspecified
# if column names are different in each object, can specify them separately:
df3 = pd.DataFrame({"lkey":["b","b","a","c","a","a","b"],
                    "data1":pd.Series(range(7),dtype="Int64")})
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [45]:
df4 = pd.DataFrame({"rkey":["a","b","d"],
                    "data2":pd.Series(range(3),dtype="Int64")})
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [46]:
pd.merge(df3,df4,left_on="lkey",right_on="rkey")

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,a,2,a,0
3,a,4,a,0
4,a,5,a,0
5,b,6,b,1


In [47]:
# "c" and "d" values and assiciated data are missing from the result above
# by default, pd.merge() does an "INNER JOIN"; the keys in the result are the intesection, or common set
# other possible options are "left", "right", and "outer" (union of the keys)
pd.merge(df1,df2,how="outer")

Unnamed: 0,key,data1,data2
0,a,2.0,0.0
1,a,4.0,0.0
2,a,5.0,0.0
3,b,0.0,1.0
4,b,1.0,1.0
5,b,6.0,1.0
6,c,3.0,
7,d,,2.0


In [48]:
pd.merge(df3,df4,left_on="lkey",right_on="rkey",how="outer")

Unnamed: 0,lkey,data1,rkey,data2
0,a,2.0,a,0.0
1,a,4.0,a,0.0
2,a,5.0,a,0.0
3,b,0.0,b,1.0
4,b,1.0,b,1.0
5,b,6.0,b,1.0
6,c,3.0,,
7,,,d,2.0


In [49]:
# in outer join, rows from the left or right DataFrame objects that do not match will appear with NA

In [50]:
# Different join types with the "how" argument
# how="inner" : Use only the key combinations observed in both tables
# how="left" : Use all key combinations found in the left table
# how="right" : Use all key combinations found in the right table
# how="outer" : Use all key combinations observed in both tables together

In [51]:
# Many-to-many merges from Cartesian product of the matching keys
df1 = pd.DataFrame({"key":["b","b","a","c","a","b"],
                    "data1":pd.Series(range(6),dtype="Int64")})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [52]:
df2 = pd.DataFrame({"key":["a","b","a","b","d"],
                    "data2":pd.Series(range(5),dtype="Int64")})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [53]:
pd.merge(df1,df2,on="key",how="left")

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [54]:
# since there were three "b" rows in the left DF and two in the right one, there are six "b" rows in the result
# the join method passed to the how keyword argument affects only the distinct key values appearing in the result:
pd.merge(df1,df2,how="inner")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,a,2,0
5,a,2,2
6,a,4,0
7,a,4,2
8,b,5,1
9,b,5,3


In [55]:
# to merge with multiple keys, pass a list of column names:
left = pd.DataFrame({"key1":["foo","foo","bar"],
                     "key2":["one","two","one"],
                     "lval":pd.Series([1,2,3],dtype='Int64')})
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [56]:
right = pd.DataFrame({"key1":["foo","foo","bar","bar"],
                      "key2":["one","one","one","two"],
                      "rval":pd.Series([4,5,6,7],dtype='Int64')})
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [57]:
pd.merge(left,right,on=["key1","key2"],how="outer")

Unnamed: 0,key1,key2,lval,rval
0,bar,one,3.0,6.0
1,bar,two,,7.0
2,foo,one,1.0,4.0
3,foo,one,1.0,5.0
4,foo,two,2.0,


In [58]:
# to determine which key combinations will appear in the result depending on the choice of merge method;
# think of the multiple keys as forming an array of tuples to be used as a single join key
# when joining columns on columns, indexes on the passed DF objects are discarded; if preservation needed, use .reset_index() to append the index to columns
# last issue to consider in merge operations is the treatment of overlapping column names
pd.merge(left,right,on="key1")

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [59]:
# you can either rename axis indexes or,
# pd.merge() has a suffixes option for specifying strings to append to overlapping names in the left and right DataFrame objects:
pd.merge(left,right,on="key1",suffixes=("_left","_right"))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [60]:
# .pd.merge() function arguments

# left : DataFrame to be merged on the left side
# right : DataFrame to be merged on the right side
# how : Type of join to apply; on eof "inner", "outer", "left", or "right"; defaults to "inner"
# on : Column names to join on; Must be found in both DataFrame objects;
#      If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys
# left_on : Columns in left DataFrame to use as join keys; Can be a single column name or a list of column names
# right_on : Analogous to left_on for right DataFrame
# left_index : Use row index in left as its join key (or keys, if a MultiIndex)
# right_index : Analogous to left_index
# sort : Sort merged data lexicographically by join keys; False by default
# suffixes : Tuple of string values to append to column names in case of overlap; defaults to ("_x","-y") (e.g., "data_x","data_y")
# copy : If False, avlid copying data into resulting data structure in some exceptional cases; by default always copies
# validate : Verifies if the merge is of the specified type, whether one-to-one, one-to-many, or many-to-many
# indicator : Adds a special column _merge that indicates the source of each row; 
#             values will be "left_only", "right_only", or "both" based on the origin of the joined data in each row

In [62]:
# Merging on Index

# in some cases, the merge key(s) in a DF will be found in its index (row labels);
# can pass left_index=True or right_index=True (or both) to indicate that the index should be used as the merge key:

left1 = pd.DataFrame({"key":["a","b","a","a","b","c"],
                      "value":pd.Series(range(6),dtype="Int64")})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [63]:
right1 = pd.DataFrame({"group_val":[3.5,7]},index=["a","b"])
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [65]:
pd.merge(left1,right1,left_on="key",right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0


In [66]:
# looking carefully, index values from left1 have been preserved, the indexes of the input DF objects are dropped
# because the index of right1 is unique, this "many-to-one" merge (with default how="inner") can preserve the index values from left1

In [67]:
# since the default pd.merge() is to intersect the join keys, can instead form the union of them with an outer join:
pd.merge(left1,right1,left_on="key",right_index=True,how="outer")

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [68]:
# with hierarchically indexed data, joining on index is equivalent to a multiple-key merge:
lefth = pd.DataFrame({"key1":["Ohio","Ohio","Ohio","Nevada","Nevada"],
                      "key2":[2000,2001,2002,2001,2002],
                      "data":pd.Series(range(5),
                                       dtype="Int64")})
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0
1,Ohio,2001,1
2,Ohio,2002,2
3,Nevada,2001,3
4,Nevada,2002,4


In [69]:
righth_index = pd.MultiIndex.from_arrays(
                [
                    ["Nevada","Nevada","Ohio","Ohio","Ohio","Ohio"],
                    [2001,2000,2000,2000,2001,2002]
                ])
righth_index

MultiIndex([('Nevada', 2001),
            ('Nevada', 2000),
            (  'Ohio', 2000),
            (  'Ohio', 2000),
            (  'Ohio', 2001),
            (  'Ohio', 2002)],
           )

In [70]:
righth = pd.DataFrame({"event1":pd.Series([0,2,4,6,8,10],dtype="Int64",
                                          index=righth_index),
                       "event2":pd.Series([1,3,5,7,9,11],dtype="Int64",
                                          index=righth_index)})
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [71]:
# in this case, you have to indicate multiple columns to merge on as a list (note the handling of duplicate index values with how="outer"):
pd.merge(lefth,righth,left_on=["key1","key2"],right_index=True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0,4,5
0,Ohio,2000,0,6,7
1,Ohio,2001,1,8,9
2,Ohio,2002,2,10,11
3,Nevada,2001,3,0,1


In [73]:
pd.merge(lefth,righth,left_on=["key1","key2"],right_index=True,how="outer")

Unnamed: 0,key1,key2,data,event1,event2
4,Nevada,2000,,2.0,3.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0


In [74]:
# using the indexes of both sides of the merge is also possible
left2 = pd.DataFrame([[1.,2.],[3.,4.],[5.,6.]],
                     index=["a","c","e"],
                     columns=["Ohio","Nevada"]).astype("Int64")
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [75]:
right2 = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[13,14]],
                      index=["b","c","d","e"],
                      columns=["Missouri","Alabama"]).astype("Int64")
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [76]:
pd.merge(left2,right2,how="outer",left_index=True,right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [77]:
# DataFrame has a .join() instance method to simplify merging by index
# can also be used to combine many DataFrame objects having the same or similar indexes but nonoverlapping columns

left2.join(right2,how="outer")

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [78]:
# DF's .join() performs a left join on the join keys by default
# .join() also supports joining the indexes of the passed DF on one of the columns of the calling DF:
# joining on key

left1.join(right1,on="key")

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [79]:
# for simple index-on-index merges, can pass a list of DFs to join as an alternative to using the more general .pd.concat()

another = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17.]],
                       index=["a","c","e","f"],
                       columns=["New York","Oregon"])
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [80]:
left2.join([right2,another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7.0,8.0
c,3,4,9.0,10.0,9.0,10.0
e,5,6,13.0,14.0,11.0,12.0


In [81]:
left2.join([right2,another],how="outer")

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


In [82]:
# Concatenating Along an Axis

# another kind of data combination operation is referred to interchangeably as concatenation or staking
# numpy arrays concatenating

arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [85]:
np.concatenate([arr,arr],axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [86]:
# for Series and DFs, having labeled axes enable you to further generalize array concatenation
# for concatenation there are additional concerns:
# if the objects are indexed differently on the other axes, should we combine the distinct elemnts in these axes or use only the values in common?
# do the concatenated chunks of data need to be identifiable as such in the resulting object?
# does the "concatenation axis" contain data that needs to be preserved? In many cases, the default integer labels in a DF are best discarded during concatenation
# the .concat() provides a consistent way to address each of the questions above.

In [87]:
# Suppose we have 3 Series with no index overlap:

s1 = pd.Series([0,1],index=["a","b"],dtype="Int64")
s2 = pd.Series([2,3,4],index=["c","d","e"],dtype="Int64")
s3 = pd.Series([5,6],index=["f","g"],dtype="Int64")

In [88]:
s1

a    0
b    1
dtype: Int64

In [89]:
s2

c    2
d    3
e    4
dtype: Int64

In [90]:
s3

f    5
g    6
dtype: Int64

In [91]:
# calling pd.concat() with these objects in a list glues together the values and indexes:
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64

In [92]:
# by defualt, pd.concat() works along axis="index", producing another Series
# if pass axis="columns" the result will instead be a DataFrame

pd.concat([s1,s2,s3],axis="columns")

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [93]:
# in this case there is no overlap on the other axis, so as seen above is a union ("outer" join) of the indexes
# can instead intersect them by passing join="inner"
s4 = pd.concat([s1,s3])
s4

a    0
b    1
f    5
g    6
dtype: Int64

In [95]:
pd.concat([s1,s4],axis="columns")

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [96]:
pd.concat([s1,s4],axis="columns",join="inner")

Unnamed: 0,0,1
a,0,0
b,1,1


In [97]:
# a potential issue is that the concatenated pieces are not identifiable in the result
# Suppose instead you wanted to create a hierarchical index on the concatenation axis - use the keys argument
result = pd.concat([s1,s2,s3],keys=["one","two","three"])
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: Int64