### Concatenating & appending Series

In [1]:
import pandas as pd

# Four regional Series of US state abbreviations, each with the default 0..3 index
northeast, south, midwest, west = (
    pd.Series(states)
    for states in (
        ['CT', 'ME', 'MA', 'NH'],
        ['DE', 'FL', 'GA', 'MD'],
        ['IL', 'IN', 'MN', 'MO'],
        ['AZ', 'CO', 'ID', 'MT'],
    )
)

In [2]:
# Series.append() was deprecated in pandas 1.4 and removed in 2.0, so the Series
# are stacked with pd.concat() instead. Like append() did, it keeps each input's
# own index labels, so the labels 0-3 appear twice in the result.
east = pd.concat([northeast, south])
east.head(10)

0    CT
1    ME
2    MA
3    NH
0    DE
1    FL
2    GA
3    MD
dtype: object

In [4]:
# pd.concat() replaces Series.append(), which was removed in pandas 2.0.
# reset_index(drop=True) renumbers the result 0..n-1; drop=True discards the old
# (repeated) index labels rather than keeping them as an extra column
new_east = pd.concat([northeast, south]).reset_index(drop=True)
new_east.head(10)

0    CT
1    ME
2    MA
3    NH
4    DE
5    FL
6    GA
7    MD
dtype: object

In [5]:
# pd.concat() replaces Series.append(), which was removed in pandas 2.0.
# Without drop=True the old index labels are kept as an 'index' column,
# so the result is a DataFrame instead of a Series
new_east2 = pd.concat([northeast, south]).reset_index()
new_east2.head(10)

Unnamed: 0,index,0
0,0,CT
1,1,ME
2,2,MA
3,3,NH
4,0,DE
5,1,FL
6,2,GA
7,3,MD


In [6]:
# using concat() method: the inputs' index labels are kept exactly as they are
east = pd.concat(objs=[northeast, south])
print(east.head(10))

0    CT
1    ME
2    MA
3    NH
0    DE
1    FL
2    GA
3    MD
dtype: object


In [7]:
# ignore_index=True discards the inputs' labels and numbers the result 0..n-1
new_east = pd.concat(objs=(northeast, south), ignore_index=True)
new_east.head(10)

0    CT
1    ME
2    MA
3    NH
4    DE
5    FL
6    GA
7    MD
dtype: object

### Concatenating & appending DataFrames

In [8]:
# Two small census tables that share the same pair of columns
census_columns = ['2010 Census Population', 'Zip Code ZCTA']

raw_data1 = {
    '2010 Census Population': [479, 4716, 2405, 30670],
    'Zip Code ZCTA': [66407, 72732, 50579, 46241],
}
raw_data2 = {
    '2010 Census Population': [2180, 26669, 12221, 27481],
    'Zip Code ZCTA': [12776, 76092, 98360, 49464],
}

pop1, pop2 = (pd.DataFrame(raw, columns=census_columns)
              for raw in (raw_data1, raw_data2))

# visual separator used when several frames are displayed from one cell
empty_line = '--------------------------------------------------------'
pop1, empty_line, pop2

(   2010 Census Population  Zip Code ZCTA
 0                     479          66407
 1                    4716          72732
 2                    2405          50579
 3                   30670          46241,
 '--------------------------------------------------------',
    2010 Census Population  Zip Code ZCTA
 0                    2180          12776
 1                   26669          76092
 2                   12221          98360
 3                   27481          49464)

In [9]:
# check the column types: both columns were read as 64-bit integers
print(pop1.dtypes)

2010 Census Population    int64
Zip Code ZCTA             int64
dtype: object


In [10]:
# use the zip code as the row label of both tables
# NOTE: pop1/pop2 are rebound here, so this cell is not idempotent - running it
# a second time raises KeyError because 'Zip Code ZCTA' is no longer a column
pop1 = pop1.set_index('Zip Code ZCTA')
pop2 = pop2.set_index('Zip Code ZCTA')
pop1, empty_line, pop2

(               2010 Census Population
 Zip Code ZCTA                        
 66407                             479
 72732                            4716
 50579                            2405
 46241                           30670,
 '--------------------------------------------------------',
                2010 Census Population
 Zip Code ZCTA                        
 12776                            2180
 76092                           26669
 98360                           12221
 49464                           27481)

In [11]:
# stacking rows when both DFs have IDENTICAL column and index names
# (DataFrame.append() was removed in pandas 2.0; pd.concat() is its replacement)
pd.concat([pop1, pop2])

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670
12776,2180
76092,26669
98360,12221
49464,27481


Stacking DFs with different index and column names

In [12]:
population = pop1

# second table: its index is named 'Zip' (not 'Zip Code ZCTA') and its columns differ too
raw_data3 = {
    'unemployment': [0.11, 0.02, 0.33],
    'participants': [34447, 4800, 42],
    'Zip': [2860, 46241, 1097],
}

unemployment = pd.DataFrame(
    raw_data3, columns=['unemployment', 'participants', 'Zip']
).set_index('Zip')

population, empty_line, unemployment

(               2010 Census Population
 Zip Code ZCTA                        
 66407                             479
 72732                            4716
 50579                            2405
 46241                           30670,
 '--------------------------------------------------------',
        unemployment  participants
 Zip                              
 2860           0.11         34447
 46241          0.02          4800
 1097           0.33            42)

In [13]:
# notice that the Index '46241' appears in both DFs; stacking rows keeps it twice
# DataFrame.append() was removed in pandas 2.0. pd.concat() stacks rows by default,
# so the first call and the explicit axis='rows' call give the same result
pd.concat([population, unemployment]), empty_line, pd.concat([population, unemployment], axis='rows')

(       2010 Census Population  participants  unemployment
 66407                   479.0           NaN           NaN
 72732                  4716.0           NaN           NaN
 50579                  2405.0           NaN           NaN
 46241                 30670.0           NaN           NaN
 2860                      NaN       34447.0          0.11
 46241                     NaN        4800.0          0.02
 1097                      NaN          42.0          0.33,
 '--------------------------------------------------------',
        2010 Census Population  participants  unemployment
 66407                   479.0           NaN           NaN
 72732                  4716.0           NaN           NaN
 50579                  2405.0           NaN           NaN
 46241                 30670.0           NaN           NaN
 2860                      NaN       34447.0          0.11
 46241                     NaN        4800.0          0.02
 1097                      NaN          42.0         

In [14]:
# side by side: rows are aligned on the index labels, with NaN where a label is missing from one DF
pd.concat([population, unemployment], axis='columns')

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,,0.11,34447.0
46241,30670.0,0.02,4800.0
50579,2405.0,,
66407,479.0,,
72732,4716.0,,


### Concatenation, keys & MultiIndex

In [15]:
# if we want to concatenate DFs which have common row and column labels

# first precipitation table (becomes rain13 further down)
raw1 = {'Precipitation': [0.096, 0.067, 0.061],
        'Month': ['Jan', 'Feb', 'Mar']}

# second precipitation table with the same month labels (becomes rain14 further down)
raw2 = {'Precipitation': [0.050, 0.082, 0.071],
        'Month': ['Jan', 'Feb', 'Mar']}

def create_df(raw_dict, index_col=None):
    """Build a DataFrame from a dict of columns, optionally indexed by one of them.

    Parameters
    ----------
    raw_dict : dict
        Maps each column label to its list of values; the column order of the
        result follows the dict's key order.
    index_col : label or list of labels, optional
        Column(s) to move into the index. When None (the default) the default
        integer index is kept.

    Returns
    -------
    pandas.DataFrame
    """
    df = pd.DataFrame(raw_dict, columns=list(raw_dict.keys()))

    # identity check: `==` can be overridden (e.g. element-wise on arrays), `is` cannot
    if index_col is None:
        return df
    return df.set_index(index_col)

In [16]:
# build both precipitation tables with 'Month' as the index
rain13 = create_df(raw1, index_col='Month')
rain14 = create_df(raw2, index_col='Month')
rain13, empty_line, rain14

(       Precipitation
 Month               
 Jan            0.096
 Feb            0.067
 Mar            0.061,
 '--------------------------------------------------------',
        Precipitation
 Month               
 Jan            0.050
 Feb            0.082
 Mar            0.071)

In [17]:
# stacking the rows keeps both sets of month labels, so Jan/Feb/Mar each appear twice
pd.concat([rain13, rain14], axis='rows')

# these months correspond to different years

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096
Feb,0.067
Mar,0.061
Jan,0.05
Feb,0.082
Mar,0.071


In [18]:
# Using multi-index on rows: keys= adds an outer index level (the year) above 'Month'
pd.concat([rain13, rain14], keys=[2013, 2014], axis='rows')

Unnamed: 0_level_0,Unnamed: 1_level_0,Precipitation
Unnamed: 0_level_1,Month,Unnamed: 2_level_1
2013,Jan,0.096
2013,Feb,0.067
2013,Mar,0.061
2014,Jan,0.05
2014,Feb,0.082
2014,Mar,0.071


In [19]:
# Concatenating columns: each DF contributes a 'Precipitation' column, so the label is duplicated
pd.concat([rain13, rain14], axis='columns')

Unnamed: 0_level_0,Precipitation,Precipitation
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,0.096,0.05
Feb,0.067,0.082
Mar,0.061,0.071


In [20]:
# Using multi-index on columns: keys= adds an outer column level (the year) above 'Precipitation'
rain1314 = pd.concat([rain13, rain14], keys=[2013, 2014], axis='columns')
rain1314

Unnamed: 0_level_0,2013,2014
Unnamed: 0_level_1,Precipitation,Precipitation
Month,Unnamed: 1_level_2,Unnamed: 2_level_2
Jan,0.096,0.05
Feb,0.067,0.082
Mar,0.061,0.071


In [21]:
# selecting an outer-level key returns the columns that belong to that year
rain1314[2013]

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096
Feb,0.067
Mar,0.061


### Outer and inner joins

In [22]:
# recall the two DFs: the only index label they share is 46241
population, empty_line, unemployment

(               2010 Census Population
 Zip Code ZCTA                        
 66407                             479
 72732                            4716
 50579                            2405
 46241                           30670,
 '--------------------------------------------------------',
        unemployment  participants
 Zip                              
 2860           0.11         34447
 46241          0.02          4800
 1097           0.33            42)

In [23]:
# concatenation with an inner join: only index labels present in both DFs are kept
pd.concat([population, unemployment], axis='columns', join='inner')

Unnamed: 0,2010 Census Population,unemployment,participants
46241,30670,0.02,4800


In [24]:
# outer join (the default for concat): union of the index labels, NaN where a value is missing
pd.concat([population, unemployment], axis='columns', join='outer')

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,,0.11,34447.0
46241,30670.0,0.02,4800.0
50579,2405.0,,
66407,479.0,,
72732,4716.0,,


### Merging DFs

merge() extends concat() with the ability to align rows using the values of one or more columns.

In [42]:
# named popul_data rather than `dict` so the built-in dict type is not shadowed
# for the rest of the kernel session
popul_data = {'Zipcode': [16855, 15681, 18657, 17307, 15635],
              '2010 Census Population': [282, 5241, 11985, 5899, 220]}

popul = pd.DataFrame(popul_data)
popul

Unnamed: 0,2010 Census Population,Zipcode
0,282,16855
1,5241,15681
2,11985,18657
3,5899,17307
4,220,15635


In [47]:
# creating a 14 line DF (one row per city)
import random

random.seed(0)  # fixed seed so Restart & Run All reproduces the same filler zipcodes

cities = ['manheim', 'preston park', 'biglerville', 'indiana',
          'curwensville', 'crown', 'harveys lake', 'mineral springs',
          'cassville', 'hannastown', 'saltsburg', 'tunkhannock',
          'pittsburgh', 'lemasters']

# zipcodes shared with `popul`, keyed by the df_cities row that receives each one
known_zipcodes = {7: 16855, 10: 15681, 11: 18657, 2: 17307, 9: 15635}

# filler zipcodes for the other rows: drawn without replacement and never equal to
# a known zipcode, so a merge on 'Zipcode' matches exactly one city per known zipcode
candidates = [z for z in range(17000, 19000) if z not in known_zipcodes.values()]
zipcodes = random.sample(candidates, len(cities))

dictionary = {'Zipcode': zipcodes,
              'City': cities}

df_cities = pd.DataFrame(dictionary)
df_cities['City'] = df_cities.City.str.upper()
df_cities['State'] = 'PA'
# overwrite the filler value on the rows that must line up with `popul`
for row, zipcode in known_zipcodes.items():
    df_cities.loc[row, 'Zipcode'] = zipcode
df_cities, empty_line, popul

(               City  Zipcode State
 0           MANHEIM    17027    PA
 1      PRESTON PARK    17000    PA
 2       BIGLERVILLE    17307    PA
 3           INDIANA    18653    PA
 4      CURWENSVILLE    18321    PA
 5             CROWN    18580    PA
 6      HARVEYS LAKE    18316    PA
 7   MINERAL SPRINGS    16855    PA
 8         CASSVILLE    17334    PA
 9        HANNASTOWN    15635    PA
 10        SALTSBURG    15681    PA
 11      TUNKHANNOCK    18657    PA
 12       PITTSBURGH    18663    PA
 13        LEMASTERS    18768    PA,
 '--------------------------------------------------------',
    2010 Census Population  Zipcode
 0                     282    16855
 1                    5241    15681
 2                   11985    18657
 3                    5899    17307
 4                     220    15635)

In [48]:
# Merging popul and df_cities into single DF, linking City and State with a given Zipcode
# Combining the DFs by aligning on the 'Zipcode' column
pd.merge(popul, df_cities) # inner join by default

# with no on= argument, merge joins on all the columns that occur in both DFs (here only 'Zipcode')

Unnamed: 0,2010 Census Population,Zipcode,City,State
0,282,16855,MINERAL SPRINGS,PA
1,5241,15681,SALTSBURG,PA
2,11985,18657,TUNKHANNOCK,PA
3,5899,17307,BIGLERVILLE,PA
4,220,15635,HANNASTOWN,PA


In [49]:
# Another example: two tables with identical column labels ('NOC', 'Country', 'Total')
dict_bronze = {
    'NOC': ['USA', 'URS', 'GBR', 'FRA', 'GER'],
    'Country': ['United States', 'Soviet Union', 'United Kingdom',
                'France', 'Germany'],
    'Total': [1052, 584, 505, 475, 454],
}
dict_gold = {
    'NOC': ['USA', 'URS', 'GBR', 'ITA', 'GER'],
    'Country': ['United States', 'Soviet Union', 'United Kingdom',
                'Italy', 'Germany'],
    'Total': [2088, 838, 498, 460, 407],
}

bronze, gold = pd.DataFrame(dict_bronze), pd.DataFrame(dict_gold)

bronze, empty_line, gold

(          Country  NOC  Total
 0   United States  USA   1052
 1    Soviet Union  URS    584
 2  United Kingdom  GBR    505
 3          France  FRA    475
 4         Germany  GER    454,
 '--------------------------------------------------------',
           Country  NOC  Total
 0   United States  USA   2088
 1    Soviet Union  URS    838
 2  United Kingdom  GBR    498
 3           Italy  ITA    460
 4         Germany  GER    407)

In [50]:
# By default merge() joins on ALL the columns the two tables have in common
pd.merge(bronze, gold)

Unnamed: 0,Country,NOC,Total


This gives an empty DF because, by default, merge() joins on all the columns the two DFs have in common and returns only the rows whose values match in every one of them. The 'Total' values differ in every row (and Italy and France each appear in only one DF), so no row matches. We can use a particular column to make the match:

In [53]:
pd.merge(bronze, gold, on='NOC')

# the other shared column labels get a suffix per source DF: _x for bronze (left) and _y for gold (right).
# notice how Italy and France are not included since it is an inner join

Unnamed: 0,Country_x,NOC,Total_x,Country_y,Total_y
0,United States,USA,1052,United States,2088
1,Soviet Union,URS,584,Soviet Union,838
2,United Kingdom,GBR,505,United Kingdom,498
3,Germany,GER,454,Germany,407


The 'Country_x' and 'Country_y' columns are identical. To eliminate one of them, we can also merge on the 'Country' column:

In [54]:
# joining on both keys keeps a single 'Country' column; only 'Total' still needs suffixes
pd.merge(bronze, gold, on=['NOC', 'Country'])

Unnamed: 0,Country,NOC,Total_x,Total_y
0,United States,USA,1052,2088
1,Soviet Union,URS,584,838
2,United Kingdom,GBR,505,498
3,Germany,GER,454,407


The suffixes for the column labels can also be modified via 'suffixes=' argument

In [55]:
# suffixes= takes one suffix for the left DF and one for the right DF, in that order
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold'])

Unnamed: 0,Country,NOC,Total_bronze,Total_gold
0,United States,USA,1052,2088
1,Soviet Union,URS,584,838
2,United Kingdom,GBR,505,498
3,Germany,GER,454,407


The merge() method works well when the column labels match, but let's see what to do when they don't.

In [57]:
# county lookup table; its city column is labelled 'CITY NAME', not 'City' as in df_cities
dict_counties = {
    'CITY NAME': ['SALTSBURG', 'MINERAL SPRINGS', 'BIGLERVILLE',
                  'HANNASTOWN', 'TUNKHANNOCK'],
    'COUNTY NAME': ['INDIANA', 'CLEARFIELD', 'ADAMS',
                    'WESTMORELAND', 'WYOMING'],
}
counties = pd.DataFrame(data=dict_counties)
counties, empty_line, df_cities.tail()

(         CITY NAME   COUNTY NAME
 0        SALTSBURG       INDIANA
 1  MINERAL SPRINGS    CLEARFIELD
 2      BIGLERVILLE         ADAMS
 3       HANNASTOWN  WESTMORELAND
 4      TUNKHANNOCK       WYOMING,
 '--------------------------------------------------------',
            City  Zipcode State
 9    HANNASTOWN    15635    PA
 10    SALTSBURG    15681    PA
 11  TUNKHANNOCK    18657    PA
 12   PITTSBURGH    18663    PA
 13    LEMASTERS    18768    PA)

In [58]:
# the key columns are labelled differently, so name one per side with left_on= / right_on=
# (both key columns, 'CITY NAME' and 'City', are kept in the result)
pd.merge(counties, df_cities, left_on='CITY NAME', right_on='City')

Unnamed: 0,CITY NAME,COUNTY NAME,City,Zipcode,State
0,SALTSBURG,INDIANA,SALTSBURG,15681,PA
1,MINERAL SPRINGS,CLEARFIELD,MINERAL SPRINGS,16855,PA
2,BIGLERVILLE,ADAMS,BIGLERVILLE,17307,PA
3,HANNASTOWN,WESTMORELAND,HANNASTOWN,15635,PA
4,TUNKHANNOCK,WYOMING,TUNKHANNOCK,18657,PA


### Joining DFs

In [59]:
# recall the two tables: France appears only in bronze, Italy only in gold
bronze, empty_line, gold

(          Country  NOC  Total
 0   United States  USA   1052
 1    Soviet Union  URS    584
 2  United Kingdom  GBR    505
 3          France  FRA    475
 4         Germany  GER    454,
 '--------------------------------------------------------',
           Country  NOC  Total
 0   United States  USA   2088
 1    Soviet Union  URS    838
 2  United Kingdom  GBR    498
 3           Italy  ITA    460
 4         Germany  GER    407)

In [60]:
# we can specify the 'how=' argument to choose which rows are kept
pd.merge(bronze, gold, on=['NOC', 'Country'], how='inner') # how='inner' is the default

Unnamed: 0,Country,NOC,Total_x,Total_y
0,United States,USA,1052,2088
1,Soviet Union,URS,584,838
2,United Kingdom,GBR,505,498
3,Germany,GER,454,407


In [62]:
# left join: every bronze row is kept; France has no gold row, so its Total_gold is NaN
pd.merge(bronze, gold, on=['NOC', 'Country'], how='left', suffixes=['_bronze', '_gold'])

Unnamed: 0,Country,NOC,Total_bronze,Total_gold
0,United States,USA,1052,2088.0
1,Soviet Union,URS,584,838.0
2,United Kingdom,GBR,505,498.0
3,France,FRA,475,
4,Germany,GER,454,407.0


In [63]:
# right join: every gold row is kept; Italy has no bronze row, so its Total_bronze is NaN
pd.merge(bronze, gold, on=['NOC', 'Country'], how='right', suffixes=['_bronze', '_gold'])

Unnamed: 0,Country,NOC,Total_bronze,Total_gold
0,United States,USA,1052.0,2088
1,Soviet Union,URS,584.0,838
2,United Kingdom,GBR,505.0,498
3,Germany,GER,454.0,407
4,Italy,ITA,,460


In [64]:
# outer join: rows from both tables are kept, with NaN wherever one side has no match
pd.merge(bronze, gold, on=['NOC', 'Country'], how='outer', suffixes=['_bronze', '_gold'])

Unnamed: 0,Country,NOC,Total_bronze,Total_gold
0,United States,USA,1052.0,2088.0
1,Soviet Union,URS,584.0,838.0
2,United Kingdom,GBR,505.0,498.0
3,France,FRA,475.0,
4,Germany,GER,454.0,407.0
5,Italy,ITA,,460.0


In [65]:
# the two index-keyed DFs used for the join() examples below
population, empty_line, unemployment

(               2010 Census Population
 Zip Code ZCTA                        
 66407                             479
 72732                            4716
 50579                            2405
 46241                           30670,
 '--------------------------------------------------------',
        unemployment  participants
 Zip                              
 2860           0.11         34447
 46241          0.02          4800
 1097           0.33            42)

Using the '.join()' method

In [66]:
# join() aligns the two DFs on their index; how='left' keeps every row of population
population.join(unemployment, how='left')

Unnamed: 0_level_0,2010 Census Population,unemployment,participants
Zip Code ZCTA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
66407,479,,
72732,4716,,
50579,2405,,
46241,30670,0.02,4800.0


In [67]:
# how='right' keeps every row of unemployment instead
population.join(unemployment, how='right')

Unnamed: 0_level_0,2010 Census Population,unemployment,participants
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2860,,0.11,34447
46241,30670.0,0.02,4800
1097,,0.33,42


In [68]:
# how='inner' keeps only the index labels found in both DFs (just 46241 here)
population.join(unemployment, how='inner')

Unnamed: 0,2010 Census Population,unemployment,participants
46241,30670,0.02,4800


In [69]:
# how='outer' keeps the union of the index labels, with NaN where a value is missing
population.join(unemployment, how='outer')

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,,0.11,34447.0
46241,30670.0,0.02,4800.0
50579,2405.0,,
66407,479.0,,
72732,4716.0,,


### Ordered merges

In [2]:
import pandas as pd

# load the sales records; 'Date' is parsed to datetimes so the rows can be put in chronological order
file = 'csv_files/feb_sales.csv'

sales_df = pd.read_csv(file, parse_dates=['Date'])
sales_df = sales_df.sort_values(by='Date')
sales_df

Unnamed: 0,Date,Company,Product,Units
3,2015-02-02 08:33:01,Hooli,Software,3
17,2015-02-02 20:54:49,Mediacore,Hardware,9
2,2015-02-03 14:14:18,Initech,Software,13
14,2015-02-04 15:36:29,Streeplex,Software,13
8,2015-02-04 21:52:45,Acme Coporation,Hardware,14
5,2015-02-05 01:53:06,Acme Coporation,Software,19
13,2015-02-05 22:05:03,Hooli,Service,10
10,2015-02-07 22:58:10,Acme Coporation,Hardware,1
6,2015-02-09 08:57:30,Streeplex,Service,19
9,2015-02-09 13:09:55,Mediacore,Software,7


In [77]:
# split the sales into one DF per product type (rows keep their original index labels)
software = sales_df.loc[sales_df['Product'] == 'Software']
hardware = sales_df.loc[sales_df['Product'] == 'Hardware']

software, empty_line, hardware

(                  Date          Company   Product  Units
 3  2015-02-02 08:33:01            Hooli  Software      3
 2  2015-02-03 14:14:18          Initech  Software     13
 14 2015-02-04 15:36:29        Streeplex  Software     13
 5  2015-02-05 01:53:06  Acme Coporation  Software     19
 9  2015-02-09 13:09:55        Mediacore  Software      7
 7  2015-02-11 20:03:08          Initech  Software      7
 11 2015-02-11 22:50:44            Hooli  Software      4
 1  2015-02-16 12:09:19            Hooli  Software     10
 18 2015-02-21 05:01:26        Mediacore  Software      3,
 '--------------------------------------------------------',
                   Date          Company   Product  Units
 17 2015-02-02 20:54:49        Mediacore  Hardware      9
 8  2015-02-04 21:52:45  Acme Coporation  Hardware     14
 10 2015-02-07 22:58:10  Acme Coporation  Hardware      1
 16 2015-02-19 10:59:33        Mediacore  Hardware     16
 19 2015-02-21 20:41:47            Hooli  Hardware      3)

In [72]:
# merge_ordered() returns the combined rows sorted by the join keys, so the dates come out in order
pd.merge_ordered(hardware, software) # the default join is how='outer'

Unnamed: 0,Date,Company,Product,Units
0,2015-02-02 08:33:01,Hooli,Software,3
1,2015-02-02 20:54:49,Mediacore,Hardware,9
2,2015-02-03 14:14:18,Initech,Software,13
3,2015-02-04 15:36:29,Streeplex,Software,13
4,2015-02-04 21:52:45,Acme Coporation,Hardware,14
5,2015-02-05 01:53:06,Acme Coporation,Software,19
6,2015-02-07 22:58:10,Acme Coporation,Hardware,1
7,2015-02-09 13:09:55,Mediacore,Software,7
8,2015-02-11 20:03:08,Initech,Software,7
9,2015-02-11 22:50:44,Hooli,Software,4


In [79]:
# we could have used ordinary merge as well,
# but here the hardware rows come first and the dates are not in order
pd.merge(hardware, software, how='outer')

Unnamed: 0,Date,Company,Product,Units
0,2015-02-02 20:54:49,Mediacore,Hardware,9
1,2015-02-04 21:52:45,Acme Coporation,Hardware,14
2,2015-02-07 22:58:10,Acme Coporation,Hardware,1
3,2015-02-19 10:59:33,Mediacore,Hardware,16
4,2015-02-21 20:41:47,Hooli,Hardware,3
5,2015-02-02 08:33:01,Hooli,Software,3
6,2015-02-03 14:14:18,Initech,Software,13
7,2015-02-04 15:36:29,Streeplex,Software,13
8,2015-02-05 01:53:06,Acme Coporation,Software,19
9,2015-02-09 13:09:55,Mediacore,Software,7


In [85]:
# get the dates ordered: sort the merged rows by 'Date', then renumber the index 0..n-1
pd.merge(hardware, software, how='outer').sort_values('Date').reset_index(drop=True)

Unnamed: 0,Date,Company,Product,Units
0,2015-02-02 08:33:01,Hooli,Software,3
1,2015-02-02 20:54:49,Mediacore,Hardware,9
2,2015-02-03 14:14:18,Initech,Software,13
3,2015-02-04 15:36:29,Streeplex,Software,13
4,2015-02-04 21:52:45,Acme Coporation,Hardware,14
5,2015-02-05 01:53:06,Acme Coporation,Software,19
6,2015-02-07 22:58:10,Acme Coporation,Hardware,1
7,2015-02-09 13:09:55,Mediacore,Software,7
8,2015-02-11 20:03:08,Initech,Software,7
9,2015-02-11 22:50:44,Hooli,Software,4
