# **PANDAS Merge/Pivot**

Based on [PANDAS-TUTORIAL](https://github.com/TirendazAcademy/PANDAS-TUTORIAL/tree/main)

[PANDAS API Reference](https://pandas.pydata.org/docs/reference/index.html)

-----------------------------------------------------------------------------------------------------------------------------------------------------

In [1]:
# Installation
!pip install pandas
import pandas as pd # Let's import pandas with pd
import numpy as np

Defaulting to user installation because normal site-packages is not writeable


-----------------------------------------------------------------------------------------------------------------------------------------------------

# Merge Tables

### Joining DataFrame

In [2]:
# Left table: six rows keyed by letter; the key "c" appears twice.
d1 = pd.DataFrame({
    "key": list("abccde"),
    "num1": range(6),
})

# Right table: four rows keyed by letter.
d2 = pd.DataFrame({
    "key": list("bcef"),
    "num2": range(4),
})

In [3]:
# Plain-text dump of both tables, one after the other.
print(d1, d2, sep="\n")

  key  num1
0   a     0
1   b     1
2   c     2
3   c     3
4   d     4
5   e     5
  key  num2
0   b     0
1   c     1
2   e     2
3   f     3


In [4]:
# Without `on`, merge joins on the column(s) both frames share (here "key").
# Only a cell's last expression is rendered, so this first result is not displayed.
pd.merge(left=d1, right=d2)
# Same inner join with the key named explicitly; this is the table that is shown.
pd.merge(left=d1, right=d2, on="key")

Unnamed: 0,key,num1,num2
0,b,1,0
1,c,2,1
2,c,3,1
3,e,5,2


In [5]:
# Same data as d1/d2, but the key columns carry different names in each table.
d3 = pd.DataFrame({
    "key1": list("abccde"),
    "num1": range(6),
})

d4 = pd.DataFrame({
    "key2": list("bcef"),
    "num2": range(4),
})

In [6]:
# Key columns with different names: name each side's key explicitly.
# (Not the last expression of the cell, so its result is not rendered.)
pd.merge(left=d3, right=d4, left_on="key1", right_on="key2")

# The four join types on the shared "key" column.
# Only the final (inner) join is rendered as the cell output.
pd.merge(left=d1, right=d2, how="outer")
pd.merge(left=d1, right=d2, how="left")
pd.merge(left=d1, right=d2, how="right")
pd.merge(left=d1, right=d2, how="inner")

Unnamed: 0,key,num1,num2
0,b,1,0
1,c,2,1
2,c,3,1
3,e,5,2


In [7]:
# Two tables that share both a "key" and a "count" column.
df1 = pd.DataFrame({
    "key": list("abccde"),
    "num1": range(6),
    "count": ["one", "three", "two", "one", "one", "two"],
})

df2 = pd.DataFrame({
    "key": list("bcef"),
    "num2": range(4),
    "count": ["one", "two", "two", "two"],
})

In [8]:
# Outer join on the pair (key, count); result not rendered (not the last expression).
pd.merge(left=df1, right=df2, how="outer", on=["key", "count"])
# Outer join on "key" alone: the two "count" columns overlap; result not rendered.
pd.merge(left=df1, right=df2, how="outer", on="key")
# Inner join on "key" with custom suffixes for the overlapping "count" columns;
# this last expression is the table that is displayed.
pd.merge(
    left=df1,
    right=df2,
    on="key",
    suffixes=("_data1", "_data2"),
)

Unnamed: 0,key,num1,count_data1,num2,count_data2
0,b,1,three,0,one
1,c,2,two,1,two
2,c,3,one,1,two
3,e,5,two,2,two


### Merging on index

In [9]:
# Left table: the join key lives in the ordinary column "letter".
df1 = pd.DataFrame({
    "letter": list("aabbac"),
    "num": range(6),
})

# Right table: the join key lives in the index.
df2 = pd.DataFrame({"value": [3, 5, 7]}, index=list("abe"))

In [10]:
# Plain-text dump of both tables, one after the other.
print(df1, df2, sep="\n")

  letter  num
0      a    0
1      a    1
2      b    2
3      b    3
4      a    4
5      c    5
   value
a      3
b      5
e      7


In [11]:
# Match df1's "letter" column against df2's index labels (inner join by default).
pd.merge(left=df1, right=df2, left_on="letter", right_index=True)

Unnamed: 0,letter,num,value
0,a,0,3
1,a,1,3
2,b,2,5
3,b,3,5
4,a,4,3


In [12]:
# Two small frames with partly overlapping index labels (only "a" is shared).
right = pd.DataFrame(
    {"Tom": [1, 3, 5], "Tim": [2, 4, 6]},
    index=list("acd"),
)

left = pd.DataFrame(
    {"Sam": [7, 9, 11, 13], "Kim": [8, 10, 12, 14]},
    index=list("abef"),
)

In [13]:
# Outer join on the index of both frames. The frame named `right` is passed first,
# so its columns (Tom, Tim) come first in the result.
pd.merge(
    right,
    left,
    how="outer",
    left_index=True,
    right_index=True,
)

Unnamed: 0,Tom,Tim,Sam,Kim
a,1.0,2.0,7.0,8.0
b,,,9.0,10.0
c,3.0,4.0,,
d,5.0,6.0,,
e,,,11.0,12.0
f,,,13.0,14.0


In [14]:
# DataFrame.join aligns on the index; this default join is not rendered
# because it is not the last expression of the cell.
left.join(other=right)
# Outer join keeps the labels of both frames; this is the table that is displayed.
left.join(other=right, how="outer")

Unnamed: 0,Sam,Kim,Tom,Tim
a,7.0,8.0,1.0,2.0
b,9.0,10.0,,
c,,,3.0,4.0
d,,,5.0,6.0
e,11.0,12.0,,
f,13.0,14.0,,


In [15]:
# A third frame, indexed by a, b and f, to be joined together with `right`.
data = pd.DataFrame(
    {"Alex": [1, 5, 9], "Keta": [3, 7, 11]},
    index=list("abf"),
)

In [16]:
# Join several frames in one call by passing a list; the result keeps left's
# index labels (a, b, e, f) and holds NaN where `right` or `data` lack a label.
left.join([right,data])

Unnamed: 0,Sam,Kim,Tom,Tim,Alex,Keta
a,7.0,8.0,1.0,2.0,1.0,3.0
b,9.0,10.0,,,5.0,7.0
e,11.0,12.0,,,,
f,13.0,14.0,,,9.0,11.0


### Concatenating Along an Axis


In [17]:
# 4x5 grid holding the integers 0..19 in row-major order.
seq = np.arange(20).reshape(4, 5)

In [18]:
# Side by side (4x10); not rendered because it is not the last expression.
np.concatenate((seq, seq), axis=1)
# Stacked vertically (8x5); this is the array that is displayed.
np.concatenate((seq, seq), axis=0)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [19]:
# Three integer Series with disjoint index labels.
data1 = pd.Series({"a": 0, "b": 1})
data2 = pd.Series({"c": 2, "d": 3, "e": 4})
data3 = pd.Series({"f": 5, "g": 6})

In [20]:
# Stack the Series end to end (row axis); not rendered, not the last expression.
pd.concat([data1, data2, data3], axis=0)
# Place the Series side by side: one column per Series, NaN where a label is missing.
pd.concat([data1, data2, data3], axis="columns")

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [21]:
# A Series that shares the labels "a" and "b" with data1.
data4 = pd.Series({"a": 10, "b": 11, "c": 12})

In [22]:
# Side-by-side concat keeping only the labels present in both Series ("a" and "b").
pd.concat([data1, data4], join="inner", axis="columns")

Unnamed: 0,0,1
a,0,10
b,1,11


In [23]:
# Stack the Series end to end; `keys` adds an outer index level naming each source.
x = pd.concat(
    [data1, data2, data4],
    keys=["one", "two", "three"],
    axis=0,
)
x

one    a     0
       b     1
two    c     2
       d     3
       e     4
three  a    10
       b    11
       c    12
dtype: int64

In [24]:
# Side by side instead: here the `keys` become the column labels.
x = pd.concat(
    [data1, data2, data4],
    keys=["one", "two", "three"],
    axis="columns",
)
x

Unnamed: 0,one,two,three
a,0.0,,10.0
b,1.0,,11.0
c,,2.0,12.0
d,,3.0,
e,,4.0,


In [25]:
# 3x2 frame holding 0..5, indexed a, b, c.
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    columns=["one", "two"],
    index=list("abc"),
)

# 2x2 frame holding 10..13, indexed a, c (no row for "b").
df2 = pd.DataFrame(
    np.arange(10, 14).reshape(2, 2),
    columns=["three", "four"],
    index=list("ac"),
)

In [26]:
# Side-by-side concat; `keys` becomes the top level of a two-level column index.
pd.concat(
    [df1, df2],
    keys=["s1", "s2"],
    axis="columns",
    sort=False,
)

Unnamed: 0_level_0,s1,s1,s2,s2
Unnamed: 0_level_1,one,two,three,four
a,0,1,10.0,11.0
b,2,3,,
c,4,5,12.0,13.0


In [27]:
# Seed a dedicated generator so the random frames (and the table concatenated from
# them in the next cell) are identical on every Restart & Run All; the unseeded
# np.random.randn used before produced different numbers on each run.
rng = np.random.default_rng(seed=42)

# 3x4 frame of standard-normal draws with columns a-d.
data1 = pd.DataFrame(
    rng.standard_normal((3, 4)),
    columns=['a', 'b', 'c', 'd'])

# 2x3 frame with a subset of those columns in a different order (no 'c').
data2 = pd.DataFrame(
    rng.standard_normal((2, 3)),
    columns=['b', 'd', 'a'])

In [28]:
# Stack the frames vertically and renumber the rows 0..4 (ignore_index=True).
# Columns are aligned by name, so column 'c' is NaN for the rows that come from data2.
pd.concat([data1, data2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.137476,0.940753,0.015088,-0.149826
1,1.373517,-0.240621,0.196727,1.407608
2,0.683043,0.498399,-0.593724,0.155462
3,1.16941,-1.841331,,2.003502
4,0.902556,0.466291,,1.58863


-----------------------------------------------------------------------------------------------------------------------------------------------------

## More Merge Practice

-----------------------------------------------------------------------------------------------------------------------------------------------------

## Groupby, Pivot, Crosstab

#### Importing Employee Dataset for performing the Operations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Let's read the dataset (the path is relative to the current working directory)
data = pd.read_csv('data/employee.csv')
print(data.shape)  # (number of rows, number of columns)

(1470, 35)


In [None]:
# Let's check the head of the dataset.
# Use the fully qualified option name: a partial name such as 'max_Columns' is
# matched as a pattern against every registered option and raises OptionError as
# soon as more than one option matches (e.g. styler.render.max_columns).
pd.set_option('display.max_columns', 35)
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


### Groupby Function

* A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be used to group large amounts of data and compute operations on these groups.

In [None]:
# Let's read the documentation of the groupby function using the built-in help()

help(pd.DataFrame.groupby)

Help on function groupby in module pandas.core.generic:

groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, observed=False, **kwargs)
    Group DataFrame or Series using a mapper or by a Series of columns.
    
    A groupby operation involves some combination of splitting the
    object, applying a function, and combining the results. This can be
    used to group large amounts of data and compute operations on these
    groups.
    
    Parameters
    ----------
    by : mapping, function, label, or list of labels
        Used to determine the groups for the groupby.
        If ``by`` is a function, it's called on each value of the object's
        index. If a dict or Series is passed, the Series or dict VALUES
        will be used to determine the groups (the Series' values are first
        aligned; see ``.align()`` method). If an ndarray is passed, the
        values are used as-is determine the groups. A label or list of
        l

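The general syntax is ```data.groupby(by, ...).agg(func)```.
* ```data``` is the DataFrame to group
* ```by```   is the column (or list of columns) on which the grouping is done
* ```agg```  is a separate method, chained after ```groupby```, that applies the aggregate function(s) to each group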

Let's see some examples.

In [None]:
# List the column labels of the employee dataset
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [None]:
# Minimum, maximum and mean of Age and DailyRate within each department
data.groupby(['Department'])[['Age', 'DailyRate']].agg(['min', 'max', 'mean'])

Unnamed: 0_level_0,Age,Age,Age,DailyRate,DailyRate,DailyRate
Unnamed: 0_level_1,min,max,mean,min,max,mean
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Human Resources,19,59,37.809524,106,1444,751.539683
Research & Development,18,60,37.042664,102,1496,806.851197
Sales,18,60,36.542601,107,1499,800.275785


In [None]:
# Mean age per department

data.groupby(['Department'])[['Age']].mean()

Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Human Resources,37.809524
Research & Development,37.042664
Sales,36.542601


In [None]:
# Maximum age per department

data.groupby(['Department'])[['Age']].max()

Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Human Resources,59
Research & Development,60
Sales,60


In [None]:
# Minimum age per department

data.groupby(['Department'])[['Age']].min()

Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Human Resources,19
Research & Development,18
Sales,18


In [None]:
# Grouping on two keys at once: mean MonthlyRate for every
# (Department, EducationField) combination.
# NOTE(review): this tutorial describes the figure as "mean salaries", but the column
# aggregated is MonthlyRate; the dataset also has MonthlyIncome - confirm which is meant.

data.groupby(['Department', 'EducationField'])[['MonthlyRate']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MonthlyRate
Department,EducationField,Unnamed: 2_level_1
Human Resources,Human Resources,14810.740741
Human Resources,Life Sciences,12813.875
Human Resources,Medical,12668.230769
Human Resources,Other,9275.0
Human Resources,Technical Degree,13158.5
Research & Development,Life Sciences,14594.704545
Research & Development,Medical,14163.603306
Research & Development,Other,13051.765625
Research & Development,Technical Degree,14142.393617
Sales,Life Sciences,14523.786667


In [None]:
# Same aggregation with the grouping keys swapped: mean MonthlyRate for every
# (EducationField, Department) combination.

data.groupby(['EducationField', 'Department'])[['MonthlyRate']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MonthlyRate
EducationField,Department,Unnamed: 2_level_1
Human Resources,Human Resources,14810.740741
Life Sciences,Human Resources,12813.875
Life Sciences,Research & Development,14594.704545
Life Sciences,Sales,14523.786667
Marketing,Sales,14076.943396
Medical,Human Resources,12668.230769
Medical,Research & Development,14163.603306
Medical,Sales,15077.625
Other,Human Resources,9275.0
Other,Research & Development,13051.765625


In [None]:
# Same grouping, now averaging two value columns (MonthlyRate and DailyRate)
# for every (EducationField, Department) combination.

data.groupby(['EducationField', 'Department'])[['MonthlyRate', 'DailyRate']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MonthlyRate,DailyRate
EducationField,Department,Unnamed: 2_level_1,Unnamed: 3_level_1
Human Resources,Human Resources,14810.740741,675.259259
Life Sciences,Human Resources,12813.875,753.0625
Life Sciences,Research & Development,14594.704545,789.195455
Life Sciences,Sales,14523.786667,854.58
Marketing,Sales,14076.943396,727.836478
Medical,Human Resources,12668.230769,875.615385
Medical,Research & Development,14163.603306,825.730028
Medical,Sales,15077.625,802.909091
Other,Human Resources,9275.0,1005.0
Other,Research & Development,13051.765625,763.359375


### Pivot Tables Function

* We can create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame.

In [None]:
# Read the documentation of DataFrame.pivot_table using the built-in help()

help(pd.DataFrame.pivot_table)

Help on function pivot_table in module pandas.core.frame:

pivot_table(self, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False)
    Create a spreadsheet-style pivot table as a DataFrame. The levels in
    the pivot table will be stored in MultiIndex objects (hierarchical
    indexes) on the index and columns of the result DataFrame.
    
    Parameters
    ----------
    values : column to aggregate, optional
    index : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table index.  If an array is passed,
        it is being used as the same manner as column values.
    columns : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any o

The general syntax is ```pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', ...)```.
* ```data``` is a dataframe
* ```values``` contains the column to aggregate
* ```index``` is the row in the pivot table
* ```columns``` contains the columns you want in the pivot table
* ```aggfunc``` is the aggregate function

Let's see some examples.

In [None]:
# Pivot table of the mean age per department

pd.pivot_table(data, index='Department', values='Age', aggfunc='mean')

Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Human Resources,37.809524
Research & Development,37.042664
Sales,36.542601


In [None]:
# Pivot table of the maximum age per department

pd.pivot_table(data, index='Department', values='Age', aggfunc='max')

Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Human Resources,59
Research & Development,60
Sales,60


In [None]:
# Pivot table of the minimum age per department

pd.pivot_table(data, index='Department', values='Age', aggfunc='min')

Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Human Resources,19
Research & Development,18
Sales,18


In [None]:
# Pivot table of the median age per department

pd.pivot_table(data, index='Department', values='Age', aggfunc='median')

Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Human Resources,37
Research & Development,36
Sales,35


In [None]:
# Pivot table with a two-level row index (Department, EducationField):
# maximum MonthlyRate and DailyRate within each combination.

pd.pivot_table(
    data,
    index=['Department', 'EducationField'],
    values=['MonthlyRate', 'DailyRate'],
    aggfunc='max',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,DailyRate,MonthlyRate
Department,EducationField,Unnamed: 2_level_1,Unnamed: 3_level_1
Human Resources,Human Resources,1420,25811
Human Resources,Life Sciences,1383,26894
Human Resources,Medical,1398,25657
Human Resources,Other,1444,12832
Human Resources,Technical Degree,1107,24017
Research & Development,Life Sciences,1490,26968
Research & Development,Medical,1495,26999
Research & Development,Other,1474,26537
Research & Development,Technical Degree,1496,26849
Sales,Life Sciences,1498,26204


### Differences between pivot table and groupby function

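* Both can produce the same results.
* But when we have to show aggregations across two dimensions (one key on the rows and another on the columns), ```pivot_table``` is more convenient because it reshapes the result in a single step, whereas ```groupby``` returns the groups stacked in a hierarchical row index.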

### Crosstab Function

* Compute a simple cross tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed

In [None]:
# Let's check the documentation of pd.crosstab using the built-in help()

help(pd.crosstab)

Help on function crosstab in module pandas.core.reshape.pivot:

crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False)
    Compute a simple cross tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed.
    
    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows.
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns.
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    rownames : sequence, default None
        If passed, must match number of row arrays passed.
    colnames : sequence, default None
        If passed, must match number of column arrays passed.
    aggfunc : function, optio

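The general syntax is ```pd.crosstab(index, columns, values=None, aggfunc=None, ...)```.
* Unlike ```pivot_table```, ```crosstab``` does not take a dataframe; it takes the arrays/Series themselves (e.g. ```data['Department']```)
* ```index``` contains the values to group by in the rows
* ```columns``` contains the values to group by in the columns
* ```values``` is an optional array of values to aggregate (requires ```aggfunc```)
* ```aggfunc``` is the aggregate function; when it is omitted, ```crosstab``` computes a frequency table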

Let's see some examples.

In [None]:
# Let's make a simple crosstab: the number of employees for every
# (Department, EducationField) pair. pd.crosstab already returns a DataFrame.

x = pd.crosstab(index=data['Department'], columns=data['EducationField'])
x

EducationField,Human Resources,Life Sciences,Marketing,Medical,Other,Technical Degree
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Human Resources,27,16,0,13,3,4
Research & Development,0,440,0,363,64,94
Sales,0,150,159,88,15,34


## Indexing, Selection, & Filtering in Series

In [3]:
# Integer Series 0..4 labelled "a" through "e".
obj = pd.Series(np.arange(5), index=list("abcde"))

In [None]:
# Display the whole Series
obj

In [None]:
# Look up a single element by its index label
obj["c"]

In [None]:
# Look up a single element by position. Use .iloc: on a string-labelled Series,
# obj[2] relies on the deprecated fallback that treats integer keys as positions.
obj.iloc[2]

In [None]:
# Integer slice: the first three elements (the end position is excluded)
obj[0:3]

In [None]:
# Select several elements at once by passing a list of index labels
obj[["a","c"]]

In [None]:
# Select the first and third elements by position. Use .iloc: on a string-labelled
# Series, integer keys inside [] rely on the deprecated positional fallback.
obj.iloc[[0, 2]]

In [None]:
# Boolean filtering: keep only the elements whose value is less than 2
obj[obj<2]

In [None]:
# Label slice: unlike an integer slice, the end label ("c") is included
obj["a":"c"]

In [None]:
# Overwrite every element from label "b" through "c" (both ends included) with 5.
# This modifies obj in place, so the cells above show different values if re-run.
obj.loc["b":"c"] = 5
obj

-----------------------------------------------------------------------------------------------------------------------------------------------------

## DataFrame Indexing