# Pandas: Data Transformation

## 1. Miscellaneous techniques

### 1.1. Mapping
Map is the generalization of function.

In [None]:
from util import np, pd
from util._data import df_student

In [1]:
df_student

Unnamed: 0,student_id,gender
0,10001,Male
1,30001,Female
2,70001,Female
3,80001,Female
4,110001,Male
5,120001,Female


#### Mapping with a dictionary

In [3]:
gender_map = {'Female': 0, 'Male': 1}
df_student['gender'].map(gender_map)

0    1
1    0
2    0
3    0
4    1
5    0
Name: gender, dtype: int64

#### Mapping with a function

In [4]:
df_student['grade'] = df_student['student_id'].map(lambda x: int(x[:2]))
df_student

Unnamed: 0,student_id,gender,grade
0,10001,Male,1
1,30001,Female,3
2,70001,Female,7
3,80001,Female,8
4,110001,Male,11
5,120001,Female,12


In [5]:
def stage_map(grade):
    if grade in range(1, 6):
        stage = 'Primary'
    if grade in range(6, 10):
        stage = 'Secondary'
    if grade in range(10, 13):
        stage = 'High'
    return stage

df_student['stage'] = df_student['grade'].map(stage_map)
df_student

Unnamed: 0,student_id,gender,grade,stage
0,10001,Male,1,Primary
1,30001,Female,3,Primary
2,70001,Female,7,Secondary
3,80001,Female,8,Secondary
4,110001,Male,11,High
5,120001,Female,12,High


### 1.2. Window functions

In [None]:
from util import np, pd

In [19]:
columns = ['manufacturer', 'model', 'type', 'price', ]
df_car = pd.read_csv('../data/cars.csv', usecols=columns)
df_car.head()

Unnamed: 0,manufacturer,model,type,price
0,Chevrolet,Cavalier,Compact,13.4
1,Chevrolet,Corsica,Compact,11.4
2,Chevrolet,Camaro,Sporty,15.1
3,Chevrolet,Lumina,Midsize,15.9
4,Chevrolet,Lumina_APV,Van,16.3


In [20]:
(
    df_car
    .select_columns('manufacturer', 'price')
    .assign(rank=df_car.groupby('manufacturer')['price'].rank().astype(int))
    .assign(dense_rank=df_car.groupby('manufacturer')['price'].rank(method='dense').astype(int))
    .assign(row_number=df_car.groupby('manufacturer')['price'].rank(method='first').astype(int))
    .sort_values(['manufacturer', 'price'])
    .query("manufacturer=='Hyundai'")
)

Unnamed: 0,manufacturer,price,rank,dense_rank,row_number
21,Hyundai,8.0,1,1,1
24,Hyundai,10.0,2,2,2
25,Hyundai,10.0,2,2,3
26,Hyundai,13.9,4,3,4
23,Hyundai,16.1,5,4,5
22,Hyundai,18.8,6,5,6


## 2. Pivot table

### 2.1. Wide and long form

#### Wide form table
Wide form table divides a variable and places each part in a column. Therefore, it allows displaying more data, and is convenient for keeping tract of exactly one index (sales or profit, for example). However, wide form supports not very well storing two variables ore more.

Color|2000 Q1|2000 Q2|2000 Q3|2000 Q4|
:----|------:|------:|------:|------:|
Red  |\$ 1000|\$ 1200|\$ 1500|\$ 1700|
Green|\$ 1500|\$ 1500|\$ 1575|\$ 1800|
Blue |\$ 2000|\$ 2200|\$ 2000|\$ 2800|

#### Long form table
Long form stores each variable in only one column, enables unlimited number of features. In data analysis, long form is considered tidy data and is used as standard tabular data format.

Color|Quarter|Sales   |Quantity|Price|
:----|:------|-------:|-------:|----:|
Red  |2000 Q1|\$ 1000 |50      |\$ 20|
Green|2000 Q1|\$ 1500 |50      |\$ 30|
Blue |2000 Q1|\$ 2000 |40      |\$ 50|
Red  |2000 Q2|\$ 1200 |60      |\$ 20|
Green|2000 Q2|\$ 1500 |50      |\$ 30|
Blue |2000 Q2|\$ 2200 |40      |\$ 55|
Red  |2000 Q3|\$ 1500 |75      |\$ 20|
Green|2000 Q3|\$ 1575 |45      |\$ 35|
Blue |2000 Q3|\$ 2000 |40      |\$ 50|
Red  |2000 Q4|\$ 1700 |85      |\$ 20|
Green|2000 Q4|\$ 1800 |20      |\$ 60|
Blue |2000 Q4|\$ 2800 |70      |\$ 40|

### 2.2. Unpivoting
Unpivoting is the process of transforming a table from wide form to long form. This technique is very useful in tidying up messy data.

In [3]:
from util import np, pd
from util._data import df_sales

In [4]:
df_sales

Unnamed: 0,color,2000 Q1,2000 Q2,2000 Q3,2000 Q4
0,red,1000,1200,1500,1700
1,green,1500,1500,1575,1800
2,blue,2000,2200,2000,2800


In [4]:
df_sales.melt(id_vars='color', var_name='quarter', value_name='sales')

Unnamed: 0,color,quarter,sales
0,red,2000 Q1,1000
1,green,2000 Q1,1500
2,blue,2000 Q1,2000
3,red,2000 Q2,1200
4,green,2000 Q2,1500
5,blue,2000 Q2,2200
6,red,2000 Q3,1500
7,green,2000 Q3,1575
8,blue,2000 Q3,2000
9,red,2000 Q4,1700


### 2.3. Pivot table
Pivoting is the process of transforming a table from long form to wide form. Notice that the table in this case is already tidy, so pivot table mainly works as a tool to summarise data.

In [5]:
from util import np, pd
from util._data import df_long

In [6]:
df_long

Unnamed: 0,Market,Color,Size,Price,Sales
0,Asian,Red,Large,17,68000
1,Asian,Red,Small,11,44000
2,Asian,Blue,Large,19,57000
3,Asian,Blue,Small,13,52000
4,Europe,Red,Large,18,81000
5,Europe,Red,Small,12,72000
6,Europe,Blue,Large,20,90000
7,Europe,Blue,Small,14,77000


#### Basic pivoting
Pivoting usually returns duplicated values, handling them requires an aggregate function. The default function used in the `pivot_table()` method is `np.mean()`. In this example, using `np.sum()` for `Sales` and `np.mean` for `Price` makes sense.

In [20]:
(
    df_long
    .pivot_table(
        index=['Market', 'Color'],
        columns='Size',
        values='Sales',
        aggfunc=np.sum)
    .reset_index()
    .rename_axis(None, axis=1)
)

Unnamed: 0,Market,Color,Large,Small
0,Asian,Blue,57000,52000
1,Asian,Red,68000,44000
2,Europe,Blue,90000,77000
3,Europe,Red,81000,72000


In [21]:
(
    df_long
    .pivot_table(
        index=['Market', 'Color'],
        columns='Size',
        values='Price',
        aggfunc=np.mean)
    .reset_index()
    .rename_axis(None, axis=1)
)

Unnamed: 0,Market,Color,Large,Small
0,Asian,Blue,19,13
1,Asian,Red,17,11
2,Europe,Blue,20,14
3,Europe,Red,18,12


#### Multivariate pivoting
The more advanced the technique, the more complicated the tools are.

In [22]:
df_wide = df_long.pivot_table(
    index='Market',
    columns='Color',
    aggfunc={'Price': np.mean, 'Sales': np.sum}
)
df_wide

Unnamed: 0_level_0,Price,Price,Sales,Sales
Color,Blue,Red,Blue,Red
Market,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Asian,16,14,109000,112000
Europe,17,15,167000,153000


&#9800;&nbsp;<b>Note</b><br>
By passing a dictionary to the `aggfunc` parameter, the columns to be used as values are specified. Therefore, the `values` parameter can be skipped.

In [23]:
df_wide.columns = df_wide.columns.values
df_wide = df_wide.reset_index()
df_wide

Unnamed: 0,Market,"(Price, Blue)","(Price, Red)","(Sales, Blue)","(Sales, Red)"
0,Asian,16,14,109000,112000
1,Europe,17,15,167000,153000


#### Pivoting vs. grouping
Skipping the `columns` parameter ends up returning the same result as the `groupby()` method.

In [24]:
(
    df_long
    .pivot_table(
        index=['Market', 'Color'],
        aggfunc={'Price': np.mean, 'Sales': np.sum})
    .reset_index()
)

Unnamed: 0,Market,Color,Price,Sales
0,Asian,Blue,16,109000
1,Asian,Red,14,112000
2,Europe,Blue,17,167000
3,Europe,Red,15,153000


In [25]:
(
    df_long
    .groupby(['Market', 'Color'])
    .agg({'Price': np.mean, 'Sales': np.sum})
    .reset_index()
)

Unnamed: 0,Market,Color,Price,Sales
0,Asian,Blue,16,109000
1,Asian,Red,14,112000
2,Europe,Blue,17,167000
3,Europe,Red,15,153000


## 3. Combining datasets

### 3.1. Concatenation
The function
<code style="font-size:13px"><a href="https://pandas.pydata.org/docs/reference/api/pandas.concat.html">pd.concat()</a></code>
is equivalent to
<code style="font-size:13px">UNION ALL</code>
statement in SQL, which combines data vertically based on matched columns. Pandas matches columns using column names rather than order.

In [3]:
import util
from util import np, pd

In [10]:
db = pd.read_excel('../data/world_population.xlsx', sheet_name=None)
db.keys()

dict_keys(['1960s', '1970s', '1980s', '1990s', '2000s', '2010s'])

In [11]:
pd.concat(db.values())

Unnamed: 0,year,country,population
0,1960,Afghanistan,8996351.0
1,1961,Afghanistan,9166764.0
2,1962,Afghanistan,9345868.0
3,1963,Afghanistan,9533954.0
4,1964,Afghanistan,9731361.0
...,...,...,...
1509,2012,Zimbabwe,14710826.0
1510,2013,Zimbabwe,15054506.0
1511,2014,Zimbabwe,15411675.0
1512,2015,Zimbabwe,15777451.0


### 3.2. Merging
The method 
<code style="font-size:13px"><a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html">merge()</a></code>
is equivalent to the
<code style="font-size:13px">JOIN</code>
statement in SQL, which combines columns based on matched rows. Row matching is performed on *key columns*, and can be one of four types: *left*, *right*, *inner* and *outer*.

In [13]:
from util import np, pd
from util._data import df_income, df_tax

In [14]:
df_income

Unnamed: 0,name,income_before_tax,tax_band
0,Hannah,12000,Allowance
1,James,30000,Basic
2,Gabriel,7000,Allowance
3,Smith,20000,Basic
4,Alex,100000,Higher


In [15]:
df_tax

Unnamed: 0,band,income_range,tax_rate
0,Allowance,"Up to 12,500",0.0
1,Basic,"12,501 to 50,000",0.2
2,Higher,"50,001 to 150,000",0.4
3,Additional,"Over 150,000",0.45


In [16]:
df_income.rename(columns={'tax_band': 'band'}).merge(df_tax, how='left')

Unnamed: 0,name,income_before_tax,band,income_range,tax_rate
0,Hannah,12000,Allowance,"Up to 12,500",0.0
1,James,30000,Basic,"12,501 to 50,000",0.2
2,Gabriel,7000,Allowance,"Up to 12,500",0.0
3,Smith,20000,Basic,"12,501 to 50,000",0.2
4,Alex,100000,Higher,"50,001 to 150,000",0.4
