# Pandas
## Python Data Analysis Library

# Table of Contents

### 1. Trouble Shooting

- [Convert df to dictionary](#to_dict)
- [null value when pivoting](#null_value)
- [dtypes on concat](#dtype_concat)
- [keep dataframe class after aggregation](#keep_dtype_agg)

<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>

In [1]:
import pandas as pd

from pprint import pprint

<a id="to_dict"></a>
<br>
# Convert df to dictionary

In [2]:
df = pd.DataFrame({
    'A':['a','b','c'],
    'B':[1,2,3],
})

In [3]:
df

Unnamed: 0,A,B
0,a,1
1,b,2
2,c,3


In [4]:
_dic = df.to_dict(orient='list')

In [5]:
pprint(_dic)

{'A': ['a', 'b', 'c'], 'B': [1, 2, 3]}


## make df from dictionary

In [6]:
_df = pd.DataFrame.from_dict(_dic)

In [7]:
_df

Unnamed: 0,A,B
0,a,1
1,b,2
2,c,3


# options on to_dict orient

## orient = dict

In [8]:
conv = df.to_dict(orient='dict')
conv

{'A': {0: 'a', 1: 'b', 2: 'c'}, 'B': {0: 1, 1: 2, 2: 3}}

## orient = list

In [9]:
conv = df.to_dict(orient='list')
conv

{'A': ['a', 'b', 'c'], 'B': [1, 2, 3]}

## orient = series

In [10]:
conv = df.to_dict(orient='series')
conv

{'A': 0    a
 1    b
 2    c
 Name: A, dtype: object, 'B': 0    1
 1    2
 2    3
 Name: B, dtype: int64}

## orient = split

In [11]:
conv = df.to_dict(orient='split')
conv

{'columns': ['A', 'B'],
 'data': [['a', 1], ['b', 2], ['c', 3]],
 'index': [0, 1, 2]}

## orient = records

In [12]:
conv = df.to_dict(orient='records')
conv

[{'A': 'a', 'B': 1}, {'A': 'b', 'B': 2}, {'A': 'c', 'B': 3}]

## orient = index

In [13]:
conv = df.to_dict(orient='index')
conv

{0: {'A': 'a', 'B': 1}, 1: {'A': 'b', 'B': 2}, 2: {'A': 'c', 'B': 3}}

<a id="dtype_concat"></a>
<br>
# dtypes when concat

In [14]:
df1 = pd.DataFrame(
    {
        'A':[1],
        'B':[1],
    }
)

In [15]:
df2 = pd.DataFrame(
    {
        'A':[2],
        'B':['a'],
    }
)

In [16]:
df_conc = pd.concat([df1, df2])

<h2 style="color:blue;">concat is performed by it's column names not consider columns's dtype</h2>
<h2 style="color:blue;">dtype of columns set properly</h2>

In [17]:
df_conc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 0
Data columns (total 2 columns):
A    2 non-null int64
B    2 non-null object
dtypes: int64(1), object(1)
memory usage: 48.0+ bytes


In [18]:
df_conc

Unnamed: 0,A,B
0,1,1
0,2,a


<a id="null_value"></a>
<br>
# null value when pivot

## pivot operation

In [19]:
import numpy as np

In [20]:
df = pd.DataFrame(
    {
    'Index1':['A','A','B','B','C','C'],
    'Index2':['a','b','c','d','e','f'],
    'Columns':['a','a','a','b','b','b'],
    'value':[1,2,3,4,5,6],
    }
)

In [21]:
df

Unnamed: 0,Columns,Index1,Index2,value
0,a,A,a,1
1,a,A,b,2
2,a,B,c,3
3,b,B,d,4
4,b,C,e,5
5,b,C,f,6


In [22]:
pivot = pd.pivot_table(
    df,
    index=['Index1','Index2'],
    columns=['Columns'],
    values='value',
    aggfunc=np.mean,
)

In [23]:
pivot

Unnamed: 0_level_0,Columns,a,b
Index1,Index2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,a,1.0,
A,b,2.0,
B,c,3.0,
B,d,,4.0
C,e,,5.0
C,f,,6.0


## missing value in pivot @ Index
<h3 style="color:red;">index null value data missed after pivot</h3>

In [24]:
df = pd.DataFrame(
    {
    'Index1':['A','A','B','B','C','C'],
    'Index2':[np.nan,'b','c','d',np.nan,'f'],
    'Columns':['a','a','a','b','b','b'],
    'value':[1,2,3,4,5,6],
    }
)

In [25]:
df

Unnamed: 0,Columns,Index1,Index2,value
0,a,A,,1
1,a,A,b,2
2,a,B,c,3
3,b,B,d,4
4,b,C,,5
5,b,C,f,6


In [26]:
pivot = pd.pivot_table(
    df,
    index=['Index1','Index2'],
    columns=['Columns'],
    values='value',
    aggfunc=np.mean,
)

<h2 style="color:red;">result: missing index values removed</h2>

In [27]:
pivot

Unnamed: 0_level_0,Columns,a,b
Index1,Index2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,b,2.0,
B,c,3.0,
B,d,,4.0
C,f,,6.0


<br>
## missing value in pivot @ columns
<h3 style="color:red;">column null value data missed after pivot</h3>

In [28]:
df = pd.DataFrame(
    {
    'Index1':['A','A','B','B','C','C'],
    'Index2':['a','b',np.nan,np.nan,'e','f'],
    'Columns':['a','a','a','b','b','b'],
    'value':[1,2,3,4,5,6],
    }
)

In [29]:
df

Unnamed: 0,Columns,Index1,Index2,value
0,a,A,a,1
1,a,A,b,2
2,a,B,,3
3,b,B,,4
4,b,C,e,5
5,b,C,f,6


In [30]:
pivot = pd.pivot_table(
    df,
    index=['Index1','Index2'],
    columns=['Columns'],
    values='value',
    aggfunc=np.mean,
)

<h2 style="color:red;">result: missing column values removed</h2>

In [31]:
pivot

Unnamed: 0_level_0,Columns,a,b
Index1,Index2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,a,1.0,
A,b,2.0,
C,e,,5.0
C,f,,6.0


<br>
## missing value in pivot @ values

In [32]:
df = pd.DataFrame(
    {
    'Index1':['A','A','B','B','C','C'],
    'Index2':['a','b','c','d','e','f'],
    'Columns':['a','a','a','b','b','b'],
    'value':[1,np.nan,3,4,np.nan,6],
    }
)

In [33]:
df

Unnamed: 0,Columns,Index1,Index2,value
0,a,A,a,1.0
1,a,A,b,
2,a,B,c,3.0
3,b,B,d,4.0
4,b,C,e,
5,b,C,f,6.0


In [34]:
pivot = pd.pivot_table(
    df,
    index=['Index1','Index2'],
    columns=['Columns'],
    values='value',
    aggfunc=np.mean,
)

<h2 style="color:blue;">result: missing values remain in pivoted df</h2>

In [35]:
pivot

Unnamed: 0_level_0,Columns,a,b
Index1,Index2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,a,1.0,
A,b,,
B,c,3.0,
B,d,,4.0
C,e,,
C,f,,6.0


<a id="keep_dtype_agg"></a>
<br>
# keep dataframe class after aggregation

In [36]:
df = pd.read_table('pivot_sample.txt')

In [37]:
df

Unnamed: 0,LOT,WF,ITEM,VALUE
0,A,1,Vtsat,0.901
1,A,2,Vtsat,
2,A,1,Idsat,0.353807
3,A,2,Idsat,0.934208
4,B,1,Vtsat,0.165206
5,B,2,Vtsat,0.713711
6,B,1,Idsat,0.241653
7,B,2,Idsat,0.547055


<h2 style="color:blue;">brackets in VALUE column makes result as df</h2>

In [38]:
agg = df.groupby(['LOT','WF','ITEM'])[['VALUE']].agg(lambda x: x.mean())

In [39]:
agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,VALUE
LOT,WF,ITEM,Unnamed: 3_level_1
A,1,Idsat,0.353807
A,1,Vtsat,0.901
A,2,Idsat,0.934208
A,2,Vtsat,
B,1,Idsat,0.241653
B,1,Vtsat,0.165206
B,2,Idsat,0.547055
B,2,Vtsat,0.713711
