# Pandas

---

# Table of Contents

### 1. Basic Usage

- [index merge](#index_merge)
- [row by row append](#row_append)

### 2. Wrangling

- [remove redundancy](#rem_red)
- [values by group](#group_values)

### 3. String

- [integer to 2-digit string](#2digit)

### 4. Correlation

- [corr](#corr)

<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>

---

In [3]:
import pandas as pd

<a id="index_merge"></a>

# Pandas Index

In [2]:
# Load pivot table A (tab-separated) using the `lot_wf` column as the row index.
df1 = pd.read_table(r'C:\Codes\Snippets\Correlator\pivot_a.txt', index_col=['lot_wf'])

df1

Unnamed: 0_level_0,kz000000,kz000001
lot_wf,Unnamed: 1_level_1,Unnamed: 2_level_1
0X1,1,4
0X2,2,3
0X3,3,2
0X4,4,1


In [3]:
# Load pivot table B the same way as df1 so that both frames share the
# `lot_wf` index used by the join/merge/concat examples.
df2 = pd.read_table(
    r'C:\Codes\Snippets\Correlator\pivot_b.txt',
    index_col=['lot_wf'],
)

df2

Unnamed: 0_level_0,stepa,stepb
lot_wf,Unnamed: 1_level_1,Unnamed: 2_level_1
0X1,a,a
0X2,a,b
0X3,a,a
0X4,b,a


In [4]:
df1.columns

Index(['kz000000', 'kz000001'], dtype='object')

In [5]:
df2.columns

Index(['stepa', 'stepb'], dtype='object')

## merge two dataframes by index

Reference: https://stackoverflow.com/questions/40468069/python-pandas-merge-two-dataframes-by-index

### 1st method : use join

In [6]:
# DataFrame.join aligns the two frames on their index by default,
# so no key column has to be named.
res = df1.join(df2)

res

Unnamed: 0_level_0,kz000000,kz000001,stepa,stepb
lot_wf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0X1,1,4,a,a
0X2,2,3,a,b
0X3,3,2,a,a
0X4,4,1,b,a


### 2nd method : use merge

In [7]:
# left_index/right_index make merge use both frames' indexes as the join keys.
res2 = pd.merge(df1, df2, left_index=True, right_index=True)

res2

Unnamed: 0_level_0,kz000000,kz000001,stepa,stepb
lot_wf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0X1,1,4,a,a
0X2,2,3,a,b
0X3,3,2,a,a
0X4,4,1,b,a


### 3rd method : use concat

In [8]:
# concat stacks along rows (axis=0) by default; ignore_index=True replaces the
# lot_wf labels with a fresh 0..n-1 index, so this does NOT merge by index
# (each frame's columns are filled with NaN for the other frame's rows).
res3 = pd.concat([df1, df2], ignore_index=True)

res3

Unnamed: 0,kz000000,kz000001,stepa,stepb
0,1.0,4.0,,
1,2.0,3.0,,
2,3.0,2.0,,
3,4.0,1.0,,
4,,,a,a
5,,,a,b
6,,,a,a
7,,,b,a


In [9]:
# axis=1 places the frames side by side, aligning rows on the shared index.
res4 = pd.concat([df1, df2], axis=1)

res4

Unnamed: 0_level_0,kz000000,kz000001,stepa,stepb
lot_wf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0X1,1,4,a,a
0X2,2,3,a,b
0X3,3,2,a,a
0X4,4,1,b,a


## df indexes to list

In [12]:
df1.index.tolist()

['0X1', '0X2', '0X3', '0X4']

---

<a id="2digit"></a>

# Integer to 2-digit string

In [9]:
# Sample frame: string lot ids plus one integer column and one float column.
df = pd.DataFrame(dict(
    lot=['0XX', '155', '165'],
    A=[1, 2, 3],
    B=[1.0, 2.0, 3.0],
))

## sample df

In [11]:
df

Unnamed: 0,A,B,lot
0,1,1.0,0XX
1,2,2.0,155
2,3,3.0,165


### Convert integer to 2 digit String

In [12]:
# "{:02}" zero-pads each integer to a minimum width of 2 and yields strings
# (1 -> '01'); values that are already 2+ digits wide are left as they are.
df['A2'] = df['A'].map("{:02}".format)

In [13]:
df

Unnamed: 0,A,B,lot,A2
0,1,1.0,0XX,1
1,2,2.0,155,2
2,3,3.0,165,3


### Convert float to 2 digit String

In [24]:
# Convert the floats to int before formatting: applying "{:02}" to the float
# 1.0 directly would give '1.0' instead of '01'.
df['B2'] = df['B'].astype(int).map("{:02}".format)

In [25]:
df

Unnamed: 0,A,B,lot,A2,B2,lot_wf
0,1,1.0,0XX,1,1,0XX_01
1,2,2.0,155,2,2,155_02
2,3,3.0,165,3,3,165_03


In [26]:
# Build a "<lot>_<zero-padded A>" key by element-wise string concatenation.
df['lot_wf'] = df['lot'] + "_" + df['A'].map("{:02}".format)

In [27]:
df

Unnamed: 0,A,B,lot,A2,B2,lot_wf
0,1,1.0,0XX,1,1,0XX_01
1,2,2.0,155,2,2,155_02
2,3,3.0,165,3,3,165_03


---

<a id="row_append"></a>
# Append df row by row

In [2]:
# First frame for the row-append examples.
df = pd.DataFrame(dict(A=[1, 2], B=[3, 4]))

In [3]:
df

Unnamed: 0,A,B
0,1,3
1,2,4


In [4]:
# Second frame with the same columns, to be appended below the first.
df2 = pd.DataFrame(dict(A=[100, 200], B=[300, 400]))

In [5]:
df2

Unnamed: 0,A,B
0,100,300
1,200,400


### method1: append two df using append (`DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement)

In [6]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat produces the same result as df.append(df2): the rows of df2 are
# stacked under df and each frame keeps its own row labels (0, 1, 0, 1).
pd.concat([df, df2])

Unnamed: 0,A,B
0,1,3
1,2,4
0,100,300
1,200,400


In [7]:
# Replacement for the removed df.append(df2, ignore_index=True):
# ignore_index=True renumbers the combined rows 0..n-1.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,A,B
0,1,3
1,2,4
2,100,300
3,200,400


### method2: append two df using concat

In [8]:
# ignore_index=True renumbers the stacked rows 0..n-1 instead of repeating 0, 1.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,A,B
0,1,3
1,2,4
2,100,300
3,200,400


<br>

### merge with different dtypes

In [9]:
# Left frame: the merge key `wf` holds integers.
sample1 = pd.DataFrame(dict(wf=[1, 2, 3]))

In [10]:
# Right frame: the same key values stored as floats, plus a payload column.
sample2 = pd.DataFrame(dict(
    wf=[1.0, 2.0, 3.0],
    data=['col1', 'col2', 'col3'],
))

### integer & float types are compatible

In [11]:
# Left merge on `wf`: the integer keys of sample1 match the float keys of
# sample2 by value, so every row picks up its `data` entry.
res = pd.merge(sample1, sample2, on=['wf'], how='left')

In [12]:
res

Unnamed: 0,wf,data
0,1,col1
1,2,col2
2,3,col3


## replace a single value using set_value (removed in pandas 1.0; `.at` is the replacement)

In [13]:
# DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
# the label-based scalar accessor .at is its replacement.
# Cast the numeric column to object first so it can hold a string without
# relying on implicit upcasting.
res['wf'] = res['wf'].astype(object)
res.at[0, 'wf'] = 'replaced'

res

Unnamed: 0,wf,data
0,replaced,col1
1,2,col2
2,3,col3


<a id="rem_red"></a>

---

# Remove Redundancy

In [2]:
# Load the table that contains the redundant rows. It must be bound to `df`,
# because the following cells display and merge `df`.
df = pd.read_table(
    r'C:\Codes\Snippets\Correlator\redundancy.txt',
    index_col=['lot_wf'],
)

In [3]:
df

Unnamed: 0_level_0,step,CHB,ppid
lot_wf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,eSiGe 성장,3,A
A,eSiGe 성장,5,A
B,eSD 성장,3,A
B,eSD 성장,7,A


### load the (step, CHB) combinations to exclude

In [6]:
# Read the exclusion list and tag every row with an 'O' marker column, so that
# rows matched by a later merge can be recognised.
exc = pd.read_table(r'C:\Codes\Snippets\Correlator\exc_chb.txt').assign(exclude='O')

In [7]:
exc

Unnamed: 0,step,CHB,exclude
0,eSiGe 성장,3,O
1,eSiGe 성장,4,O
2,eSD 성장,3,O
3,eSD 성장,4,O


In [8]:
# Left merge keeps every row of df; rows whose (step, CHB) pair is not listed
# in exc get NaN in the `exclude` column.
df = pd.merge(df, exc, on=['step','CHB'], how='left')

In [9]:
df

Unnamed: 0,step,CHB,ppid,exclude
0,eSiGe 성장,3,A,O
1,eSiGe 성장,5,A,
2,eSD 성장,3,A,O
3,eSD 성장,7,A,


In [12]:
# Keep only the rows that were not flagged for exclusion, then drop the
# helper column.
df = df[df['exclude'].isnull()].drop('exclude', axis=1)

In [13]:
df

Unnamed: 0,step,CHB,ppid
1,eSiGe 성장,5,A
3,eSD 성장,7,A


<a id="corr"></a>

# Correlation

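## pandas corr vs scipy linregress: why the numbers differ

The two results below are consistent: `df.corr()` returns the correlation coefficient r, while the scipy cell stores the squared value r² (for example, 0.212289² ≈ 0.045067 for item1 vs item2).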

In [4]:
# Load the tab-separated sample data used for the correlation comparison.
df = pd.read_table(r'C:\Codes\Snippets\python\pandas\corr.txt')

In [6]:
df.head()

Unnamed: 0,item1,item2,item3,item4,item5,item6
0,0.381561,0.627673,0.572998,0.41818,0.397462,0.874766
1,0.329634,0.024759,0.705531,0.470418,0.90078,0.998611
2,0.275587,0.695989,0.017921,0.753645,0.386568,0.400279
3,0.644938,0.387296,0.82824,0.832683,0.759714,0.663033
4,0.612575,0.142906,0.322322,0.242441,0.355048,0.749029


### using pandas corr

In [16]:
# Pairwise Pearson correlation coefficient (r) between all columns of df.
res = df.corr(method='pearson')

In [17]:
res

Unnamed: 0,item1,item2,item3,item4,item5,item6
item1,1.0,0.212289,-0.000168,-0.140716,-0.039237,0.128532
item2,0.212289,1.0,0.013734,0.255733,0.032732,0.061241
item3,-0.000168,0.013734,1.0,0.14551,0.253018,0.109234
item4,-0.140716,0.255733,0.14551,1.0,0.076706,0.045498
item5,-0.039237,0.032732,0.253018,0.076706,1.0,0.221628
item6,0.128532,0.061241,0.109234,0.045498,0.221628,1.0


### using scipy linregress

In [20]:
import scipy as sci

In [22]:
# Reference column, and the columns it will be regressed against.
item = 'item1'
cor_item = ['item{}'.format(idx) for idx in range(2, 7)]

In [34]:
# A bare `import scipy` is not guaranteed to load the `stats` submodule,
# so import it explicitly before use.
from scipy import stats

# linregress reports Pearson's r as `rvalue`; squaring it gives r**2, which is
# why these numbers are the squares of the `item1` row of df.corr().
# A comprehension is used so that no loop variable overwrites the `res`
# correlation matrix computed earlier.
result = [
    stats.linregress(df[item], df[col]).rvalue ** 2
    for col in cor_item
]

In [35]:
result

[0.045066722188776415,
 2.82380836331879e-08,
 0.01980096446192698,
 0.001539570571274407,
 0.016520392862579069]

---

<a id="group_values"></a>

# values by group

In [71]:
# Ten rows: group 'a' owns the values 0-5 and group 'b' owns the values 6-9.
df = pd.DataFrame(dict(
    A=['a'] * 6 + ['b'] * 4,
    B=list(range(10)),
))

In [72]:
df

Unnamed: 0,A,B
0,a,0
1,a,1
2,a,2
3,a,3
4,a,4
5,a,5
6,b,6
7,b,7
8,b,8
9,b,9


### function that splits the values of one column into one array per group

In [88]:
def separate_value_by_item(df, item='A', value='B'):
    """Split one column into per-group arrays.

    Groups ``df`` by the ``item`` column and returns a list that holds, for
    each group in groupby iteration order, the entries of the ``value``
    column as an array (the ``.values`` of that group's Series).
    """
    grouped = df.groupby([item])[value]

    # The group key is not needed, only the values belonging to each group.
    return [series.values for _, series in grouped]

In [89]:
res = separate_value_by_item(df, item='A', value='B')

In [90]:
res

[array([0, 1, 2, 3, 4, 5], dtype=int64), array([6, 7, 8, 9], dtype=int64)]

---