# Data Transformation in Pandas

In [1]:
import pandas as pd

In [2]:
# Six-row demo frame; rows 4 and 5 repeat the (a, b) pairs of rows 2 and 3.
data = pd.DataFrame(
    {
        "a": ["one", "two", "one", "two", "one", "two"],
        "b": [1, 1, 2, 3, 2, 3],
    }
)
data

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,2
5,two,3


In [3]:
# Boolean mask: True for a row that repeats an earlier row; the first
# occurrence of each row stays False.
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4     True
5     True
dtype: bool

In [4]:
# Return a copy without repeated rows, keeping the first occurrence of each;
# `data` itself is left as it was.
data.drop_duplicates(keep="first")

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3


In [5]:
# Add a column "c" holding 0..5; with it, every row of `data` is distinct.
data["c"] = list(range(6))
data

Unnamed: 0,a,b,c
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,2,4
5,two,3,5


In [6]:
# One (name, score) record per row; the names deliberately mix upper- and
# lower-case first letters.
df = pd.DataFrame(
    [("Tim", 60), ("tom", 50), ("Sam", 70), ("kate", 80), ("Kim", 40)],
    columns=["names", "scores"],
)
df

Unnamed: 0,names,scores
0,Tim,60
1,tom,50
2,Sam,70
3,kate,80
4,Kim,40


In [7]:
# Lookup table: capitalised name -> class label.
classes = {
    "Tim": "A",
    "Tom": "A",
    "Sam": "B",
    "Kate": "B",
    "Kim": "B",
}

In [8]:
# Normalise capitalisation (first letter upper-case, the rest lower-case) so
# the names line up with the keys of `classes`.
n = df.loc[:, "names"].str.capitalize()

In [9]:
# Look up each capitalised name from `n` in `classes` and store the result as
# a new "branches" column on `df`.
df["branches"]=n.map(classes)

In [10]:
# Display the frame, which now includes the "branches" column.
df

Unnamed: 0,names,scores,branches
0,Tim,60,A
1,tom,50,A
2,Sam,70,B
3,kate,80,B
4,Kim,40,B


In [11]:
# Small integer Series used for the value-replacement examples.
s = pd.Series(data=[80, 70, 90, 60])
s

0    80
1    70
2    90
3    60
dtype: int64

In [12]:
import numpy as np

In [13]:
# Swap a single value for NaN; returns a new Series and leaves `s` untouched.
s.replace(to_replace=70, value=np.nan)

0    80.0
1     NaN
2    90.0
3    60.0
dtype: float64

In [14]:
# Pairwise replacement: 70 -> NaN and 60 -> 0 (lists are matched by position).
s.replace(to_replace=[70, 60], value=[np.nan, 0])

0    80.0
1     NaN
2    90.0
3     0.0
dtype: float64

In [15]:
# Same idea with a mapping: each key is replaced by its value.
s.replace(to_replace={90: 100, 60: 0})

0     80
1     70
2    100
3      0
dtype: int64

In [16]:
# 3x4 grid holding 0..11, with integer row labels and one column per name.
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    index=[0, 1, 2],
    columns=["tim", "tom", "kim", "sam"],
)
df

Unnamed: 0,tim,tom,kim,sam
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [17]:
# Row-label lookup: position 0/1/2 -> "one"/"two"/"three".
s = pd.Series(["one", "two", "three"])
# Relabel the rows of `df` through `s`. A label with no entry in `s` is kept
# as-is, so re-running this cell is a no-op; a plain `df.index.map(s)` would
# turn every (already renamed) label into NaN on its second run.
df.index = df.index.map(lambda label: s[label] if label in s.index else label)

In [18]:
# Display the frame to check the new row labels.
df

Unnamed: 0,tim,tom,kim,sam
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11


In [19]:
# Preview title-cased row labels and upper-cased column names. Each rename
# returns a new frame, so `df` itself is not modified.
df.rename(index=str.title).rename(columns=str.upper)

Unnamed: 0,TIM,TOM,KIM,SAM
One,0,1,2,3
Two,4,5,6,7
Three,8,9,10,11


In [20]:
# Relabel one row ("one" -> "ten") and one column ("sam" -> "kate"); labels
# absent from the mappings are left untouched. The result is assigned back
# rather than using inplace=True, so the frame object shown by earlier cells
# is not mutated behind the reader's back.
df = df.rename(
    index={"one": "ten"},
    columns={"sam": "kate"},
)
df

Unnamed: 0,tim,tom,kim,kate
ten,0,1,2,3
two,4,5,6,7
three,8,9,10,11


In [21]:
# 1000x4 sample from the standard normal distribution. A seeded generator is
# used so the sample is the same on every run; the unseeded global
# np.random.randn produced different numbers after each kernel restart.
data = pd.DataFrame(np.random.default_rng(0).standard_normal((1000, 4)))
data.head()

Unnamed: 0,0,1,2,3
0,-0.735611,0.347088,-0.471066,0.248023
1,-0.832226,0.545714,-0.526975,1.889453
2,-0.348026,0.189467,-1.19869,1.345937
3,1.253273,-0.276719,-1.028094,-0.290143
4,-0.304915,-0.582397,0.479874,0.549905


In [22]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.024163,0.000769,0.01339,0.066394
std,1.015275,1.00708,1.031401,0.981692
min,-2.893816,-3.132902,-3.662802,-3.531215
25%,-0.66347,-0.680194,-0.699242,-0.562057
50%,0.002324,-0.003217,0.019303,0.038113
75%,0.693417,0.684286,0.674377,0.740291
max,3.292166,3.600409,3.246751,2.940322


In [23]:
# Pull out the column labelled 1 as a Series.
col = data.loc[:, 1]

In [24]:
# Entries of `col` whose absolute value exceeds 3.
col.loc[np.abs(col) > 3]

545   -3.132902
552    3.600409
Name: 1, dtype: float64

In [25]:
# Keep every row that has at least one value with absolute value above 3.
# `axis` is passed by keyword: pandas 2.0 made the arguments of
# DataFrame.any keyword-only, so the positional form `.any(1)` raises
# TypeError there.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
171,3.107302,-0.81251,-1.349047,0.033291
545,-0.232725,-3.132902,-0.754403,1.858646
552,1.925085,3.600409,1.902793,0.724673
571,3.0976,0.701304,1.369306,1.928704
678,1.613494,-0.854156,1.081314,-3.531215
733,1.418406,0.493568,3.246751,-0.632983
759,-1.304689,-0.994812,-3.282398,-0.804788
785,3.292166,0.971666,-0.329208,-1.512833
805,3.111156,-0.418591,1.655902,0.291483
849,3.060269,0.201408,0.584705,-0.560731


## Dummy Variable

In [26]:
# Categorical "letter" column plus a running "number" column, used as input
# for the dummy-variable example.
data = pd.DataFrame(
    data={
        "letter": ["c", "b", "a", "b", "b", "a"],
        "number": range(6),
    }
)
data

Unnamed: 0,letter,number
0,c,0
1,b,1
2,a,2
3,b,3
4,b,4
5,a,5


In [27]:
# One indicator column per distinct letter. dtype=int pins the 0/1 integer
# output: since pandas 2.0 the default dummy dtype is bool, which would show
# True/False instead.
pd.get_dummies(data["letter"], dtype=int)

Unnamed: 0,a,b,c
0,0,0,1
1,0,1,0
2,1,0,0
3,0,1,0
4,0,1,0
5,1,0,0
