# Data Cleaning and Preparation Part_2
## Data Transformation
How To Handle Duplicate Rows and Values?

In [17]:
import pandas as pd
import numpy as np
# Small frame for the duplicate-handling examples: rows 5 and 6 are identical.
dup = pd.DataFrame(
    dict(
        key1=['one', 'three', 'one', 'three', 'one', 'three', 'three', 'two'],
        key2=[1, 1, 2, 3, 3, 4, 4, 5],
    )
)
dup

Unnamed: 0,key1,key2
0,one,1
1,three,1
2,one,2
3,three,3
4,one,3
5,three,4
6,three,4
7,two,5


In [18]:
# Flag every row that repeats an earlier row across all columns;
# only row 6 (a repeat of row 5) comes back True here.
dup.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [19]:
# Keep the first occurrence of each fully identical row (row 6 is dropped).
dup.drop_duplicates()
# Variants to try: de-duplicate on a subset of columns, or keep the last occurrence.
# dup.drop_duplicates(['key2'])
# dup.drop_duplicates(['key1', 'key2'], keep='last')

Unnamed: 0,key1,key2
0,one,1
1,three,1
2,one,2
3,three,3
4,one,3
5,three,4
7,two,5


In [20]:
# Add a column whose value is unique per row. Size it from the frame itself so
# the assignment keeps working if the sample data is edited (a hard-coded 8
# raises a length-mismatch error as soon as the row count changes).
dup['key3'] = np.arange(len(dup))
# De-duplicating on an all-unique column removes nothing.
dup.drop_duplicates(['key3'])

Unnamed: 0,key1,key2,key3
0,one,1,0
1,three,1,1
2,one,2,2
3,three,3,3
4,one,3,4
5,three,4,5
6,three,4,6
7,two,5,7


## Data Transformation
#### How To Transform Data Using Functions and/or Mapping?

In [21]:
# pd.Series.map?

In [22]:
# Names with inconsistent capitalisation, paired with a score.
data_tr = pd.DataFrame(
    [('Raja', 4), ('vali', 3), ('Salu', 2), ('Balu', 6), ('Vali', 5), ('mali', 1)],
    columns=['Names', 'Score'],
)
data_tr

Unnamed: 0,Names,Score
0,Raja,4
1,vali,3
2,Salu,2
3,Balu,6
4,Vali,5
5,mali,1


In [23]:
# Lookup table from lower-cased name to colour.
match_data_tr = dict(
    raja='Yellow',
    vali='Red',
    salu='Green',
    balu='Green',
    mali='Dark',
)
match_data_tr

{'raja': 'Yellow',
 'vali': 'Red',
 'salu': 'Green',
 'balu': 'Green',
 'mali': 'Dark'}

In [24]:
# Lower-case the names so they line up with the all-lowercase keys of match_data_tr.
lower_str = data_tr['Names'].str.lower()
lower_str

0    raja
1    vali
2    salu
3    balu
4    vali
5    mali
Name: Names, dtype: object

In [25]:
# Look each lower-cased name up in the dict and store the result as a new 'Color' column.
data_tr['Color'] = lower_str.map(match_data_tr)
data_tr

Unnamed: 0,Names,Score,Color
0,Raja,4,Yellow
1,vali,3,Red
2,Salu,2,Green
3,Balu,6,Green
4,Vali,5,Red
5,mali,1,Dark


In [26]:
# Function-based mapping that yields the same 'Color' column as the dict-based map:
# the lambda lower-cases each name and indexes the dict itself.
# Direct indexing raises KeyError for a name that has no entry in match_data_tr.
data_tr['Color'] = data_tr['Names'].map(lambda x: match_data_tr[x.lower()])
data_tr

Unnamed: 0,Names,Score,Color
0,Raja,4,Yellow
1,vali,3,Red
2,Salu,2,Green
3,Balu,6,Green
4,Vali,5,Red
5,mali,1,Dark


## Data Transformation
#### How To Replace Values?

In [27]:
# Help lookup (IPython only); uncomment to view the docstring.
# pd.DataFrame.replace?

In [28]:
import pandas as pd
import numpy as np
# Float series in which -9 and -1 stand in for values to be replaced later.
ser_data = pd.Series(data=[1.0, -9.0, 2.0, -9.0, -1.0, 3.0])
ser_data

0    1.0
1   -9.0
2    2.0
3   -9.0
4   -1.0
5    3.0
dtype: float64

In [29]:
# Replace every -9 with NaN; -1 is left untouched.
ser_data.replace(-9, np.nan)
# Other call forms: several values -> one value, list -> list (pairwise), or a dict of old -> new.
#ser_data.replace([-9, -1], np.nan)
#ser_data.replace([-9, -1], [100, 500])
#ser_data.replace({-9:50, -1:60})

0    1.0
1    NaN
2    2.0
3    NaN
4   -1.0
5    3.0
dtype: float64

## Data Transformation

#### How To Rename Axis Indexes?

In [30]:
import pandas as pd
import numpy as np
# 3x4 integer frame with string row and column labels for the renaming examples.
data_tor = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['Apple', 'Banana', 'Grapes'],
    columns=['one', 'two', 'three', 'four'],
)
data_tor

Unnamed: 0,one,two,three,four
Apple,0,1,2,3
Banana,4,5,6,7
Grapes,8,9,10,11


In [31]:
def upper(x):
    """Return the first five characters of ``x`` in upper case."""
    return x[:5].upper()


# Preview the transformed index labels; nothing is assigned back in this cell.
data_tor.index.map(upper)

Index(['APPLE', 'BANAN', 'GRAPE'], dtype='object')

In [32]:
# Assign the transformed labels back, changing data_tor's index in place.
data_tor.index = data_tor.index.map(upper)

In [33]:
# Show the frame with its upper-cased, five-character index labels.
data_tor

Unnamed: 0,one,two,three,four
APPLE,0,1,2,3
BANAN,4,5,6,7
GRAPE,8,9,10,11


In [34]:
# Help lookup (IPython only); uncomment to view the docstring.
# pd.DataFrame.rename?

In [35]:
# rename() with functions: index labels are lower-cased and column labels title-cased
# in the returned frame; data_tor itself keeps its labels.
data_tor.rename(index=str.lower, columns=str.title)

Unnamed: 0,One,Two,Three,Four
apple,0,1,2,3
banan,4,5,6,7
grape,8,9,10,11


In [36]:
# rename() with dicts changes only the listed labels (index 'APPLE', column 'two');
# every other label passes through unchanged.
data_tor.rename(index={'APPLE': 'Appale'}, columns={'two':2})

Unnamed: 0,one,2,three,four
Appale,0,1,2,3
BANAN,4,5,6,7
GRAPE,8,9,10,11


## Data Transformation

#### How To Discretize and/or Bin The Data?

In [37]:
# pd.cut?
# pd.qcut?

In [38]:
import pandas as pd
import numpy as np
# Twelve sample integer values used as input for the binning examples.
s_m = [21, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
s_m

[21, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [39]:
# Five bin edges, i.e. four bins: 20-35, 35-45, 45-60 and 60-100.
bin_size = [20, 35, 45, 60, 100]

In [40]:
# Assign each value to a bin; with the defaults the intervals are open on the
# left and closed on the right, e.g. (20, 35].
binned = pd.cut(s_m, bin_size)
binned

[(20, 35], (20, 35], (20, 35], (20, 35], (20, 35], ..., (20, 35], (60, 100], (35, 45], (35, 45], (20, 35]]
Length: 12
Categories (4, interval[int64, right]): [(20, 35] < (35, 45] < (45, 60] < (60, 100]]

In [41]:
# Integer position of the bin each value fell into (0 is the first bin).
binned.codes

array([0, 0, 0, 0, 0, 0, 1, 0, 3, 1, 1, 0], dtype=int8)

In [42]:
# The intervals that serve as the bin categories.
binned.categories

IntervalIndex([(20, 35], (35, 45], (45, 60], (60, 100]], dtype='interval[int64, right]')

In [43]:
# Number of values per bin; an empty bin is still listed, with a count of 0.
binned.value_counts()

(20, 35]     8
(35, 45]     3
(45, 60]     0
(60, 100]    1
dtype: int64

In [44]:
# right=False closes the intervals on the left instead, so the value 45 now
# falls in [45, 60) rather than (35, 45]. Note this rebinds `binned`.
binned = pd.cut(s_m, bin_size, right=False)
print(binned.categories)
print(binned.value_counts())

IntervalIndex([[20, 35), [35, 45), [45, 60), [60, 100)], dtype='interval[int64, left]')
[20, 35)     8
[35, 45)     2
[45, 60)     1
[60, 100)    1
dtype: int64


In [45]:
# Supply one custom label per bin in place of the interval notation.
rebinned = pd.cut(s_m, bin_size, labels=['Fail', 'Pass', 'SC', 'FCorFCD'], right=False)
print(rebinned.categories)
print(rebinned.value_counts())

Index(['Fail', 'Pass', 'SC', 'FCorFCD'], dtype='object')
Fail       8
Pass       2
SC         1
FCorFCD    1
dtype: int64


In [46]:
# Ten random floats from np.random.rand. No seed is set in this cell, so the
# values (and the bins computed from them) presumably differ between runs.
rd = np.random.rand(10)
rd

array([0.20865165, 0.49064838, 0.47060524, 0.60608371, 0.07226092,
       0.78697329, 0.53477193, 0.26649371, 0.62985028, 0.48538211])

In [47]:
# Passing an integer asks for that many bins computed from the data's range
# instead of explicit edges; precision=1 limits the digits kept in the bin labels.
pd.cut(rd, 4, precision=1)

[(0.07, 0.3], (0.4, 0.6], (0.4, 0.6], (0.4, 0.6], (0.07, 0.3], (0.6, 0.8], (0.4, 0.6], (0.3, 0.4], (0.6, 0.8], (0.4, 0.6]]
Categories (4, interval[float64, right]): [(0.07, 0.3] < (0.3, 0.4] < (0.4, 0.6] < (0.6, 0.8]]

In [48]:
# 1000 draws from np.random.randn (unseeded in this cell); preview only the first 20.
ran_data = np.random.randn(1000) 
ran_data[:20]

array([-0.8814657 , -0.28534493, -1.16940628,  0.18998394,  1.34370573,
        0.89207553, -0.49535178, -1.44337933, -1.40402903, -0.64531284,
       -0.05043434,  0.22823249, -2.52627471,  0.34293656, -0.09394891,
        0.53463332, -1.98620476,  1.00762242, -0.40895083,  0.19511062])

In [49]:
# Cut into 6 bins whose edges are taken from the sample quantiles.
quantiles_bins = pd.qcut(ran_data, 6)

In [50]:
# Show the bin assigned to each of the 1000 values, plus the six categories.
quantiles_bins

[(-3.657, -0.852], (-0.401, 0.0396], (-3.657, -0.852], (0.0396, 0.499], (1.052, 3.044], ..., (0.0396, 0.499], (-0.852, -0.401], (-0.401, 0.0396], (-0.852, -0.401], (-0.852, -0.401]]
Length: 1000
Categories (6, interval[float64, right]): [(-3.657, -0.852] < (-0.852, -0.401] < (-0.401, 0.0396] < (0.0396, 0.499] < (0.499, 1.052] < (1.052, 3.044]]

In [51]:
# Quantile bins hold nearly equal numbers of values (166 or 167 each here).
quantiles_bins.value_counts()

(-3.657, -0.852]    167
(-0.852, -0.401]    167
(-0.401, 0.0396]    166
(0.0396, 0.499]     167
(0.499, 1.052]      166
(1.052, 3.044]      167
dtype: int64

## Data Transformation

#### How To Detect and Filter Outliers?

In [52]:
import pandas as pd
import numpy as np
# Four integer columns of ten rows each; successive columns start 5 higher
# (A: 0-9, B: 5-14, C: 10-19, D: 15-24).
ug_data = pd.DataFrame({
    name: pd.Series(np.arange(start, start + 10))
    for name, start in zip('ABCD', range(0, 20, 5))
})
ug_data

Unnamed: 0,A,B,C,D
0,0,5,10,15
1,1,6,11,16
2,2,7,12,17
3,3,8,13,18
4,4,9,14,19
5,5,10,15,20
6,6,11,16,21
7,7,12,17,22
8,8,13,18,23
9,9,14,19,24


In [53]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
ug_data.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,4.5,9.5,14.5,19.5
std,3.02765,3.02765,3.02765,3.02765
min,0.0,5.0,10.0,15.0
25%,2.25,7.25,12.25,17.25
50%,4.5,9.5,14.5,19.5
75%,6.75,11.75,16.75,21.75
max,9.0,14.0,19.0,24.0


In [54]:
# Pick one column, then keep only the entries whose magnitude is above 4.
col = ug_data['A']
col.loc[col.abs().gt(4)]

5    5
6    6
7    7
8    8
9    9
Name: A, dtype: int64

In [55]:
# Select every row in which at least one column's absolute value exceeds 20.
# `axis` is passed by keyword: the positional form `.any(1)` was deprecated in
# pandas 1.5 and raises TypeError from pandas 2.0 onwards.
ug_data[(np.abs(ug_data) > 20).any(axis=1)]

Unnamed: 0,A,B,C,D
6,6,11,16,21
7,7,12,17,22
8,8,13,18,23
9,9,14,19,24


## Data Transformation
- How To Reorder and Select Randomly?
- Type Markdown and LaTeX:  𝛼2


In [56]:
#np.random.permutation?
#pd.DataFrame.take?
#pd.DataFrame.sample?

In [57]:
import pandas as pd
import numpy as np
# 5x4 frame holding 0..19 row by row, with default integer row/column labels.
ran_df = pd.DataFrame(data=np.arange(20).reshape(5, 4))
ran_df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [58]:
# A random ordering of the integers 0-4, used next as row positions.
sampler = np.random.permutation(5)
sampler

array([4, 0, 3, 1, 2])

In [59]:
# Return the rows in the order given by the positions in sampler.
ran_df.take(sampler)

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11


In [60]:
# Draw 2 rows at random.
ran_df.sample(n=2)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15


In [61]:
# replace=True lets the same row be drawn more than once, which is what allows
# n=10 draws from a frame that has only 5 rows.
rep = ran_df.sample(n=10, replace=True)
rep

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
0,0,1,2,3
0,0,1,2,3
2,8,9,10,11


## Data Transformation
#### How To Compute Indicator/Dummy Variables?

In [62]:
# Help lookup (IPython only); uncomment to view the docstring.
# pd.get_dummies?

In [63]:
import pandas as pd
import numpy as np
# Categorical 'key' column (values a/b/c) next to a numeric 'data1' column.
data_dummies = pd.DataFrame(dict(key=list('baaca'), data1=range(5)))
data_dummies

Unnamed: 0,key,data1
0,b,0
1,a,1
2,a,2
3,c,3
4,a,4


In [64]:
# One indicator column per distinct value of 'key'.
pd.get_dummies(data_dummies['key'])
# Variant: prefix the generated column names.
# pd.get_dummies(data_dummies['key'], prefix='Key')

Unnamed: 0,a,b,c
0,0,1,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0


In [65]:
# Keep 'data1' as a one-column frame and join the 'Key_'-prefixed indicator
# columns onto it; the original 'key' column is not carried over.
data_dummies_df = data_dummies[['data1']].join(pd.get_dummies(data_dummies['key'], prefix='Key'))
data_dummies_df

Unnamed: 0,data1,Key_a,Key_b,Key_c
0,0,0,1,0
1,1,1,0,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0


In [66]:
# Seed NumPy's global generator first so the ten values below are reproducible.
np.random.seed(42)
v = np.random.rand(10)
v

array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
       0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258])

In [67]:
# Bin edges for v. The lowest edge is 0.1, so values at or below 0.1 fall outside every bin.
bins = [0.1, 0.3, 0.5, 0.7, 1.0]

In [68]:
# Bin the values, then build one indicator column per bin. Row 6 is all zeros
# because its value (about 0.058) lies below the first bin edge.
pd.get_dummies(pd.cut(v, bins))

Unnamed: 0,"(0.1, 0.3]","(0.3, 0.5]","(0.5, 0.7]","(0.7, 1.0]"
0,0,1,0,0
1,0,0,0,1
2,0,0,0,1
3,0,0,1,0
4,1,0,0,0
5,1,0,0,0
6,0,0,0,0
7,0,0,0,1
8,0,0,1,0
9,0,0,0,1


# Data Cleaning and Preparation Part_3

## How To Manipulate Strings?

In [69]:
# str.split?
# str.join?
# str.index?
# str.find?

In [70]:
# Comma-separated sample string; note the uneven spacing after the commas.
Python_sentence = 'python,Is, a programming, Language'
Python_sentence

'python,Is, a programming, Language'

In [71]:
# Split on commas; the whitespace around each piece is kept.
Python_sentence.split(sep=',')

['python', 'Is', ' a programming', ' Language']

In [72]:
# Split on commas and trim the surrounding whitespace from every piece.
cs = list(map(str.strip, Python_sentence.split(',')))
cs

['python', 'Is', 'a programming', 'Language']

In [73]:
# Glue the cleaned pieces back together with ':' as the separator.
':'.join(cs)

'python:Is:a programming:Language'

In [74]:
# Unpack the four pieces into separate names.
one, two, three, four = cs
one

'python'

In [75]:
# Build one string from the four pieces with a different separator between each pair.
''.join((one, ':::', two, '...>', three, '#1', four))

'python:::Is...>a programming#1Language'

In [76]:
# Membership test against the list of pieces (an exact element match).
'python' in cs

True

In [77]:
# Print the string for reference, then get the position of the first ','.
print(Python_sentence)
Python_sentence.index(',')

python,Is, a programming, Language


6

In [78]:
# find() reports the same position as index() when the substring is present.
Python_sentence.find(',')

6

In [79]:
# When the substring is absent, find() returns -1.
Python_sentence.find(';')

-1

In [80]:
# Number of commas in the string.
Python_sentence.count(',')

3

In [81]:
# Substitute every ',' with ':'; the spaces that followed the commas remain.
Python_sentence.replace(',', ':')

'python:Is: a programming: Language'

In [82]:
# Replacing with an empty string deletes the commas.
Python_sentence.replace(',', '')

'pythonIs a programming Language'

## How To Use Regular Expressions?

In [83]:
# Help lookups (IPython only). Kept commented out: `re` is imported and `regex`
# is compiled only in later cells, so running these here on a fresh kernel
# reports "Object ... not found".
# re.split?
# re.findall?
# regex.search?
# regex.match?
# regex.sub?

Object `re.split` not found.
Object `re.findall` not found.


In [84]:
import re
# Sample text with mixed runs of spaces and tabs between the words.
text = "python Is \ta programming\t Language"
# Use a raw string for the pattern: '\s' in a normal literal is an invalid
# escape sequence and triggers a warning on current Python versions.
re.split(r'\s+', text)

['python', 'Is', 'a', 'programming', 'Language']

In [85]:
# Compile the whitespace pattern once so it can be reused; the raw string
# avoids the invalid '\s' escape of a normal string literal.
regex = re.compile(r'\s+')
regex.split(text)

['python', 'Is', 'a', 'programming', 'Language']

In [86]:
# findall() with the whitespace pattern returns the separator runs themselves.
regex.findall(text)

[' ', ' \t', ' ', '\t ']

In [87]:
# Note: To avoid unwanted escaping with \ in a regular expression, use raw string literals like r'E:\x' instead of the equivalent 'E:\\x'

In [88]:
# Email-like token: local part, '@', domain, '.', then a 2-4 letter suffix.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# The character classes list upper-case letters only, so compile case-insensitively.
regex = re.compile(pattern, re.I)

In [89]:
# Four "name address" lines; the first three end with a space before the newline.
text = (
    "Dave dave@google.com \n"
    "Steve steve@gmail.com \n"
    "Rob rob@gmail.com \n"
    "Ryan ryan@yahoo.com"
)
text

'Dave dave@google.com \nSteve steve@gmail.com \nRob rob@gmail.com \nRyan ryan@yahoo.com'

In [90]:
# Every email address found in text.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [92]:
# search() returns only the first match, located anywhere in the string.
regex.search(text)

<re.Match object; span=(5, 20), match='dave@google.com'>

In [94]:
# match() only succeeds at the very start of the string; text begins with
# 'Dave ', which is not an address, so the result is None.
print(regex.match(text))

None


In [96]:
# Replace each matched address with the literal string 'Python'.
print(regex.sub('Python', text))

Dave Python 
Steve Python 
Rob Python 
Ryan Python


In [97]:
# Same email pattern, now with three capture groups: username, domain name and suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [98]:
# Recompile with the grouped pattern; this rebinds `regex`.
regex = re.compile(pattern, flags=re.IGNORECASE)

In [99]:
# match() succeeds here because the address starts at position 0 of the string.
m = regex.match('Paru@hotmail.com')
m

<re.Match object; span=(0, 16), match='Paru@hotmail.com'>

In [100]:
# The text captured by each of the three groups, as a tuple.
m.groups()

('Paru', 'hotmail', 'com')

In [101]:
# With capture groups in the pattern, findall() returns one tuple of groups per match.
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [102]:
# \1, \2 and \3 in the replacement insert the text captured by the matching group.
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com 
Steve Username: steve, Domain: gmail, Suffix: com 
Rob Username: rob, Domain: gmail, Suffix: com 
Ryan Username: ryan, Domain: yahoo, Suffix: com


## How To Work With The Vectorized String Functions in Pandas?

In [115]:
# pd.Series.str.findall?
# pd.Series.str.get?

In [103]:
import pandas as pd
# Name -> email mapping with one NaN entry. `np` is not imported in this cell;
# it relies on the numpy import made earlier in the notebook.
maild = {'Pruthvi': 'pruthvi@google.com', 'Stella': 'stella@gmail.com', 'Roby': 'roby@gmail.com', 'Navar': np.nan}
maild

In [105]:
# Turn the dict into a Series: keys become the index, values the data.
maild_s = pd.Series(maild)
maild_s

Pruthvi    pruthvi@google.com
Stella       stella@gmail.com
Roby           roby@gmail.com
Navar                     NaN
dtype: object

In [106]:
# True where the entry is missing (only 'Navar' here).
maild_s.isnull()

Pruthvi    False
Stella     False
Roby       False
Navar       True
dtype: bool

In [107]:
# Per-element substring test; the missing entry stays NaN instead of becoming False.
maild_s.str.contains('gmail')

Pruthvi    False
Stella      True
Roby        True
Navar        NaN
dtype: object

In [108]:
# Show the grouped email pattern that the following cells reuse.
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [110]:
# Per-element findall with the grouped pattern: each entry becomes a list of
# (username, domain, suffix) tuples; the missing entry stays NaN.
maild_s.str.findall(pattern, flags=re.IGNORECASE)

Pruthvi    [(pruthvi, google, com)]
Stella       [(stella, gmail, com)]
Roby           [(roby, gmail, com)]
Navar                           NaN
dtype: object

In [111]:
# Per-element match test against the pattern; the missing entry stays NaN.
matching = maild_s.str.match(pattern, flags=re.IGNORECASE)
matching

Pruthvi    True
Stella     True
Roby       True
Navar       NaN
dtype: object

In [113]:
# Print the series for reference, then take the character at position 2 of each string.
print(maild_s)
maild_s.str.get(2)

Pruthvi    pruthvi@google.com
Stella       stella@gmail.com
Roby           roby@gmail.com
Navar                     NaN
dtype: object


Pruthvi      u
Stella       e
Roby         b
Navar      NaN
dtype: object

In [114]:
# Slice the first four characters of each string; the missing entry stays NaN.
maild_s.str[:4]

Pruthvi    prut
Stella     stel
Roby       roby
Navar       NaN
dtype: object