<a href="https://colab.research.google.com/github/jiangenhe/insc-486-2021-spring/blob/main/week5/week5_lecture_part1_data_cleaning_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning and Preparation

In [None]:
import numpy as np
import pandas as pd


## Data Transformation

### Detecting and Filtering Outliers

In [24]:
# Seed NumPy's global generator so the random frame below (and the random
# cells that follow) give the same values on every Restart & Run All.
np.random.seed(12345)

# 1000 rows x 4 columns of standard-normal draws
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.037423,-0.041331,-0.05986,0.014658
std,0.997512,1.00119,0.997207,1.024014
min,-3.496868,-2.734651,-2.990192,-3.65566
25%,-0.668105,-0.684568,-0.69136,-0.662796
50%,0.025042,-0.074988,-0.044015,0.024224
75%,0.643542,0.647821,0.590086,0.705893
max,3.063674,3.139106,3.026795,3.421761


In [25]:
# Pull out column 2 and list the entries whose magnitude exceeds 3
col = data[2]
col[col.abs() > 3]

501    3.026795
Name: 2, dtype: float64

In [None]:
# Rows containing at least one value whose magnitude exceeds 3.
# The axis is passed by keyword: the positional form `.any(1)` was
# deprecated in pandas 1.x and is rejected by pandas 2.0+.
data[(np.abs(data) > 3).any(axis=1)]

In [26]:
# Wherever the magnitude exceeds 3, write back the value's sign times 3,
# which caps every entry to the interval [-3, 3].
data[data.abs() > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.036348,-0.04147,-0.059887,0.014755
std,0.993688,1.000757,0.997124,1.01949
min,-3.0,-2.734651,-2.990192,-3.0
25%,-0.668105,-0.684568,-0.69136,-0.662796
50%,0.025042,-0.074988,-0.044015,0.024224
75%,0.643542,0.647821,0.590086,0.705893
max,3.0,3.0,3.0,3.0


In [None]:
# Element-wise sign of each entry; head() limits the display to the first rows
np.sign(data).head()

### Permutation and Random Sampling

In [27]:
# 5x4 frame holding the integers 0..19 in row-major order
df = pd.DataFrame(np.arange(20).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [37]:
# Random ordering of the integers 0..4
sampler = np.random.permutation(5)
sampler

array([1, 2, 3, 4, 0])

In [38]:
# NOTE: this bare `df` produces no output; with Jupyter's default settings
# only a cell's final expression is displayed.
df
# Select rows by integer position, in the order given by `sampler`
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


In [None]:
# Pick 3 rows at random
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


In [46]:
choices = pd.Series([5, 7, -1, 6, 4])
# Ten draws from a five-element Series; replace=True allows repeats
draws = choices.sample(n=10, replace=True)
draws

2   -1
4    4
2   -1
4    4
2   -1
3    6
2   -1
2   -1
3    6
4    4
dtype: int64

### Computing Dummy Variables

In [49]:
# Small frame: a categorical 'key' column plus a running counter 'data1'
df = pd.DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [50]:
# One indicator column per distinct value of 'key'
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [51]:
# Same indicator columns, with 'key' prefixed to each column name
dummies = pd.get_dummies(df['key'], prefix='key')
# Attach the indicator columns to the 'data1' column
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


## String Manipulation

### String Object Methods

In [None]:
# Comma-separated string; note the two spaces before 'guido'
val = 'a,b,  guido'
# split keeps any whitespace surrounding each piece
val.split(',')

['a', 'b', '  guido']

In [52]:
# Trim the whitespace around every comma-separated field
pieces = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'guido']

In [53]:
# Unpack the three fields, then glue them back together with '::' between them
first, second, third = pieces
f'{first}::{second}::{third}'

'a::b::guido'

In [54]:
# join inserts the '::' separator between the elements of pieces in one call
'::'.join(pieces)

'a::b::guido'

In [56]:
# Substring membership test
'guido' in val

True

In [57]:
# str.index raises ValueError when the substring is absent. Catch it so a
# "Run All" does not stop here, and show the error text as the cell output.
try:
    result = val.index(',jj')
except ValueError as err:
    result = f'ValueError: {err}'
result

ValueError: ignored

In [59]:
# find reports a missing substring as -1 instead of raising
val.find('ii:')

-1

In [None]:
# val contains no ':' so str.index raises ValueError. Catch it so a
# "Run All" does not stop here, and show the error text as the cell output.
try:
    result = val.index(':')
except ValueError as err:
    result = f'ValueError: {err}'
result

In [None]:
# Number of commas in val
val.count(',')

In [None]:
# Evaluate both replacements as one tuple so neither result is discarded;
# a notebook cell only echoes its final expression.
# First: swap each comma for '::'. Second: delete the commas entirely.
val.replace(',', '::'), val.replace(',', '')

### Regular Expressions

In [None]:
import re

# Words separated by irregular runs of spaces and tabs
text = "foo    bar\t baz  \tqux"
# Raw string: '\s' inside a plain literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
re.split(r'\s+', text)

In [None]:
# Compile the whitespace pattern once for reuse. The raw-string prefix
# keeps '\s' from being read as an (invalid) string escape.
regex = re.compile(r'\s+')
regex.split(text)

In [None]:
# List every substring of text matched by the compiled pattern
regex.findall(text)

In [None]:
# Email-like token: local part, '@', domain, '.', and a 2-4 letter suffix
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# Four "name address" records, one per line
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

# The pattern is written in upper case only; IGNORECASE lets it match
# lower-case addresses as well.
regex = re.compile(pattern, re.IGNORECASE)

In [None]:
# List every substring of text matched by the compiled pattern
regex.findall(text)

In [None]:
# search scans text and returns a match object for the first hit
m = regex.search(text)
# NOTE: this bare `m` produces no output; with Jupyter's default settings
# only a cell's final expression is displayed.
m
# Slice the matched span back out of the original string
text[m.start():m.end()]

In [None]:
# match only tries the pattern at the very start of the string
print(regex.match(text))

In [None]:
# Replace every match of the pattern with the literal 'REDACTED'
print(regex.sub('REDACTED', text))

In [None]:
# Same email pattern, now with parentheses capturing three parts:
# the text before '@', the domain, and the 2-4 letter suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

In [None]:
# Match a single address, then show its captured groups as a tuple
m = regex.match('wesm@bright.net')
m.groups()

In [None]:
# When the pattern has capture groups, findall returns a tuple of groups per match
regex.findall(text)

In [None]:
# \1, \2 and \3 in the replacement refer to the first, second and third capture groups
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

### Vectorized String Functions in pandas

In [None]:
# Email addresses keyed by name, built directly as a Series;
# 'Wes' has no address (NaN).
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data
# Flag the missing entries
data.isnull()

In [None]:
# Per-element test for the substring 'gmail'
data.str.contains('gmail')

In [None]:
# NOTE: this bare `pattern` produces no output; with Jupyter's default
# settings only a cell's final expression is displayed.
pattern
# Run the pattern against each element, ignoring case
data.str.findall(pattern, flags=re.IGNORECASE)

In [None]:
# Apply the pattern to each element with str.match, ignoring case.
# NOTE(review): in recent pandas this yields booleans rather than the
# matched groups; confirm against the installed version.
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

In [None]:
# `matches` (from Series.str.match) holds booleans, and the .str accessor
# refuses non-string data in current pandas. Take the captured groups from
# findall instead: .str[0] picks the first match of each element, which is
# a tuple of the pattern's capture groups (NaN where the value is missing).
first_match = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
# Element at position 1 of each tuple (the second capture group)
first_match.str.get(1)
# Element at position 0 of each tuple (the first capture group)
first_match.str[0]

In [None]:
# Slice applied element-wise: the first five characters of each value
data.str[:5]

In [None]:
# PREVIOUS_MAX_ROWS is never defined in this notebook, so assigning from it
# raised NameError. Restore the pandas default row limit instead.
pd.reset_option('display.max_rows')