<a href="https://colab.research.google.com/github/jiangenhe/insc-486-fall-2021/blob/main/week5/week5_lecture_part1_data_cleaning_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning and Preparation

In [None]:
import numpy as np
import pandas as pd


## Data Transformation

### Detecting and Filtering Outliers

In [None]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.head()

Unnamed: 0,0,1,2,3
0,2.226023,-1.392102,0.090357,1.091373
1,-2.690031,-0.632341,-1.554637,1.390178
2,0.922577,0.075678,1.635759,-0.234883
3,1.790713,0.156227,-0.358378,0.16643
4,-1.03014,-0.036094,0.942048,0.632453


In [None]:
col = data[2]
col[np.abs(col) > 3]

65    -3.089457
147   -3.563716
265    3.136241
326   -3.271609
542    3.807337
627    3.358500
929   -3.054290
Name: 2, dtype: float64

In [None]:
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
65,1.645556,0.919987,-3.089457,0.154638
147,-0.922757,-3.04629,-3.563716,-0.649317
191,-3.03,0.374034,-0.206399,-0.09002
265,1.093349,0.044763,3.136241,1.595831
326,-0.942976,1.869838,-3.271609,0.472085
459,-1.290406,3.302144,0.493488,0.502637
542,-1.131022,-1.057255,3.807337,2.045015
575,-0.507902,3.384196,0.492122,-0.595908
627,1.299012,0.197331,3.3585,-0.079058
678,-1.117503,2.513846,0.380697,3.152418


In [None]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.036348,-0.04147,-0.059887,0.014755
std,0.993688,1.000757,0.997124,1.01949
min,-3.0,-2.734651,-2.990192,-3.0
25%,-0.668105,-0.684568,-0.69136,-0.662796
50%,0.025042,-0.074988,-0.044015,0.024224
75%,0.643542,0.647821,0.590086,0.705893
max,3.0,3.0,3.0,3.0


In [None]:
np.sign(data).head()

## String Manipulation

### String Object Methods

In [None]:
val = 'a,b,  guido'
val.split(',')

['a', 'b', '  guido']

In [None]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [None]:
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [None]:
'::'.join(pieces)

'a::b::guido'

In [None]:
'guido' in val

True

In [None]:
val.index(',jj')

ValueError: ignored

In [None]:
val.find('ii:')

-1

In [None]:
val.index(':')

In [None]:
val.count(',')

In [None]:
val.replace(',', '::')
val.replace(',', '')

### Regular Expressions

In [None]:
import re
text = "foo    bar\t baz  \tqux"
re.split('\s+', text)

In [None]:
regex = re.compile('\s+')
regex.split(text)

In [None]:
regex.findall(text)

In [None]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
regex.findall(text)

In [None]:
m = regex.search(text)
m

In [None]:
text[m.start():m.end()]

In [None]:
print(regex.match(text))

In [None]:
print(regex.sub('REDACTED', text))

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
m = regex.match('wesm@bright.net')
m.groups()

In [None]:
regex.findall(text)

In [None]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

### Vectorized String Functions in pandas

In [56]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [None]:
data.str[:5]

In [57]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [58]:
pattern
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [59]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [68]:
s = pd.Series(['a1', 'b2', 'c3'])
s.str.extract(r'([ab])(\d)')


Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,
