# Prevent Data quality Problems 
- the detection and correction of problems 
- algorithms 


# Measurement and Data Collection Errors 

- Measurement error (측정자체의 오류) = Outlier 
  - Noise 
  - Artifact (기계가 망가진 경우) 

- Data collection error (수집 중 오류 - 데이터를 빼먹거나, 속성 값 하나 뺀 경우 등) = Outlier, Missing Values, Inconsistent Values, Duplicate Data 

[이상치, 결측치, 일관성 없는 값(음수인 키), 중복데이터] 

# Data Preprocessing 

## 01. Aggregation (집계연산)
- 2개 이상의 objects를 한 개로 합침 

- 왜 사용? 
  - less memory 
  - high-level view (각 상점 -> 각 지역)
  - more stable (시간 별 온도 -> 하루 온도(평균)) 

- 단점 
  - 원본데이터만큼의 정보가 날라간다

In [3]:
department = [10,10,20,20,20,20,30,30,40,50,50,70,70,70,70,70]
salary = [100,200,300,40,1000,200,340,500,60,700,100,2000,300,400,5000,100]
import pandas as pd
df = pd.DataFrame({
    "de" : department, 
    "sal" : salary
})
df

Unnamed: 0,de,sal
0,10,100
1,10,200
2,20,300
3,20,40
4,20,1000
5,20,200
6,30,340
7,30,500
8,40,60
9,50,700


In [5]:
df2 = df.groupby('de').sum()
df2.reset_index(inplace = True)
df2

Unnamed: 0,de,sal
0,10,300
1,20,1540
2,30,840
3,40,60
4,50,800
5,70,7800


## 02. Sampling 

- 왜 사용? 
  - 통계학자 : 전체 데이터 수집 too expensive 
  - 데이터 : 전체 데이터 전처리 too expensive 

- Key principle 
  - Use a representative sample 

### 2-1) simple random sampling 

In [6]:
# simple random sampling 

## 복원 추출 
import numpy as np 

### 1) 시드설정 
np.random.seed(0)

### 2) 난수생성 
np.random.rand(16)

array([0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ,
       0.64589411, 0.43758721, 0.891773  , 0.96366276, 0.38344152,
       0.79172504, 0.52889492, 0.56804456, 0.92559664, 0.07103606,
       0.0871293 ])

In [7]:
### 3) 데이터 순서 바꾸기 
x = np.arange(10)
x 

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
np.random.shuffle(x) # inplace = True와 같음 
x

array([4, 1, 6, 7, 2, 8, 5, 9, 0, 3])

In [9]:
x = np.arange(10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [10]:
np.random.permutation(x) # return만 

array([5, 2, 7, 4, 1, 0, 6, 8, 9, 3])

`np.random.choice(a, size = , replace =, p = )`

- a : 배열이면 배열 자체, 정수면 arange(a)명령으로 배열 생성 
- size : 정수, 샘플 숫자 
- replace : True면 복원 추출, False 비복원추출 
- p : 배열, 각 데이터가 선택될 수 있는 확률 




In [11]:
### 4) 데이터 샘플링 
np.random.choice(5,5, replace = True)

array([0, 2, 4, 3, 3])

In [12]:
x = np.arange(10)
np.random.choice(x,5)

array([2, 7, 2, 0, 0])

In [14]:
np.random.choice(x,5, replace = False)

array([2, 3, 7, 1, 9])

In [15]:
### 5) 난수 생성 

np.random.rand(5) # 0~1사이의 균일분포 

array([0.44994999, 0.61306346, 0.90234858, 0.09928035, 0.96980907])

In [16]:
np.random.rand(5,5)

array([[0.65314004, 0.17090959, 0.35815217, 0.75068614, 0.60783067],
       [0.32504723, 0.03842543, 0.63427406, 0.95894927, 0.65279032],
       [0.63505887, 0.99529957, 0.58185033, 0.41436859, 0.4746975 ],
       [0.6235101 , 0.33800761, 0.67475232, 0.31720174, 0.77834548],
       [0.94957105, 0.66252687, 0.01357164, 0.6228461 , 0.67365963]])

In [17]:
np.random.randn(10) # 가우시안 표준 정규분포 

array([-0.68658948,  0.01487332, -0.3756659 , -0.03822364,  0.36797447,
       -0.0447237 , -0.30237513, -2.2244036 ,  0.72400636,  0.35900276])

In [18]:
np.random.randn(2,3)

array([[1.07612104, 0.19214083, 0.85292596],
       [0.01835718, 0.42830357, 0.99627783]])

In [20]:
np.random.randint(10, size = 5) # low , high = None, size = None 
# 0부터 10 사이의 정수 출력 

array([8, 8, 8, 2, 3])

In [21]:
np.random.randint(10,30,size=(3,5))

array([[12, 21, 23, 26, 18],
       [18, 29, 18, 12, 13],
       [22, 24, 10, 14, 13]])

In [25]:
 # 샘플링 해보기 
 sample_index = np.random.choice(df.index,size=5,replace=True)
 sample_index

array([ 6, 13,  9, 13, 11])

In [40]:
df_sample = pd.DataFrame(columns = ["de", "sal", "랜덤"])
for i in sample_index : 
  df_sample = df_sample.append(df.loc[i]) # df.loc[i] : 전체 데이터프레임에서 인덱스 이름이 i인 행만 추출해줘 
  # df.iloc[i] : 전체 데이터프레임에서 i번째 행에 있는 값들만 추출해줘 

df_sample

Unnamed: 0,de,sal,랜덤
6,30.0,340.0,0.315498
13,70.0,400.0,0.237143
9,50.0,700.0,0.390117
13,70.0,400.0,0.237143
11,70.0,2000.0,0.444931


### 2-2) Stratified sampling (층화 추출)
 - select objects from each group
  - 같은 크기로 뽑음 
  - 그룹의 크기에 비례해서 뽑을 수도 있음

  [stratify](https://towardsdatascience.com/stratified-random-sampling-using-python-and-pandas-1c84f0362ebc)
  

In [41]:
df

Unnamed: 0,de,sal,랜덤
0,10,100,0.517878
1,10,200,0.960086
2,20,300,0.886876
3,20,40,0.362184
4,20,1000,0.644523
5,20,200,0.225139
6,30,340,0.315498
7,30,500,0.417479
8,40,60,0.131773
9,50,700,0.390117


In [45]:
# preparing to stratify 

(df['de'].value_counts() / len(df)).sort_values()

40    0.0625
30    0.1250
10    0.1250
50    0.1250
20    0.2500
70    0.3125
Name: de, dtype: float64

In [44]:
df['de'].unique()

array([10, 20, 30, 40, 50, 70])

In [47]:
def stratify_data(df_data, stratify_column_name, stratify_values, stratify_proportions, random_state=None):
  
    df_stratified = pd.DataFrame(columns = df_data.columns) # Create an empty DataFrame with column names matching df_data

    pos = -1
    for i in range(len(stratify_values)): # iterate over the stratify values (e.g. "Male, Undergraduate" etc.)
        pos += 1
        if pos == len(stratify_values) - 1: 
            ratio_len = len(df_data) - len(df_stratified) # if this is the final iteration make sure we calculate the number of values for the last set such that the return data has the same number of rows as the source data
        else:
            ratio_len = int(len(df_data) * stratify_proportions[i]) # Calculate the number of rows to match the desired proportion

        df_filtered = df_data[df_data[stratify_column_name] ==stratify_values[i]] # Filter the source data based on the currently selected stratify value
        df_temp = df_filtered.sample(replace=True, n=ratio_len, random_state=random_state) # Sample the filtered data using the calculated ratio
        
        df_stratified = pd.concat([df_stratified, df_temp]) # Add the sampled / stratified datasets together to produce the final result
        
    return df_stratified # Return the stratified, re-sampled data   

In [49]:
stratify_values = [10, 20, 30, 40, 50, 70]
stratify_proportions = [0.125,0.25,0.125,0.125,0.3125]

df_stratified = stratify_data(df, 'de', stratify_values, stratify_proportions, random_state=42)
df_stratified

Unnamed: 0,de,sal,랜덤
0,10,100,0.517878
1,10,200,0.960086
4,20,1000,0.644523
5,20,200,0.225139
2,20,300,0.886876
4,20,1000,0.644523
6,30,340,0.315498
7,30,500,0.417479
8,40,60,0.131773
8,40,60,0.131773


In [50]:
df.drop('랜덤', axis = 1, inplace = True)

In [52]:
# sklearn.model_selection 사용 
!pip install sklearn
from sklearn.model_selection import StratifiedShuffleSplit


split = StratifiedShuffleSplit(n_splits=6, test_size=None, train_size=None, random_state=None)
split.get_n_splits(df)



6

....음 여기부터는 언젠가 다시 하길 

### 2-3) Progressive sampling 
- 셈플을 작게 잡아서 모델 돌리고 
- 점점 샘플 크게 허용 
- 성능이 더이상 안늘어날 때까지 

## 03. Dimensionality Reduction