In [1]:
import pandas as pd

### I/O 속도

In [2]:
%%time
df = pd.read_parquet("target_raw.parquet")

CPU times: user 13.2 s, sys: 3.37 s, total: 16.6 s
Wall time: 13.1 s


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28154385 entries, 0 to 3465865
Data columns (total 8 columns):
 #   Column              Dtype         
---  ------              -----         
 0   review_id           object        
 1   app_name            object        
 2   author_name         object        
 3   pseudo_author_id    object        
 4   author_app_version  object        
 5   review_rating       float64       
 6   review_likes        int64         
 7   review_timestamp    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 1.9+ GB


In [4]:
%%time
df.to_parquet("target_raw.parquet")

CPU times: user 13.6 s, sys: 1.57 s, total: 15.1 s
Wall time: 15.2 s


In [5]:
%%time
df = df.reset_index(drop=True)

CPU times: user 732 ms, sys: 300 ms, total: 1.03 s
Wall time: 1.03 s


In [6]:
%%time
df.to_csv("target_raw.csv", index=False)

CPU times: user 1min 11s, sys: 2.21 s, total: 1min 13s
Wall time: 1min 14s


In [7]:
%%time
df = pd.read_csv("target_raw.csv")



CPU times: user 30.8 s, sys: 8.69 s, total: 39.5 s
Wall time: 43.4 s


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28154385 entries, 0 to 28154384
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   review_id           object 
 1   app_name            object 
 2   author_name         object 
 3   pseudo_author_id    object 
 4   author_app_version  object 
 5   review_rating       float64
 6   review_likes        int64  
 7   review_timestamp    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.7+ GB


In [17]:
df.review_timestamp = pd.to_datetime(df.review_timestamp)

### 데이터 타입 정리

In [9]:
df.isna().mean()

review_id             2.997116e-01
app_name              0.000000e+00
author_name           0.000000e+00
pseudo_author_id      2.997116e-01
author_app_version    2.079358e-01
review_rating         3.196660e-07
review_likes          0.000000e+00
review_timestamp      0.000000e+00
dtype: float64

In [10]:
df_raw = df.copy()

In [11]:
df = df_raw.sample(frac=0.1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2815438 entries, 16473695 to 19973362
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   review_id           object 
 1   app_name            object 
 2   author_name         object 
 3   pseudo_author_id    object 
 4   author_app_version  object 
 5   review_rating       float64
 6   review_likes        int64  
 7   review_timestamp    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 193.3+ MB


In [12]:
for col, data_type in df.dtypes.items():
    if data_type == "object":
        ser_target = df[col].value_counts()
        print(f"{col}({data_type}): {len(ser_target):,}가지")
    elif data_type in ("float", "int"):
        max_value = df[col].max()
        min_value = df[col].min()
        if (((df[col].dropna() % 1) != 0).sum() == 0) or (data_type == "int"):
            target_data_type = "int"
        else:
            target_data_type = "float"
        print(f"{col}({target_data_type}): {min_value:,.2f} ~ {max_value:,.2f}")
    else:
        raise Exception("New data type:", data_type)

review_id(object): 1,972,203가지
app_name(object): 7가지
author_name(object): 585,007가지
pseudo_author_id(object): 1,960,085가지
author_app_version(object): 4,746가지
review_rating(int): 0.00 ~ 5.00
review_likes(int): 0.00 ~ 138,051.00
review_timestamp(object): 2,786,949가지


In [13]:
df = df_raw
del df_raw
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28154385 entries, 0 to 28154384
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   review_id           object 
 1   app_name            object 
 2   author_name         object 
 3   pseudo_author_id    object 
 4   author_app_version  object 
 5   review_rating       float64
 6   review_likes        int64  
 7   review_timestamp    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.7+ GB


In [14]:
int32_cols = ["review_likes"]
int8_cols = ["review_rating"]
cate_cols = ["app_name", "author_app_version"]

memory_usage_before = df.memory_usage().sum()

df.info()
for col in int32_cols:
    assert abs(df[col].max()) < 2_147_483_647 
    df[col] = df[col].astype(pd.Int32Dtype())
    
df.info()
for col in int8_cols:
    assert abs(df[col].max()) < 127
    df[col] = df[col].astype(pd.Int8Dtype())
    
df.info()
for col in cate_cols:
    assert df[col].nunique() < 10_000
    df[col] = df[col].astype("category")
    
df.info()

memory_usage_after = df.memory_usage().sum()
reduction_ratio = 1 - (memory_usage_after / memory_usage_before)

print(f"Memory Usage: {memory_usage_before:,} -> {memory_usage_after:,} ({reduction_ratio*100:.2f}% reduced)")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28154385 entries, 0 to 28154384
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   review_id           object 
 1   app_name            object 
 2   author_name         object 
 3   pseudo_author_id    object 
 4   author_app_version  object 
 5   review_rating       float64
 6   review_likes        int64  
 7   review_timestamp    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.7+ GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28154385 entries, 0 to 28154384
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   review_id           object 
 1   app_name            object 
 2   author_name         object 
 3   pseudo_author_id    object 
 4   author_app_version  object 
 5   review_rating       float64
 6   review_likes        Int32  
 7   review_timestamp    object 
dtypes: Int32(1), float64(1), object(6)
memory usa

In [18]:
%%time
df.to_parquet("df_optimized.parquet")

CPU times: user 11 s, sys: 1.1 s, total: 12.1 s
Wall time: 12.2 s
