## 무신사 AF 첫 구매 데이터 추출

In [None]:
import os
import pyarrow as pa
import pyarrow.csv as pacsv
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# 관련 패키지 없는 경우 아래 코드 수정하여 설치
# !pip install [패키지이름]

### 경로 및 추출일자 설정

In [None]:
# Directory that holds the raw AppsFlyer CSV files.
raw_dir = "D:/Dropbox (주식회사매드업)/광고사업부/4. 광고주/무신사/★ 무신사 통합/raw_data/appsflyer"

# Directory the result file is written to.
result_dir = "C:/Users/MADUP/Downloads"

# Year and month to extract, as a 'YYYYMM' string.
yearmonth = "202208"

### 데이터 정제

In [None]:
def musinsa_rawdata_read(yearmonth):
    """Load one month of raw CSV files from ``raw_dir`` into one DataFrame.

    A file is selected when its name ends with ``.csv`` and the eight
    characters right before the extension form a ``YYYYMMDD`` number that
    falls inside the requested month. Files whose name does not carry such a
    date suffix are ignored instead of aborting the run.

    Files that fail to load (for example because one of the requested columns
    is missing) are reported on stdout and skipped.

    Args:
        yearmonth: Target month as a ``'YYYYMM'`` string, e.g. ``'202208'``.

    Returns:
        A pandas DataFrame holding the requested columns of every file that
        was read; every column is read as a string.

    Raises:
        ValueError: If ``yearmonth`` cannot be parsed as ``YYYYMM``, or if no
            file for the requested month could be read.
    """
    month_start = datetime.datetime.strptime(yearmonth, '%Y%m')
    # Last day of the month: first day of the next month minus one day.
    month_end = month_start + relativedelta(months=1) - relativedelta(days=1)
    start_date = int(month_start.strftime('%Y%m%d'))
    end_date = int(month_end.strftime('%Y%m%d'))

    raw_files = []
    # Sorted so that the row order of the result does not depend on the
    # (unspecified) order returned by os.listdir.
    for f in sorted(os.listdir(raw_dir)):
        if not f.endswith('.csv'):
            continue
        date_part = f[-12:-4]
        # Skip CSV files without an 8-digit date suffix rather than crashing
        # on int() conversion.
        if len(date_part) != 8 or not (date_part.isascii() and date_part.isdigit()):
            continue
        if start_date <= int(date_part) <= end_date:
            raw_files.append(f)

    # Columns to load; each one is read as a string.
    index_columns = [
        'attributed_touch_type',
        'attributed_touch_time',
        'install_time',
        'event_time',
        'event_name',
        'event_revenue',
        'event_revenue_krw',
        'media_source',
        'channel',
        'keywords',
        'keyword_id',
        'campaign',
        'campaign_id',
        'adset',
        'adset_id',
        'ad',
        'site_id',
        'appsflyer_id',
        'advertising_id',
        'idfa',
        'android_id',
        'idfv',
        'platform',
        'device_type',
        'is_retargeting',
        'retargeting_conversion_type',
        'is_primary_attribution',
        'attribution_lookback',
        'carrier',
        # 'collected_at',  (currently not loaded)
        'customer_user_id',
    ]
    dtypes = {column: pa.string() for column in index_columns}
    convert_ops = pacsv.ConvertOptions(column_types=dtypes, include_columns=index_columns)
    ro = pacsv.ReadOptions(block_size=10 << 20)  # 10 MiB read blocks

    table_list = []
    for f in raw_files:
        try:
            temp = pacsv.read_csv(os.path.join(raw_dir, f), convert_options=convert_ops, read_options=ro)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f)
            print(e)
        else:
            table_list.append(temp)

    if not table_list:
        # pa.concat_tables([]) would fail with an unhelpful Arrow error.
        raise ValueError(
            'No readable raw CSV file found for ' + str(yearmonth) + ' in ' + str(raw_dir)
        )

    print('원본 데이터 Read 완료')

    table = pa.concat_tables(table_list)
    df = table.to_pandas()

    return df

In [None]:
def prep_raw_df(df):
    """Return the primary-attributed, click-based ``first_purchase`` rows.

    A row is kept when ``attributed_touch_type`` is ``'click'``,
    ``event_name`` is ``'first_purchase'`` and ``is_primary_attribution``
    equals ``'true'`` ignoring case. Rows with a missing
    ``is_primary_attribution`` are dropped rather than raising.

    The input frame is left untouched; the result is an independent copy in
    which ``is_primary_attribution`` is lower-cased and missing
    ``customer_user_id`` values are replaced by ``0``.

    Args:
        df: DataFrame with at least the columns ``attributed_touch_type``,
            ``event_name``, ``is_primary_attribution`` and
            ``customer_user_id``.

    Returns:
        A new DataFrame containing only the matching rows.
    """
    # The .str accessor propagates missing values, whereas apply(str.lower)
    # raises TypeError on them.
    primary_flag = df['is_primary_attribution'].str.lower()

    con1 = (df['attributed_touch_type'] == 'click')
    con2 = (df['event_name'] == 'first_purchase')
    con3 = (primary_flag == 'true')
    mask = con1 & con2 & con3

    # Explicit copy so the assignments below do not write into a view of `df`
    # (avoids SettingWithCopyWarning).
    filtered_data = df.loc[mask].copy()
    filtered_data['is_primary_attribution'] = primary_flag.loc[mask].to_numpy()
    filtered_data['customer_user_id'] = filtered_data['customer_user_id'].fillna(0)

    return filtered_data

### 실행 및 결과파일 다운로드

In [None]:
# Read the raw data for `yearmonth`, keep the first-purchase rows and save
# them as a CSV file in `result_dir`.
df = musinsa_rawdata_read(yearmonth)
filtered_data = prep_raw_df(df)
# The file name is derived from `yearmonth` so it always matches the month
# that was actually extracted.
filtered_data.to_csv(
    result_dir + f"/musinsa_first_purchase_log_with_cuid_{yearmonth}.csv",
    encoding='utf-8-sig',  # UTF-8 with a leading byte-order mark
    index=False,
)