In [183]:
import os
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse


In [184]:


# 필터링할 기간의 시작 날짜와 끝 날짜 정의 (예: 2024년 3월 1일부터 2024년 3월 31일까지)
start_date = "20240325"
end_date = "20240331"

# 폴더 내의 파일 목록 가져오기
file_list = os.listdir(r'C:\Users\ch.kang\Desktop\새 폴더')

# 'ga'로 시작하고 지정된 날짜 범위 내에 있는 파일 필터링
ga_files = [file for file in file_list if file.startswith('ga') and start_date <= file.split('_')[1].split('.')[0] <= end_date]

# 빈 데이터프레임을 생성하여 파일을 하나씩 읽어서 합치기
raw_df_ga = pd.DataFrame()
for file in ga_files:
    file_path = os.path.join(r'C:\Users\ch.kang\Desktop\새 폴더', file)  # 파일의 전체 경로 생성
    temp_df = pd.read_excel(file_path, dtype={'아이디': str}) # 파일을 데이터프레임으로 읽기
    raw_df_ga = pd.concat([raw_df_ga, temp_df], ignore_index=True)  # 읽은 데이터프레임을 기존 데이터프레임에 합치기


In [185]:
df_ga = raw_df_ga.copy()

In [186]:
def modify_url(url_0, all_keys):
    if url_0 is None or not isinstance(url_0, str):
        return url_0  # Skip if the URL is None or not a string

    # Remove '=' from the beginning
    if url_0.startswith('='):
        url_0 = url_0[1:]
    
    url_0 = url_0.replace('https://m.', 'https://')
    url_0 = url_0.replace('https://s.', 'https://')

    parsed_url = urlparse(url_0)
    scheme = parsed_url.scheme
    netloc = parsed_url.netloc
    path = parsed_url.path
    query = parsed_url.query
    fragment = parsed_url.fragment

    if path.endswith('/PostView.naver') or path.endswith('/PostList.naver') or path.endswith('/FeedList.naver'):
        path = ''

    query_dict = parse_qs(query)

    all_keys.update(query_dict.keys())

    params_to_keep_query = ['blogId', 'logNo', 'bmode', 'idx']  # Include 'bmode' and 'idx'

    # Collect items to remove
    items_to_remove = []
    for param, values in query_dict.items():
        if param not in params_to_keep_query:
            items_to_remove.append(param)

    # Remove items after iteration
    for param in items_to_remove:
        del query_dict[param]

    modified_query = urlencode(query_dict, doseq=True)
    modified_url = urlunparse((scheme, netloc, path, '', modified_query, ''))

    if modified_url.endswith('/'):
        modified_url = modified_url[:-1]

    return modified_url




def extract_korean(text):
    if text is None or not isinstance(text, str):
        return text  # 입력값이 None 또는 문자열이 아닌 경우 그대로 반환

    # '+'를 띄어쓰기로 치환한 후, 모든 한글을 추출하는 정규표현식
    korean_pattern = re.compile('[ㄱ-ㅎㅏ-ㅣ가-힣]+')
    text = text.replace('\\+', ' ')
    result = korean_pattern.findall(text)
    return ''.join(result) if result else text

# Custom function to parse the date and time
def custom_datetime_parser(date_str):
    # Regular expression pattern to match date and time elements
    pattern = r'(\d{4})\. (\d{1,2})\. (\d{1,2})\. (\S+) (\d{1,2}:\d{2}:\d{2})'

    # Extract date and time components
    match = re.match(pattern, date_str)
    if match:
        year, month, day, am_pm, time = match.groups()
        hour, minute, second = map(int, time.split(':'))
        if am_pm == '오후' and hour < 12:
            hour += 12
        elif am_pm == '오전' and hour == 12:
            hour = 0
        return pd.Timestamp(year=int(year), month=int(month), day=int(day), hour=hour, minute=minute, second=second)
    else:
        return pd.NaT
    
    
def shift_values(row):
    cols = ['1', '2', '3', '4', '5']
    # Filter out None or NaN values
    filtered_values = [row[col] for col in cols if pd.notna(row[col])]
    # Shift values to the left
    for i, col in enumerate(cols):
        row[col] = filtered_values[i] if i < len(filtered_values) else None
    return row

def convert_column_name(col_name):
    if isinstance(col_name, str) and col_name.isdigit():
        return int(col_name)
    else:
        return col_name
    


# Function to determine the prefix based on "node"
def get_prefix(node):
    if node.startswith("https://blog.naver.com/hanpro911"):
        return "BH"
    elif node.startswith("https://blog.naver.com/mysecondplace"):
        return "BM"
    elif node.startswith("https://blog.naver.com"):
        return "BZ"
    elif node.startswith("https://cafe.naver.com/mysecondhouse1"):
        return "CM"
    elif node.startswith("https://mysecondplace.co.kr") and ("bmode=write" in node or "prod_code" in node):
        return "HA"
    elif node.startswith("https://mysecondplace.co.kr"):
        return "HH"
    elif node.startswith("https://maily.so"):
        return "HB"
    elif node.startswith("https://whattime.co.kr"):
        return "HC"
    elif node.startswith("https://m2place.imweb.me"):
        return "HO"
    elif node.startswith("https://instagram.com/mysecondplace_"):
        return "|M"
    elif node.startswith("http://pf.kakao.com/_vxkxngb"):
        return "KC"
    elif node.startswith("http"):
        return "SE"
    else:
        return "SK"
    
def get_last_number(column, prefix):
    last_number = 0
    for value in column.dropna():  # Exclude NaN values
        if str(value).startswith(prefix):
            last_number = max(last_number, int(str(value).split('-')[-1]))
    return last_number
    

In [187]:
df_ga.columns

Index(['아이디', '시간', '이벤트명', '페이지제목', '정규화링크', '원래링크', '소스', '매체', '캠페인', '컨텐츠',
       '검색어', '체류시간', '전환여부'],
      dtype='object')

In [188]:
# 체류시간 0인 경우 채우기
df_ga['체류시간'].fillna(0, inplace=True)

# node_standarization
all_keys = set()
df_ga['modified_node']=df_ga['정규화링크'].apply(lambda x:modify_url(x, all_keys))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ga['체류시간'].fillna(0, inplace=True)


In [189]:

condition_a = df_ga['정규화링크'].str.contains('write|prod_code', case=False, regex=True)
condition_a = condition_a.fillna(False)  # NaN 값을 False로 대체
condition_a.value_counts()

filtered_a = df_ga[condition_a]

ids_a = filtered_a['아이디'].unique().tolist()  # 기존의 고유한 아이디 목록을 리스트로 변환
ids_a.extend(['1828185651.1705389498', '417241095.1705634348', '1602359817.1707964923'])  # 새로운 아이디를 추가

# condition_b = df_ga.groupby('아이디')['체류시간'].sum() >= 18000
# ids_b = condition_b[condition_b].index

ids_b = ['1021566177.1707466356', '1083942841.1695711394',
       '1120703410.1706587025', '112167875.1704799413',
       '1140615326.1706539096', '1158917536.1707880810',
       '1188470945.1704806360', '1195499539.1705461693',
       '1208982378.1707455391', '1250133156.1706801009',
       '130036583.1660629517', '1302147567.1707929419',
       '1312396219.1707191882', '1312429530.1677388875',
       '1323533573.1707880592', '1341286132.1689068932',
       '1341500281.1707228435', '135745452.1687348999',
       '1362795537.1707364318', '1392746774.1707093754',
       '1414598046.1706604478', '1437195581.1701747873',
       '1535161370.1707530975', '1542469471.1706330459',
       '1549941021.1706832674', '1588483927.1696296964',
       '1592112601.1686896453', '1627436795.1707440156',
       '1644808335.1706455834', '1645381003.1707756395',
       '1739828001.1705369497', '1757415153.1705996376',
       '1771636829.1707531355', '198483479.1707868974',
       '2054491282.1707951088', 
       '2078435867.1706074815', '2105957382.1707103400',
       '2115567724.1705386386', '278436795.1700370232', '288263341.1705334410',
       '302251614.1707783216', '306548010.1699417096', '335460286.1707105953',
       '352146708.1707529167', '400501807.1707570514', '420765241.1705305938',
       '48543513.1707440466', '52044247.1693770278', '54599874.1707924934',
       '555071005.1702130820', '577908767.1706330356', '593811738.1706411252',
       '605077705.1702539997', '639043677.1706935086', '649116625.1707636536',
       '736497018.1707340441', '740700171.1705212161', '744027507.1707038633',
       '768492148.1699924210', '771137756.1707490369', '799231654.1707925045',
       '813986486.1707308094', '838526411.1705675392', '858944222.1699852150',
       '861783116.1705336795', '892793016.1701953466', '910079147.1700545452',
       '920629470.1705452306', '936659568.1707636674', '965385222.1704683076',
       '97146443.1707440532']

admin_ids = set(ids_a) | set(ids_b)

print(len(ids_a), len(ids_b))
admin_ids

# 적용하려면 아래 코드를 주석처리하지 말고 실행하세요.
df_ga= df_ga[~df_ga['아이디'].isin(admin_ids)]

# # new_id mapping
# id_mapping_df = pd.read_csv(mapping_file_path, dtype = str)                               
# df_with_new_id = pd.merge(df_ga, id_mapping_df[['pseudo_id', 'new_id']], left_on='아이디', right_on="pseudo_id", how='left')

5 71


In [190]:
df_ga_1 = df_ga[df_ga['이벤트명'] == 'page_view']


In [191]:
df_ga_1

Unnamed: 0,아이디,시간,이벤트명,페이지제목,정규화링크,원래링크,소스,매체,캠페인,컨텐츠,검색어,체류시간,전환여부,modified_node
0,1018310597.1711372724,2024. 3. 25. 오후 10:18:44,page_view,마이세컨플레이스,https://maily.so/mysecondplace,https://maily.so/mysecondplace,,,,,,0.0,,https://maily.so/mysecondplace
1,1019987196.1711350912,2024. 3. 25. 오후 4:15:50,page_view,경기도 양평 세컨하우스 매물,https://mysecondplace.co.kr/ya0006,https://mysecondplace.co.kr/ya0006?utm_source=...,youtube,video,market,https://youtu.be/G6f7NNpHg1s,,0.0,,https://mysecondplace.co.kr/ya0006
2,1040964215.1711369460,2024. 3. 25. 오후 9:24:20,page_view,세컨하우스 고민 해결,https://mysecondplace.co.kr/secondhouse_trouble,https://mysecondplace.co.kr/secondhouse_troubl...,instagram,troble,세컨하우스고민해결,02_저질러놓고,,4.0,,https://mysecondplace.co.kr/secondhouse_trouble
4,1048306435.1711367547,2024. 3. 25. 오후 8:52:27,page_view,"🏡 신규매물 | 바닷마을 두번째집, 강화도 매물",https://maily.so/mysecondplace/posts/7ce054f4,https://maily.so/mysecondplace/posts/7ce054f4,naver.com,referral,(referral),,,0.0,,https://maily.so/mysecondplace/posts/7ce054f4
5,106679103.1711349287,2024. 3. 25. 오후 3:48:07,page_view,세컨하우스 고민 해결,https://mysecondplace.co.kr/secondhouse_trouble,https://mysecondplace.co.kr/secondhouse_troubl...,instagram,troble,세컨하우스고민해결,02_저질러놓고,,12.0,,https://mysecondplace.co.kr/secondhouse_trouble
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3146,984668685.1711803717,2024. 3. 30. 오후 10:02:00,page_view,기타지역 세컨하우스 매물 리스트 및 상세 정보,https://mysecondplace.co.kr/market_etc,https://mysecondplace.co.kr/market_etc,,,,,,3.0,,https://mysecondplace.co.kr/market_etc
3147,984668685.1711803717,2024. 3. 30. 오후 10:02:03,page_view,충남 공주 세컨하우스 매물 리스트 및 상세 정보,https://mysecondplace.co.kr/market_gongju,https://mysecondplace.co.kr/market_gongju,,,,,,6.0,,https://mysecondplace.co.kr/market_gongju
3149,984668685.1711803717,2024. 3. 30. 오후 10:02:13,page_view,마이세컨플레이스 공동소유 세컨하우스,https://mysecondplace.co.kr/pb_secondhouse_ads,https://mysecondplace.co.kr/pb_secondhouse_ads,,,,,,8.0,,https://mysecondplace.co.kr/pb_secondhouse_ads
3150,984668685.1711803717,2024. 3. 30. 오후 10:02:22,page_view,마세플 스탠다드홈,https://mysecondplace.co.kr/msp_standard,https://mysecondplace.co.kr/msp_standard,,,,,,21.0,,https://mysecondplace.co.kr/msp_standard


In [192]:
# 검색하려는 링크 목록
links_to_find = [
    "yb0001", "yc0001", "yc0002", "ga0001", "ga0002", "ga0003", "ya0001", "ya0002", "ya0011", "ya0004",
    "ya0005", "gpa0001", "gpa0002", "gpa0003", "nhz0001", "gpa0004", "gpa0005", "ya0006", "ya0007", "ya0008",
    "gpa0006", "gpa0007", "gpa0008", "ga0004", "ga0005", "gb0001", "gb0002", "gb0003", "gpa0009", "gpa0010",
    "gpa0011", "gb0004", "gb0005", "gb0006", "ya0009", "ya0010", "yb0002", "yb0003", "yc0003", "ga0006",
    "yb0004", "ga0007", "ya0011", "ya0012", "ya0013", "ya0003", "gpa0012", "gpa0013", "gpa0014"
]

# 조회수도 포함하여 링크별 통계 데이터를 다시 생성
link_stats_with_counts = {}

for link in links_to_find:
    matched_rows = df_ga_1[df_ga_1['정규화링크'].str.contains(link, regex=False)]
    if not matched_rows.empty:
        count = len(matched_rows)
        max_stay = matched_rows["체류시간"].max()
        mode_stay = matched_rows["체류시간"].mode()[0] if not matched_rows["체류시간"].mode().empty else None
        mean_stay = matched_rows["체류시간"].mean()
        link_stats_with_counts[link] = {
            "count": count,
            "max_stay_time": max_stay,
            "mode_stay_time": mode_stay,
            "mean_stay_time": mean_stay
        }

# 데이터프레임으로 변환
link_stats_with_counts_df = pd.DataFrame.from_dict(link_stats_with_counts, orient='index').reset_index()
link_stats_with_counts_df.columns = ['링크', '조회수', '최대 체류시간(초)', '최빈 체류시간(초)', '평균 체류시간(초)']

# 체류시간과 조회수를 소수점 아래 없이 반올림
link_stats_with_counts_df['최대 체류시간(초)'] = link_stats_with_counts_df['최대 체류시간(초)'].round().astype(int)
link_stats_with_counts_df['최빈 체류시간(초)'] = link_stats_with_counts_df['최빈 체류시간(초)'].astype('str').str.split('.').str[0]  # 최빈값이 NaN일 경우 대비
link_stats_with_counts_df['평균 체류시간(초)'] = link_stats_with_counts_df['평균 체류시간(초)'].round().astype(int)

# 평균 체류시간(초)를 기준으로 내림차순 정렬
link_stats_with_counts_df_sorted = link_stats_with_counts_df.sort_values(by='조회수', ascending=False).reset_index(drop=True)

# 파일 이름 생성
file_name = f'HomepageAnalysis_Market_{start_date}to{end_date}.xlsx'
full_path = os.path.join(r'C:\Users\ch.kang\OneDrive - 클리 주식회사\문서 - 클리주식회사\004_Project-4\11_매물 마케팅 데이터', file_name)
link_stats_with_counts_df_sorted.to_excel(full_path)

In [193]:
# Ensure we're using the previously defined method to find all matching Excel files
total_excel_file = [f for f in os.listdir(upper_path) if f.startswith('003') and f.endswith('.xlsx')]

# Initialize an empty DataFrame in case no files are found or readable
df_total_excel_file = pd.DataFrame()

if total_excel_file:
    # Assuming we want to read the first Excel file found
    excel_file_path = os.path.join(upper_path, total_excel_file[0])
    try:
        # Attempt to read the Excel file into a DataFrame
        df_total_excel_file = pd.read_excel(excel_file_path, dtype=str)
    except Exception as e:
        error_message = f"Error reading the Excel file: {e}"
else:
    error_message = "No matching Excel files found."

# Display a preview of the DataFrame or the error message, depending on the outcome
df_total_excel_file.head() if not df_total_excel_file.empty else error_message


NameError: name 'upper_path' is not defined

In [None]:
data = df_total_excel_file

# Identifying duplicated 'new_id's
duplicated_new_ids = data[data.duplicated('new_id', keep=False)].sort_values(by='new_id')

# Showing the head of the duplicated records to give an example
num_duplicate_new_ids = len(duplicated_new_ids['new_id'].unique())
print("전체 유입수 중 재방문수: ",num_duplicate_new_ids )

duplicated_new_ids


전체 유입수 중 재방문수:  1617


Unnamed: 0,new_id,date_info,conversion_1,conversion_2,conversion_3,source,medium,campaign,content,keyword,...,95,96,97,98,99,100,101,102,103,104
64,g-000005,20240115,,,,google,organic,(organic),,(not provided),...,,,,,,,,,,
1538,g-000005,20240130,,,,,,,,,...,,,,,,,,,,
600,g-000010,20240119,,,,igad,class_3rd,before_w,Instagram_Feed,,...,,,,,,,,,,
59,g-000010,20240115,,,,igad,class_3rd,after_w,Instagram_Feed,,...,,,,,,,,,,
51,g-000023,20240115,,,,google,organic,(organic),,(not provided),...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20658,g-018731,20240325,,,,daum,organic,(organic),,세컨하우스,...,,,,,,,,,,
20852,g-018783,20240327,,,,naver,organic,(organic),,마이세컨플레이스,...,,,,,,,,,,
20710,g-018783,20240326,,,,naver,organic,(organic),,마이세컨플레이스,...,,,,,,,,,,
20851,g-018829,20240327,,,,naver,blog_msp,세컨하우스의_모든것,https://blog.naver.com/mysecondplace/223298427705,,...,,,,,,,,,,


# 날짜 필터

In [None]:
# 'date_info' 열을 datetime 객체로 변환
data['date_info'] = pd.to_datetime(data['date_info'], format='%Y%m%d')

# 지정된 날짜 범위 내 데이터 필터링
start_date = pd.Timestamp(2024, 3, 8)
end_date = pd.Timestamp(2024, 3, 14)
filtered_data = data[(data['date_info'] >= start_date) & (data['date_info'] <= end_date)]

# 유입 분석

## UTM vs Keyword

In [None]:
total_visits = len(filtered_data)

# 모든 열 중에서 하나라도 값이 있는 경우를 그룹 1로 정의
utm_count = len(filtered_data[(filtered_data[['source', 'medium', 'campaign', 'content']].notnull()).any(axis=1)])

# keyword 값이 있는 그룹 (그룹 2)
keyword_count = len(filtered_data[(filtered_data['keyword'].notnull()) & 
                                  (filtered_data['keyword'] != 'missing')])
top_keywords = filtered_data[filtered_data['keyword'].notnull() & (filtered_data['keyword'] != 'missing')].groupby('keyword')['new_id'].nunique().reset_index(name='unique_visitors').sort_values('unique_visitors', ascending=False).head(10)

# 그룹 1과 그룹 2를 제외한 나머지는 그룹 3으로 정의
etc_count = total_visits - utm_count - keyword_count

print("UTM 유입수:", utm_count)
print("검색 유입수:", keyword_count)
print("기타 유입수:", etc_count)
print("총 유입수:", total_visits)

UTM 유입수: 9163
검색 유입수: 112
기타 유입수: 20
총 유입수: 9295


## First vs Revisit

In [None]:
total_visits = len(filtered_data)
print("A = B+C | 설정 기간 내 총유입수: ", total_visits )

num_duplicate_visits = filtered_data.duplicated(subset='new_id', keep=False).sum()
num_first_visits = total_visits - num_duplicate_visits
num_duplicate_visitor = filtered_data.duplicated(subset='new_id', keep='first').sum()
print("B| 설정 기간 내 첫방문: ", num_first_visits )
print("C| 설정 기간 내 재방문(복수): ", num_duplicate_visits )
print("|설정 기간 내 재방문자(고유): ", num_duplicate_visitor )



A = B+C | 설정 기간 내 총유입수:  9295
B| 설정 기간 내 첫방문:  8571
C| 설정 기간 내 재방문(복수):  724
|설정 기간 내 재방문자(고유):  375


## TOP KEYWORD

In [None]:
top_keywords

Unnamed: 0,keyword,unique_visitors
0,(not provided),58
13,마이세컨플레이스,23
14,마이세컨하우스,3
11,마세플,2
22,유럽스타일로쎄컨하우스지으려면얼마드나,1
21,시골땅 구매시 경계문제,1
20,세컨하우스토지분양하는곳,1
19,세컨하우스유구,1
18,세컨하우스 블로그,1
17,세컨하우스 iot시스템,1


## UTM별 분석

### UTM -> 홈페이지 방문

In [None]:
# 널값을 'unknown'으로 대체하여 그룹화하기
filtered_data_filled = filtered_data.fillna('unknown')

# 그룹 1: Medium (캠페인 단위)
group_1 = filtered_data_filled.groupby('medium')
group_1_count = group_1['new_id'].count()
group_1_duplicate_visits = group_1.apply(lambda x: x.duplicated(subset='new_id', keep=False).sum())
group_1_revisit_rate = group_1_duplicate_visits / group_1_count

group_1_summary = pd.DataFrame({
    'total_visitors': group_1_count,
    'revisits': group_1_duplicate_visits,
    'revisit_rate': group_1_revisit_rate
}).sort_values(by='total_visitors', ascending=False)

print("그룹 1 (Medium 단위):", group_1_summary['total_visitors'].sum()) 
print(group_1_summary.head(5))

# 그룹 2: source + medium (캠페인 + 매체)
filtered_data_filled['source_medium'] = (filtered_data_filled['source'].fillna('unknown') + 
                                         ' | ' + 
                                         filtered_data_filled['medium'].fillna('unknown'))
group_2 = filtered_data_filled.groupby('source_medium')
group_2_count = group_2['new_id'].count()
group_2_duplicate_visits = group_2.apply(lambda x: x.duplicated(subset='new_id', keep=False).sum())
group_2_revisit_rate = group_2_duplicate_visits / group_2_count

group_2_summary = pd.DataFrame({
    'total_visitors': group_2_count,
    'revisits': group_2_duplicate_visits,
    'revisit_rate': group_2_revisit_rate
}).sort_values(by='total_visitors', ascending=False)

print("\n그룹 2 (Source + Medium 단위):", group_2_summary['total_visitors'].sum())
print(group_2_summary.head(5))

# 그룹 3: All (캠페인 + 매체 + 소재)
filtered_data_filled['all'] = (filtered_data_filled['source'].fillna('unknown') + 
                               ' | ' + 
                               filtered_data_filled['medium'].fillna('unknown') + 
                               ' | ' + 
                               filtered_data_filled['campaign'].fillna('unknown') + 
                               ' | ' + 
                               filtered_data_filled['content'].fillna('unknown'))
group_3 = filtered_data_filled.groupby('all')
group_3_count = group_3['new_id'].count()
group_3_duplicate_visits = group_3.apply(lambda x: x.duplicated(subset='new_id', keep=False).sum())
group_3_revisit_rate = group_3_duplicate_visits / group_3_count

group_3_summary = pd.DataFrame({
    'total_visitors': group_3_count,
    'revisits': group_3_duplicate_visits,
    'revisit_rate': group_3_revisit_rate
}).sort_values(by='total_visitors', ascending=False)

print("\n그룹 3 (All 단위):", group_3_summary['total_visitors'].sum())
print(group_3_summary.head(5))


그룹 1 (Medium 단위): 9295
                total_visitors  revisits  revisit_rate
medium                                                
03_tour_locial            5837       341      0.058420
03_tour                   3073        97      0.031565
unknown                    132         8      0.060606
organic                    112        11      0.098214
referral                    89         4      0.044944

그룹 2 (Source + Medium 단위): 9295
                       total_visitors  revisits  revisit_rate
source_medium                                                
igad | 03_tour_locial            5257       308      0.058589
igad | 03_tour                   2398        63      0.026272
fbad | 03_tour                    665        32      0.048120
fbad | 03_tour_locial             578        30      0.051903
unknown | unknown                 132         8      0.060606

그룹 3 (All 단위): 9295
                                                total_visitors  revisits  \
all                         

  group_1_duplicate_visits = group_1.apply(lambda x: x.duplicated(subset='new_id', keep=False).sum())
  group_2_duplicate_visits = group_2.apply(lambda x: x.duplicated(subset='new_id', keep=False).sum())
  group_3_duplicate_visits = group_3.apply(lambda x: x.duplicated(subset='new_id', keep=False).sum())


### UTM -> 랜딩페이지

In [None]:
landing_page_summary = filtered_data_filled.groupby(['source_medium', '1']).size().reset_index(name='landing_page_visitors')
landing_page_summary_sorted = landing_page_summary.sort_values(by='landing_page_visitors', ascending=False)

print(landing_page_summary_sorted.head(10))

             source_medium                                               1  \
33   igad | 03_tour_locial            https://mysecondplace.co.kr/nopenzip   
30          igad | 03_tour            https://mysecondplace.co.kr/nopenzip   
6           fbad | 03_tour            https://mysecondplace.co.kr/nopenzip   
9    fbad | 03_tour_locial            https://mysecondplace.co.kr/nopenzip   
32   igad | 03_tour_locial                 https://mysecondplace.co.kr/msp   
29          igad | 03_tour                 https://mysecondplace.co.kr/msp   
31          igad | 03_tour  https://mysecondplace.co.kr/pb_secondhouse_ads   
115      unknown | unknown  https://mysecondplace.co.kr/secondhouse_market   
34   igad | 03_tour_locial  https://mysecondplace.co.kr/pb_secondhouse_ads   
68         naver | organic                     https://mysecondplace.co.kr   

     landing_page_visitors  
33                    4980  
30                    2275  
6                      628  
9                      57

# 투어 신청 전환 분석

## 홈페이지 -> 투어신청페이지

In [None]:
tour_application_pattern = 'tour_application|17967442'
reached_tour_application = filtered_data.apply(lambda row: row.astype(str).str.contains(tour_application_pattern).any(), axis=1)
tour_application_users_count = reached_tour_application.sum()
print("투어신청페이지 방문수:", tour_application_users_count)

투어신청페이지 방문수: 72


In [None]:
tour_complete_pattern = 'tourcomplete'
completed_tour_application = filtered_data.apply(lambda row: row.astype(str).str.contains(tour_complete_pattern).any(), axis=1)
tour_complete_users_count = completed_tour_application.sum()
tour_complete_users = filtered_data[completed_tour_application]['new_id']

print("투어신청완료 방문수:", tour_complete_users_count)
tour_complete_users

투어신청완료 방문수: 0


Series([], Name: new_id, dtype: object)

In [None]:
tour_application_users_ids = filtered_data[reached_tour_application]['new_id'].unique().tolist()
tour_application_users_ids

['g-010067',
 'g-010085',
 'g-010141',
 'g-010487',
 'g-010330',
 'g-010339',
 'g-010456',
 'g-010216',
 'g-007378',
 'g-010742',
 'g-011426',
 'g-011430',
 'g-011941',
 'g-011485',
 'g-010882',
 'g-007902',
 'g-011009',
 'g-012893',
 'g-012977',
 'g-012627',
 'g-012606',
 'g-013416',
 'g-013590',
 'g-012537',
 'g-012190',
 'g-012167',
 'g-012379',
 'g-014456',
 'g-014041',
 'g-014641',
 'g-014597',
 'g-014589',
 'g-013903',
 'g-013959',
 'g-015670',
 'g-015668',
 'g-015654',
 'g-015640',
 'g-015330',
 'g-016011',
 'g-016044',
 'g-015935',
 'g-015865',
 'g-015018',
 'g-014295',
 'g-015189',
 'g-015147',
 'g-016585',
 'g-016595',
 'g-016952',
 'g-016957',
 'g-016840',
 'g-016844',
 'g-017077',
 'g-017121',
 'g-016988',
 'g-006879',
 'g-001905',
 'g-016479',
 'g-016420',
 'g-016437',
 'g-016270',
 'g-017975',
 'g-018096',
 'g-018039',
 'g-018055',
 'g-017210',
 'g-017231',
 'g-001940',
 'g-017381',
 'g-017404']

# Summary

In [None]:
filtered_data_filled.columns

Index(['new_id', 'date_info', 'conversion_1', 'conversion_2', 'conversion_3',
       'source', 'medium', 'campaign', 'content', 'keyword',
       ...
       '97', '98', '99', '100', '101', '102', '103', '104', 'source_medium',
       'all'],
      dtype='object', length=116)

In [None]:
# source_medium과 landing_page 기준으로 그룹화하여 방문 횟수를 계산합니다.
landing_page_visits = filtered_data_filled.groupby(['source_medium', '1']).size().reset_index(name='visits')
# 'landing_page' 열의 이름을 '1'에서 'landing_page'로 변경합니다.
landing_page_visits.rename(columns={'1': 'landing_page'}, inplace=True)

# source_medium 별 총 방문 횟수를 계산합니다.
total_visits_by_source_medium = filtered_data_filled.groupby('source_medium').size().reset_index(name='total_visits')

# 결과를 병합하여 source_medium 별 홈페이지 유입수와 랜딩페이지별 방문수를 가진 DataFrame을 생성합니다.
summary_df = pd.merge(landing_page_visits, total_visits_by_source_medium, on='source_medium')


# 병합한 데이터프레임을 'visits' 기준으로 내림차순 정렬합니다.
summary_df_sorted = summary_df.sort_values(by='visits', ascending=False)

# 열 순서를 조정합니다.
summary_df_sorted = summary_df_sorted.rename(columns={'total_visits': 'total_visits_by_source_medium', 'visits': 'landing_page_visits'})

# 투어 신청 페이지 및 완료 페이지에 도달한 사용자 식별
reached_tour_application_ids = filtered_data_filled[reached_tour_application]['new_id'].unique()
completed_tour_application_ids = filtered_data_filled[completed_tour_application]['new_id'].unique()

# source_medium 별로 투어 신청 및 완료 사용자 수 집계
filtered_data_filled['reached_tour_application'] = filtered_data_filled['new_id'].isin(reached_tour_application_ids)
filtered_data_filled['completed_tour_application'] = filtered_data_filled['new_id'].isin(completed_tour_application_ids)


tour_application_by_source_medium = filtered_data_filled.groupby('source_medium')['reached_tour_application'].sum().reset_index(name='tour_application_visits')
tour_complete_by_source_medium = filtered_data_filled.groupby('source_medium')['completed_tour_application'].sum().reset_index(name='tour_complete_visits')

# summary_df_sorted에 병합
summary_df_sorted = pd.merge(summary_df_sorted, tour_application_by_source_medium, on='source_medium', how='left')
summary_df_sorted = pd.merge(summary_df_sorted, tour_complete_by_source_medium, on='source_medium', how='left')
summary_df_sorted = summary_df_sorted[['source_medium','total_visits_by_source_medium', 'landing_page', 'landing_page_visits', 'tour_application_visits', 'tour_complete_visits']]
# 숫자 열을 정수로 변환합니다.
numeric_columns = summary_df_sorted.select_dtypes(include=['number']).columns
summary_df_sorted[numeric_columns] = summary_df_sorted[numeric_columns].fillna(0).astype(int)

# 모든 열을 문자열로 변환합니다.
summary_df_sorted = summary_df_sorted.astype(str)


# 결과 출력
summary_df_sorted.head(10)


Unnamed: 0,source_medium,total_visits_by_source_medium,landing_page,landing_page_visits,tour_application_visits,tour_complete_visits
0,igad | 03_tour_locial,5257,https://mysecondplace.co.kr/nopenzip,4980,22,0
1,igad | 03_tour,2398,https://mysecondplace.co.kr/nopenzip,2275,11,0
2,fbad | 03_tour,665,https://mysecondplace.co.kr/nopenzip,628,6,0
3,fbad | 03_tour_locial,578,https://mysecondplace.co.kr/nopenzip,570,2,0
4,igad | 03_tour_locial,5257,https://mysecondplace.co.kr/msp,239,22,0
5,igad | 03_tour,2398,https://mysecondplace.co.kr/msp,75,11,0
6,igad | 03_tour,2398,https://mysecondplace.co.kr/pb_secondhouse_ads,48,11,0
7,unknown | unknown,132,https://mysecondplace.co.kr/secondhouse_market,43,6,0
8,igad | 03_tour_locial,5257,https://mysecondplace.co.kr/pb_secondhouse_ads,38,22,0
9,naver | organic,50,https://mysecondplace.co.kr,29,7,0


In [None]:
import pandas as pd

# 투어 완료 사용자들의 new_id 리스트 생성
tour_complete_users_ids = filtered_data_filled[completed_tour_application]['new_id'].unique().tolist()
# 모든 unique new_id를 가진 DataFrame 생성
all_new_ids = pd.Series(filtered_data_filled['new_id'].unique(), name='new_id').to_frame()

# 투어 신청 페이지 도달 여부와 투어 완료 여부를 나타내는 열 추가
all_new_ids['reached_tour_application'] = all_new_ids['new_id'].isin(tour_application_users_ids)
all_new_ids['completed_tour_application'] = all_new_ids['new_id'].isin(tour_complete_users_ids)

all_new_ids_filtered = all_new_ids[
    (all_new_ids['completed_tour_application']) | 
    (all_new_ids['reached_tour_application'])
]

# Step 1: new_id를 기준으로 source_medium, date, time 정보를 포함하는 매핑 데이터프레임 생성
new_id_to_info = filtered_data_filled[['new_id', 'date_info', 'source_medium', ]].drop_duplicates()

# Step 2: all_new_ids_filtered 데이터프레임과 병합하여 source_medium, date, time 열 추가
all_new_ids_with_info = pd.merge(
    all_new_ids_filtered,
    new_id_to_info,
    on='new_id',
    how='left'
)
all_new_ids_with_complete_info = all_new_ids_with_info[['new_id', 'date_info', 'source_medium', 'reached_tour_application', 'completed_tour_application']]

# 결과 확인
all_new_ids_with_complete_info



Unnamed: 0,new_id,date_info,source_medium,reached_tour_application,completed_tour_application
0,g-010067,2024-03-08,naver | blog_msp,True,False
1,g-010085,2024-03-08,igad | 03_tour_locial,True,False
2,g-010085,2024-03-13,igad | 03_tour_locial,True,False
3,g-010141,2024-03-08,blog | blog_hanpro911,True,False
4,g-010487,2024-03-08,linktr.ee | referral,True,False
...,...,...,...,...,...
74,g-017210,2024-03-14,linktr.ee | referral,True,False
75,g-017231,2024-03-14,blog | blog_hanpro911,True,False
76,g-001940,2024-03-14,igad | 03_tour_locial,True,False
77,g-017381,2024-03-14,linktr.ee | referral,True,False


In [None]:
# 날짜 범위를 문자열로 변환하여 파일 이름에 사용

# 파일 이름 생성
file_name = f'HomepageAnalysis_PaidMarketing_Tour_{start_date}to{end_date}.xlsx'
full_path = os.path.join(r'C:\Users\ch.kang\OneDrive - 클리 주식회사\문서 - 클리주식회사\004_Project-4\09_세일즈 프로세스 개선', file_name)

# ExcelWriter를 사용하여 Excel 파일로 내보내기
with pd.ExcelWriter(full_path, engine='xlsxwriter') as writer:
    summary_df_sorted.to_excel(writer, sheet_name='Summary', index=False)
    all_new_ids_with_complete_info.to_excel(writer, sheet_name='new_id', index=False)

print(f"File saved to {full_path}")


File saved to C:\Users\ch.kang\OneDrive - 클리 주식회사\문서 - 클리주식회사\004_Project-4\09_세일즈 프로세스 개선\homepage_2024-03-08to2024-03-14.xlsx
