## FastCampus PJT - Team 3
##### 심성식 김민수 심승현 이희상 이용기

### 라이브러리 관련 및 환경 셋업

###### 필요한 모듈 설치

In [None]:
# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install scipy
# !pip install seaborn

###### 필요한 모듈 모두 불러오기

In [1]:
import os
import time
import numpy as np
import pandas as pd
from matplotlib import rc
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import re
import glob
import seaborn as sns
import datetime

###### 폰트 확인

In [2]:
# Use AppleGothic for all text and draw negative numbers with an ASCII hyphen
# instead of the Unicode minus sign.
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

try:
    # With the default fallback enabled, findfont never returns an empty
    # value: it silently hands back Matplotlib's default font, so a missing
    # AppleGothic would go unnoticed. Disabling the fallback makes it raise.
    font_path = fm.findfont('AppleGothic', fallback_to_default=False)
except ValueError:
    print('Warning: AppleGothic font not found')
    # Continue with whatever font Matplotlib falls back to so that font_prop
    # below can still be built.
    font_path = fm.findfont('AppleGothic')
else:
    print("AppleGothic font found at ", font_path)

# Explicit font properties for calls that take a font argument directly.
font_prop = fm.FontProperties(fname=font_path, size=12)


AppleGothic font found at  /System/Library/Fonts/Supplemental/AppleGothic.ttf


### 경로 설정

###### 폴더 생성 및 경로 variable 생성

In [3]:
# Creating folders and variables for data paths

dataset = "./dataset/"
GROUPED = dataset + "Grouped/"
FINAL_GROUP = dataset + "Grouped_Final/"

# makedirs(..., exist_ok=True) creates the folder (and any missing parent)
# and does nothing when it already exists, so no separate exists() check is
# needed and there is no window between checking and creating.
for _folder in (dataset, GROUPED, FINAL_GROUP):
    os.makedirs(_folder, exist_ok=True)

###### 파일 경로 불러올 딕셔너리 생성

In [4]:
# Adding up all file locations into a dictionary.

# Short key -> folder path; extend this mapping when new folders are added.
file_locations = {
    "dataset": dataset,
    "grouped": GROUPED,
    "group_final": FINAL_GROUP,
    # to be added further
}

# Short key -> list of CSV paths found directly inside that folder.
# NOTE: glob runs here, so each list is a snapshot of the folder contents at
# the moment this cell is executed.
all_file_locations = {
    key: glob.glob(value + "*.csv") for key, value in file_locations.items()
}

'''

type all_file_locations["key"] to call
keys are "dataset", "grouped", "group_final", etc.; to be added further.

'''

'\n\ntype all_file_locations["key"] to call\nkeys are "dataset", "grouped", "group_final", etc.; to be added further.\n\n'

#### 준비

###### Definition 설정

In [5]:
# Local definitions - 1

def clean_course_name(course_name):
    """Strip a leading "(YYMMDD~...)" date-range prefix from a course name.

    The prefix is removed only when the name starts with "(" and the text
    before the first "~" is exactly 7 characters long, e.g. "(220123".
    In that case everything after the first ")" is returned; every other
    value is returned unchanged.

    Typical usage:
        df["course_name"] = df["course_name"].apply(clean_course_name)

    Args:
        course_name: Raw course name. Non-string values (for example NaN
            from an empty cell) are passed through untouched.

    Returns:
        The course name without the date-range prefix, or the input itself
        when it carries no such prefix.
    """
    # Empty cells arrive as NaN (a float), which has no string methods.
    if not isinstance(course_name, str):
        return course_name
    if not course_name.startswith("("):
        return course_name

    # Only a 7-character chunk such as "(220123" before "~" counts as a date.
    before_tilde, tilde, _ = course_name.partition("~")
    if not tilde or len(before_tilde) != 7:
        return course_name

    # Cut at the first ")" only, so parentheses later in the title survive;
    # when the bracket is never closed, leave the name as it is.
    _, closing, remainder = course_name.partition(")")
    return remainder if closing else course_name

In [6]:
# Local definitions - 2

def course_packages(row):
    """Return the package (course group) name for one row.

    The package name is the part of ``row["course_name"]`` that precedes the
    first separator, with one leading bracketed tag removed.

    Typical usage:
        df["course_group"] = df.apply(course_packages, axis=1)

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a string
            "course_name" entry.

    Returns:
        "[무료강의]" or "Other" for names without a separator, "[kit]" for kit
        courses, otherwise the text before the first separator with a
        leading "((...))", "(...)" or "[...]" tag stripped.
    """
    # NOTE(review): inside a character class "|" is a literal, so this splits
    # on ":", "|" and "-" although the intent appears to be ":" and "-" only.
    # Kept unchanged so existing groupings do not shift - confirm.
    parts = re.split(r"[:|-]", row["course_name"])
    head = parts[0]

    # No separator: the name carries no package classifier.
    if len(parts) <= 1:
        # [무료강의] is the one exception that is kept as its own group.
        if head.startswith("[무료강의]"):
            return "[무료강의]"
        return "Other"

    # [kit] must be tested before the generic "[...]" tag removal below.
    if head.startswith("[kit]"):
        return "[kit]"

    # Longest opener first so "((" is not mistaken for a single "(".
    for opener, closer in (("((", "))"), ("(", ")"), ("[", "]")):
        if head.startswith(opener):
            # Cut at the first closer only; an unclosed tag leaves the name
            # untouched instead of raising IndexError.
            _, found, remainder = head.partition(closer)
            return remainder if found else head
    return head


###### 컬럼명 영문으로 변경 - Encoding 이슈 방지

In [7]:
# Changing column names from Korean to English

# Korean header in the source CSV -> English column name used from here on.
column_names_en = {
    "거래id": "transaction_id",
    "유형": "l_type",
    "고객id": "customer_id",
    "코스ID": "courseID",
    "사이트": "site",
    "포맷": "l_format",
    "카테고리": "l_categories",
    "코스(상품) 이름": "course_name",
    "거래일자": "transaction_date_time",
    "쿠폰이름": "coupon_name",
    "판매가격": "sold_price",
    "결제수단": "payment_method",
    "실거래금액": "actual_sold_price",
    "쿠폰할인액": "coupon_discount",
    "거래금액": "transaction_amount",
    "환불금액": "refund_amount",
}

# Load the raw export, rename the headers and save the English-header copy.
df = pd.read_csv('./eda-proj-fc-purchase.csv').rename(columns=column_names_en)
df.to_csv(dataset + "rowdata_eng.csv", index=False)

###### 날짜/시간 데이터 변환 및 분리

In [8]:
# Converting date_time to date and time.

rowdata = pd.read_csv(dataset + "rowdata_eng.csv")

# Split the raw timestamp on spaces: pieces 0-2 are glued together into the
# date, piece 3 is the AM/PM marker and piece 4 the clock time.
stamp_parts = rowdata["transaction_date_time"].str.split(" ", n=5, expand=True)
rowdata["Date"] = stamp_parts[0].str.cat([stamp_parts[1], stamp_parts[2]], sep="")
meridiem_time = stamp_parts[3].str.cat([stamp_parts[4]], sep=" ")

# Converting time to 24H format

# strptime's %p needs AM/PM, so the Korean markers are swapped first.
meridiem_time = meridiem_time.str.replace("오후", "PM").str.replace("오전", "AM")
rowdata["Time"] = meridiem_time.apply(
    lambda value: datetime.datetime.strptime(value, "%p %I:%M:%S").strftime("%H:%M:%S")
)
df = rowdata
# rowdata.to_csv(dataset + "date_time_converted.csv", index=False)

###### 불필요한 데이터 처리

In [9]:
# Deleting Rows

# Drop every row whose category is "크리에이티브" and report the row count
# before and after.
print("Number of rows before deletion:", len(df))
keep_mask = df["l_categories"].ne("크리에이티브")
df = df.loc[keep_mask]
print("Number of rows after deletion:", len(df))

# rowdata_eng.to_csv(dataset + "Bye_Creative.csv", index=False)

Number of rows before deletion: 159328
Number of rows after deletion: 159327


###### 숫자 데이터 컬럼 형 변환 (Obj -> Float)

In [10]:
# Convert dtype

# Cells that consist of a bare "-" become NaN in every column
# (presumably a placeholder for "no amount" - confirm against the raw data).
df = df.replace("-", np.nan)

# Amount columns that are cast to float.
numeric_columns = [
    "sold_price",
    "actual_sold_price",
    "coupon_discount",
    "transaction_amount",
    "refund_amount",
]
df = df.astype({column: float for column in numeric_columns})
type_converted = df

###### 코스 이름 컬럼 cleansing

In [12]:
# Removing date range in course_name

# Run clean_course_name over every course name and store the result back.
cleaned_names = type_converted["course_name"].apply(clean_course_name)
type_converted["course_name"] = cleaned_names
df = type_converted

# df.to_csv(dataset +"course_name_cleansed.csv", index = False)

###### 패키지 별 분류

In [21]:
# Grouping courses by Packages

# "The Red" had many variances, so course_name is normalised once more
# (spaces removed, lower-cased) before the package name is extracted.
normalized_names = df["course_name"].apply(lambda name: name.replace(" ", "").lower())
df["course_name"] = normalized_names

# Applying def course_packages row by row to label each purchase.
package_labels = df.apply(course_packages, axis=1)
df["course_group"] = package_labels

# One group per package; the per-package CSV export below is disabled, so
# this cell itself writes nothing into GROUPED.
grouped_df = df.groupby("course_group")
# for group_name, group_df in grouped_df:
#     groupname = group_name.replace("/", "_")
#     filename = f"{groupname}.csv"
#     group_df.to_csv(GROUPED + f"{filename}", index=False)

###### 패키지 별 분류된 CSV파일 생성

In [26]:
# Some packages only had one course each, needed to re-classify them as others

# Packages that contain a single course; they are folded into Others.csv.
single_course_frames = []
for group_name, group_df in grouped_df:
    # "Other" is the bucket being extended, so it is merged in afterwards.
    if group_name == "Other":
        continue

    if group_df["course_name"].nunique(dropna=False) != 1:
        # Packages with multiple courses go straight to the final CSV folder.
        filetitle = group_name.replace("/", "_")
        group_df.to_csv(FINAL_GROUP + f"{filetitle}.csv", index=False)
    elif group_name == "[무료강의]":
        # [무료강의] holds a single course but is kept as its own file.
        group_df.to_csv(FINAL_GROUP + "[무료강의].csv", index=False)
    else:
        single_course_frames.append(group_df)

# Concatenate once instead of growing a DataFrame inside the loop.
df_new = pd.concat(single_course_frames) if single_course_frames else pd.DataFrame()

# Take the "Other" rows from the in-memory groups: Grouped/Other.csv is only
# produced by the export that is commented out, so reading it from disk fails
# on a fresh run or picks up a stale file.
if "Other" in grouped_df.groups:
    df_other = grouped_df.get_group("Other")
else:
    df_other = pd.DataFrame()

non_empty_parts = [part for part in (df_new, df_other) if not part.empty]
df_all = pd.concat(non_empty_parts) if non_empty_parts else pd.DataFrame()
df_all.to_csv(FINAL_GROUP + "Others.csv", index=False)

## 이 밑으로는 사실 기억이 잘 안 납니다...

In [28]:
# NOTE(review): package_clarified.csv is not written by any cell visible in
# this notebook - confirm where it comes from.
total_pck = pd.read_csv(dataset + "package_clarified.csv", encoding="utf-8-sig")

# Per package: number of distinct customer ids and of distinct transaction ids.
by_package = total_pck.groupby("course_group")
pck_prchsd_usr = by_package["customer_id"].nunique().rename("pck_purchased_user")
total_prchsd_usr = by_package["transaction_id"].nunique().rename("total_purchased_user")

# Both counts side by side, indexed by course_group.
comparison_df = pck_prchsd_usr.to_frame().join(total_prchsd_usr)
print(comparison_df)

AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'

이 아래 바차트까진 그래도... 그러려니 봐줄만은 합니다..

In [None]:
fig, ax = plt.subplots()

# Re-scan the final folder: the list stored in all_file_locations was built
# before the package CSVs were written, so it can be empty or out of date.
all_file_locations["group_final"] = sorted(glob.glob(FINAL_GROUP + "*.csv"))

for file_loc in all_file_locations["group_final"]:
    # basename/splitext follow the platform's path rules and keep dots that
    # belong to the package name itself.
    filename = os.path.splitext(os.path.basename(file_loc))[0]
    # The catch-all bucket is left out of the chart.
    if filename == "Others":
        continue

    file = pd.read_csv(file_loc, encoding="utf-8-sig")
    # Distinct customers per course group within this package file.
    pck_prchsd_usr = file.groupby("course_group")["customer_id"].nunique().rename("pck_purchased_user")
    ax.bar(pck_prchsd_usr.index, pck_prchsd_usr.values, width=0.8, label=filename[:4])

In [None]:
# Add chart title and axis labels
ax.set(
    title='Number of Unique Customers per Course Group',
    xlabel='Course Group',
    ylabel='Number of Unique Customers',
)

# Small, faded, slanted tick labels on the current axes.
plt.xticks(alpha=0.5, fontsize=7, rotation=45)

# legend() builds the legend and remove() drops it again, so none is shown.
ax.legend().remove()

# Display the chart
plt.show()

되도록이면... 아래는 실행시키지 않으시는 게... 트릭아이를 보시게 될 수도 있습니다... 전 주의드렸습니다...ㅠㅠ

In [None]:
def my_autopct(pct):
    """Format a pie-slice percentage, hiding slices that are not above 5.

    Args:
        pct: Percentage value of the slice.

    Returns:
        ``pct`` rendered with three decimals when it is greater than 5,
        otherwise an empty string so no label is drawn.
    """
    if pct > 5:
        return '%.3f' % pct
    return ''

# Pull the package names and the two per-package counts out of comparison_df.
labels = comparison_df.index.values
compsizes = comparison_df["pck_purchased_user"].values
totalsizes = comparison_df["total_purchased_user"].values

# Element-wise ratio of the two counts; this is what the pie slices show.
percentages = compsizes / totalsizes

print(comparison_df)
# print(sum(comparison_df.values))
print(comparison_df.values.sum())
print(labels)

colors = ['blue', 'orange', 'green', 'red', 'skyblue', 'yellow']
fig, ax = plt.subplots()

# Start at 12 o'clock and go clockwise; my_autopct decides which slices get
# a numeric label.
ax.pie(
    percentages,
    labels=labels,
    colors=colors,
    autopct=my_autopct,
    startangle=90,
    counterclock=False,
)

# No tick marks and no visible legend on the pie.
ax.tick_params(axis='both', which='both', length=0)
ax.legend().set_visible(False)

plt.title('Proportion of Customers per Course Group')

# Equal axis scaling keeps the pie circular.
plt.axis("equal")
plt.show()