### Build Label
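Each project gets a `success` label: 1 for success, 0 for failure. A project is labeled as a failure when its `cover_text` contains 「集資失敗」 (funding failed).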

In [9]:
import pandas as pd
import os
data_root_path = './Data/Rawdata'
df = pd.read_csv(os.path.join(data_root_path, "ProjectList.csv"))

# Drop rows with any missing value. The explicit copy makes `d` an independent
# frame, so adding a column below never writes through to (or warns about) `df`.
d = df.dropna().copy()

# Label every project in one vectorized pass: 0 when the cover text carries the
# "集資失敗" (funding failed) marker, 1 otherwise. `regex=False` makes this a
# plain substring test, the same check as `"集資失敗" in text`.
is_failed = d['cover_text'].astype(str).str.contains("集資失敗", regex=False)
d['success'] = (~is_failed).astype('int64')

# Save
d.to_csv('ProjectList_labeled.csv', index=False)

Unnamed: 0,cover_text,url,id,success
0,【營養正義】3效頂級8合1賦活青春NMN| 只賺100圓左右合理利潤 | 全成分世界頂尖大廠...,https://www.zeczec.com/projects/NMN2,NMN2,1
1,【掛繩二刀流】手機 X 證件｜一繩同行、雙面感應，為行動派而生！\n910%\nNT$ 45...,https://www.zeczec.com/projects/ZENLET-The-Dual,ZENLET-The-Dual,1
2,【 盆宅 】水晶消磁盆 | 會聚氣的傢飾品，遠景無限盆滿缽滿 | 淨化界的文青公關\n305...,https://www.zeczec.com/projects/hhl-penzhai,hhl-penzhai,1
3,【 i wash 薄型洗碗機 】 專為台灣住宅環境設計！超薄型Ｘ大容量Ｘ免安裝\n329%\...,https://www.zeczec.com/projects/scion-36,scion-36,1
4,AMT CUBE 氣動手工實木音響 | 藍牙、USB、AUX、內建電池，全方位使用\n294...,https://www.zeczec.com/projects/amt-cube,amt-cube,1
...,...,...,...,...
5670,"『Organs without body 2014 S/S 服裝秀』\n5%\nNT$ 5,...",https://www.zeczec.com/projects/organs-without...,organs-without-body-2014-s-s,0
5671,"維基愛古蹟攝影賽 行腳大使團\nNT$ 2,700\nperson\n3\n人\n集資失敗",https://www.zeczec.com/projects/wlm-tw,wlm-tw,0
5672,夢想舞台-校園歌唱接力賽\n2%\nNT$ 700\nperson\n2\n人\n集資失敗,https://www.zeczec.com/projects/8bowchou,8bowchou,0
5673,"蕉朋友 －阿蕉插畫展\n5%\nNT$ 4,300\nperson\n3\n人\n集資失敗",https://www.zeczec.com/projects/bananalin,bananalin,0


### CkipTagger
Requirements:
- python>=3.6
- tensorflow>=1.13.1 / tensorflow-gpu>=1.13.1 (one of them)
- gdown (optional, for downloading model files from google drive)

In [None]:
# Optional one-time setup: uncomment to install the packages used below.
# !pip install gdown
# !pip install ckiptagger

### Setup

In [7]:
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER

download_path = './ckip_data'
os.makedirs(download_path, exist_ok=True)

# download model and resources
# The archive is large, so only fetch it when the extracted `data` directory
# (the one the tokenizer is loaded from) is not already present. Delete that
# directory to force a fresh download.
if not os.path.isdir(os.path.join(download_path, 'data')):
    data_utils.download_data_gdown(download_path) # gdrive-ckip

In [None]:
import re

# Initialize CKIP tokenizer
ws = WS('./ckip_data/data')

# read file
file_name = 'ProjectData_5000-5674.csv'
data = pd.read_csv(file_name)
# Rows with a missing value in any column are discarded before tokenizing.
data = data.dropna()

fields_to_tokenize = ['title', 'desc', 'text', 'color_text', 'bold_text', 'hyper_text']

# remove special token, number and English
# Everything outside U+4E00..U+9FA5 is stripped, which also removes whitespace
# and punctuation, so each field becomes one unbroken run of Chinese characters.
pattern = r'[^\u4e00-\u9fa5]'
non_chinese = re.compile(pattern)

# tomlinNTUB-chinese-stopwords
stopwords_file = 'stopwords.txt'
with open(stopwords_file, 'r', encoding='utf-8') as file:
    chinese_stopwords = [line.strip() for line in file]
# Set for O(1) membership tests; the list above is kept under its original name.
stopword_set = set(chinese_stopwords)

for field in fields_to_tokenize:
    # remove special token, number and English
    cleaned_texts = [non_chinese.sub('', str(text)) for text in data[field]]

    # CKIP tokenize: one batched call per column instead of one call per cell.
    # The tagger takes a list of sentences and returns one token list per sentence.
    tokenized = ws(cleaned_texts) if cleaned_texts else []

    # remove stopwords, then store the tokens space-separated
    data[field] = [
        ' '.join(word for word in words if word not in stopword_set)
        for words in tokenized
    ]

# Save
data.to_csv(f'Tokenized{file_name}', index=False)

### Labeling

In [None]:
f = 'TokenizedProjectData_5000-5674.csv'
df_A = pd.read_csv('ProjectList_labeled.csv')
df_B = pd.read_csv(f)

# This cell overwrites its own input file. Drop a `success` column left by a
# previous run so re-running does not produce `success_x` / `success_y`.
df_B = df_B.drop(columns=['success'], errors='ignore')

# One label per project id, so the left join cannot multiply rows of df_B.
# If an id appears more than once in the label file, its first label is used.
labels = df_A[['id', 'success']].drop_duplicates(subset='id')

# Left join keeps every tokenized row; ids without a label get NaN in `success`.
df_merged = pd.merge(df_B, labels, on='id', how='left')

df_merged.to_csv(f, index=False)

### Merge

In [None]:
# Tokenized (and labeled) chunk files, in project order.
chunk_files = [
    "TokenizedProjectData_0-999.csv",
    "TokenizedProjectData_1000-1999.csv",
    "TokenizedProjectData_2000-2999.csv",
    "TokenizedProjectData_3000-3999.csv",
    "TokenizedProjectData_4000-4999.csv",
    "TokenizedProjectData_5000-5674.csv",
]

# Load every chunk; d1..d6 stay bound to the individual frames.
d1, d2, d3, d4, d5, d6 = [pd.read_csv(path) for path in chunk_files]

# Stack the chunks into one frame with a fresh 0..n-1 index.
merged_df = pd.concat([d1, d2, d3, d4, d5, d6], ignore_index=True)

# Write the combined dataset without the index column.
merged_df.to_csv('TokenizedProjectData_all.csv', index=False)