## Setup

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("TokenizedProjectData_all.csv")
df = df.replace(np.nan, " ")
df.columns

Index(['id', 'title', 'category', 'duration', 'desc', 'text', 'color_text',
       'bold_text', 'hyper_text', 'img_list', 'success_x', 'img_text',
       'success_y'],
      dtype='object')

In [3]:
df.shape

(5612, 13)

In [4]:
df['merged_content_text'] = df['title'] + ' ' + df['desc'] + ' ' + df['text'] +' '+df['img_text']
df.columns

Index(['id', 'title', 'category', 'duration', 'desc', 'text', 'color_text',
       'bold_text', 'hyper_text', 'img_list', 'success_x', 'img_text',
       'success_y', 'merged_content_text'],
      dtype='object')

In [7]:
df = df[['id', 'merged_content_text', 'success_x']]
df = df.dropna(subset=['merged_content_text'])
df.shape

(5612, 3)

### Tokenized

In [261]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [202]:
# initialize
tfidf_vectorizer = TfidfVectorizer(max_features=8000)

# TF-IDF
tfidf_vectors = tfidf_vectorizer.fit_transform(df['merged_content_text'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names())

df_tfidf = pd.concat([tfidf_vectors_df, df['success_x']], axis=1)



In [203]:
print(tfidf_vectors_df.shape)

(5612, 8000)


In [205]:
df_tfidf = df_tfidf.dropna()
df_tfidf.shape

(5612, 8001)

## Feature selection
> *Selecting feature with tfidf > threshold*

In [206]:
# sort tfidf
tfidf_sum_list = tfidf_vectors_df.sum().to_list()
tfidf_sum_list.sort()
threshold = tfidf_sum_list[5000]
print(threshold)

5.2984289748366376


In [207]:
# keep data above threshold
keep_col = []
for col in tfidf_vectors_df.columns:
    if tfidf_vectors_df[col].sum() > threshold:
        keep_col.append(col)

In [208]:
len(keep_col)

2999

In [209]:
tfidf_select_thres = tfidf_vectors_df[keep_col]
tfidf_select_thres.shape

(5612, 2999)

### PCA

In [210]:
from sklearn.decomposition import PCA,KernelPCA
from sklearn.manifold import MDS

In [211]:
pca = PCA(n_components=512, copy=True)
newX = pca.fit_transform(tfidf_select_thres)

In [212]:
pca_col = []
for i in range(512):
    pca_col.append("PC_"+str(i+1))
temp = pd.DataFrame(newX, columns=pca_col)

In [213]:
print(temp.shape)
print(df['success_x'].shape)

(5612, 512)
(5612,)


In [214]:
pca_512 = pd.concat([df['id'],temp, df['success_x']], axis=1)

In [215]:
pca_512 = pca_512.dropna()

In [216]:
pca_512.shape

(5612, 514)

In [217]:
pd.DataFrame(pca_512).to_csv('pca_512_0.csv', header=True, index=False)

### Kernel PCA

In [218]:
k_pca = KernelPCA(n_components=512, kernel='cosine')
k_pca_x = k_pca.fit_transform(tfidf_select_thres)

In [219]:
kpca_col = []
for i in range(512):
    kpca_col.append("KernelPCA_"+str(i+1))
kpca_vectors = pd.DataFrame(k_pca_x, columns=kpca_col)

In [220]:
kpca_512 = pd.concat([df['id'], kpca_vectors, df['success_x']], axis=1)
kpca_512 = kpca_512.dropna()

In [221]:
kpca_512.shape

(5612, 514)

In [222]:
pd.DataFrame(kpca_512).to_csv('kpca_512_0.csv', header=True, index=False)

### MDS

In [30]:
metric_mds = MDS(n_components=512, metric=True)

In [73]:
mds_x = metric_mds.fit_transform(tfidf_select_thres)

In [74]:
mds_col = []
for i in range(512):
    mds_col.append("MDS_"+str(i+1))
mds_vectors = pd.DataFrame(mds_x, columns=mds_col)

In [75]:
mds_512 = pd.concat([df['id'],mds_vectors, df['success_x']], axis=1)

In [76]:
pd.DataFrame(mds_512).to_csv('mds_512_np.csv', header=True, index=False)

### CountVectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
cv = CountVectorizer(max_features=4000)
cv_x = cv.fit_transform(df['merged_content_text'])
cv_xy = pd.DataFrame(cv_x.toarray(), columns=cv.get_feature_names())



In [10]:
cv_xy.shape

(5612, 4000)

In [11]:
cv_4000 = pd.concat([df['id'],cv_xy, df['success_x']], axis=1)

In [227]:
pd.DataFrame(cv_4000).to_csv('cv_4000_0.csv', header=True, index=False)

In [12]:
cv_4000.columns

Index(['id', '一一', '一下', '一併', '一半', '一同', '一定', '一手', '一月', '一生',
       ...
       '默默', '點心', '點擊', '點綴', '點選', '黴菌', '鼓勵', '齊全', '龐大', 'success_x'],
      dtype='object', length=4002)

### Chi square

In [229]:
# Chi-Square
from sklearn.feature_selection import SelectFdr
from sklearn.feature_selection import chi2

In [230]:
cv_2 = CountVectorizer()
cv_x_2 = cv_2.fit_transform(df['merged_content_text'])
cv_xy_2 = pd.DataFrame(cv_x_2.toarray(), columns=cv_2.get_feature_names())

In [231]:
feature_selector = SelectFdr(chi2, alpha=0.05)
feature_selector.fit(cv_xy_2, df['success_x'])

SelectFdr(score_func=<function chi2 at 0x000001CD1D36F820>)

In [232]:
chi2_X = feature_selector.transform(cv_xy_2)

In [233]:
chi2_X.shape

(5612, 4314)

In [234]:
chi2_X_df = pd.DataFrame(chi2_X)
chi_6238 = pd.concat([df['id'],chi2_X_df, df['success_x']], axis=1)

In [235]:
chi_6238.shape

(5612, 4316)

In [236]:
pd.DataFrame(chi_6238).to_csv('chi_0.csv', header=True, index=False)

### Tfidf 4000

In [237]:
vectorizer = TfidfVectorizer(max_features=4000)

tfidf_x = vectorizer.fit_transform(df['merged_content_text'])

In [238]:
tfidf_4000 = pd.DataFrame(tfidf_x.toarray(), columns=vectorizer.get_feature_names())

tfidf_4000 = pd.concat([df['id'],tfidf_4000, df['success_x']], axis=1)



In [239]:
pd.DataFrame(tfidf_4000).to_csv('tfidf_4000_1.csv', header=True, index=False)

### Log likelihood

In [279]:
import numpy as np

In [280]:
cv = CountVectorizer(max_features=8000)
x = cv.fit_transform(df['merged_content_text'])
x_df = pd.DataFrame(x.toarray(), columns=cv.get_feature_names())



In [281]:
log_8000 = pd.concat([df['id'],x_df, df['success_x']], axis=1)

In [282]:
log_success = log_8000[log_8000['success_x']==1]

In [283]:
log_f = log_8000[log_8000['success_x']==0]

In [284]:
log_8000.columns

Index(['id', '一一', '一下', '一併', '一個個', '一共', '一再', '一半', '一口', '一同',
       ...
       '黴菌', '鼓勵', '鼓舞', '鼻子', '齊全', '齒輪', '龍眼', '龍頭', '龐大', 'success_x'],
      dtype='object', length=8002)

In [285]:
log_8000['一一'].sum()

289

In [286]:
def likelihood(column):
    n11 = log_success[column].sum() +1
    n01 = log_f[column].sum() +1
    n10 = log_success.shape[0] - n11 +1
    n00 = log_f.shape[0] - n01 +1

    pt = (n11+n01) / (log_success.shape[0]+log_f.shape[0]+4)
    p1 = n11/(n11+n10)
    p2 = n01/(n01+n00)
    
    score = -2*np.log((pt**n11 * (1-pt)**n10 * pt**n01 * (1-pt)**n00)/(p1**n11 * (1-p1)**n10) * p2**n01 * (1-p2)**n00)
    return score

In [287]:
scores = {}
for column in log_8000.columns:
    if column == 'id'or column == 'success_x':
        continue
    else:
        scores[column] = likelihood(column)

  score = -2*np.log((pt**n11 * (1-pt)**n10 * pt**n01 * (1-pt)**n00)/(p1**n11 * (1-p1)**n10) * p2**n01 * (1-p2)**n00)
  score = -2*np.log((pt**n11 * (1-pt)**n10 * pt**n01 * (1-pt)**n00)/(p1**n11 * (1-p1)**n10) * p2**n01 * (1-p2)**n00)
  score = -2*np.log((pt**n11 * (1-pt)**n10 * pt**n01 * (1-pt)**n00)/(p1**n11 * (1-p1)**n10) * p2**n01 * (1-p2)**n00)
  score = -2*np.log((pt**n11 * (1-pt)**n10 * pt**n01 * (1-pt)**n00)/(p1**n11 * (1-p1)**n10) * p2**n01 * (1-p2)**n00)


In [288]:
sort = sorted(scores.items(), key=lambda x: x[1], reverse=True)
wanted_col = []
count = 0
for combine in sort:
    wanted_col.append(combine[0])
    count += 1
    if count >= 4000:
        break

In [289]:
len(wanted_col)

4000

In [290]:
log_4000 = pd.concat([df['id'], log_8000[wanted_col], df['success_x']], axis=1)

In [252]:
log_4000.shape

(5612, 4002)

In [253]:
pd.DataFrame(log_4000).to_csv('log_4000_0.csv', header=True, index=False)

In [254]:
log_4000.columns[50:80:]

Index(['一家人', '一度', '上山下海', '一貫', '一瞬間', '一日', '一目瞭然', '一再', '一口', '一但', '一萬',
       '上山', '一隅', '一張張', '一口氣', '三創', '上市', '上手', '上方', '上架', '上海', '上班',
       '上班族', '上線', '上色', '上蓋', '上百', '上演', '上台', '上網'],
      dtype='object')

## Model Evaluation

### Logistic regression

In [13]:
train = cv_4000.sample(4500)
test = cv_4000.sample(1000)

X_train = train.iloc[:, 1:-2]
y_train = train.iloc[:, -1]
X_test = test.iloc[:,1:-2]
y_test = test.iloc[:, -1]

In [14]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_predictions = lr.predict(X_test)
print ("train ACC:",(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0])

train ACC: 0.932


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [257]:
len(lr.coef_[0])

3999

## Analysis
> *Frequent term in success/fail scenarios*

In [30]:
suc = cv_4000[cv_4000['success_x']==1]
fal = cv_4000[cv_4000['success_x']==0]

In [31]:
print(suc.shape)
print(fal.shape)

(4286, 4002)
(1326, 4002)


In [32]:
suc.columns

Index(['id', '一一', '一下', '一併', '一半', '一同', '一定', '一手', '一月', '一生',
       ...
       '默默', '點心', '點擊', '點綴', '點選', '黴菌', '鼓勵', '齊全', '龐大', 'success_x'],
      dtype='object', length=4002)

In [33]:
suc = suc.drop(['id', 'success_x'], axis=1)
fal = fal.drop(['id', 'success_x'], axis=1)

In [39]:
sd = {}
fd = {}

In [40]:
for col in suc.columns:
    sd[col]=suc[col].sum()
    fd[col]=fal[col].sum()

In [41]:
sd_l = sorted(sd.items(), key=lambda x: x[1], reverse=True)
fd_l = sorted(fd.items(), key=lambda x: x[1], reverse=True)

### Different Set from Top 500 words

In [50]:
s_set = set()
f_set = set()
for i in range(500):
    s_set.add(sd_l[i][0])
    f_set.add(fd_l[i][0])

In [51]:
s_set-f_set

{'不易',
 '今年',
 '保護法',
 '加購',
 '升級',
 '原價',
 '原廠',
 '原料',
 '口袋',
 '可愛',
 '售價',
 '回收',
 '地球',
 '壓力',
 '媒體',
 '守護',
 '容量',
 '市面',
 '彈性',
 '情緒',
 '想像',
 '感覺',
 '戶外',
 '探索',
 '推薦',
 '支援',
 '文件',
 '料理',
 '日曆',
 '機能',
 '檢測',
 '正常',
 '正式',
 '永續',
 '添加',
 '清洗',
 '物流',
 '玻璃',
 '療癒',
 '皮革',
 '睡眠',
 '穩定',
 '精油',
 '耳機',
 '股份',
 '螢幕',
 '裝置',
 '角度',
 '訂單',
 '記憶',
 '試用期',
 '變化',
 '貓咪',
 '透氣',
 '過去',
 '達標',
 '隨身',
 '高度'}

In [52]:
f_set-s_set

{'不能',
 '之間',
 '人員',
 '企業',
 '元素',
 '公益',
 '出版',
 '出現',
 '包包',
 '取得',
 '固定',
 '土地',
 '地區',
 '地點',
 '存在',
 '學校',
 '家人',
 '寄送',
 '寵物',
 '展覽',
 '工作室',
 '廠商',
 '後續',
 '得到',
 '思考',
 '想法',
 '應用',
 '接觸',
 '控制',
 '推廣',
 '提醒',
 '插畫',
 '改善',
 '教學',
 '數位',
 '數量',
 '毛孩',
 '溫暖',
 '為主',
 '畢業',
 '病毒',
 '知識',
 '管理',
 '經歷',
 '網站',
 '網路',
 '線上',
 '繼續',
 '自我',
 '舉辦',
 '表演',
 '設備',
 '議題',
 '資金',
 '退換貨',
 '遇到',
 '領域',
 '願意'}