# Recommendation Using TF-IDF weighted Words Embedding

**STEP** <br>
1. Create TF-IDF 
2. Convert a tf-idf dictionary with word as key, idf as a value
3. Get TF-IDF features
4. Combine pretrained words embedding with TF-IDF
5. Calculate Cosine Similarity 
6. Recommend Law 

### Read ckiptagger & Dataframe

In [216]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
import datetime
import pickle
import jieba

path = "./ckip/data"
ws = WS(path)

df = pd.read_csv('../data/courses.csv').fillna('')
# Replace '@' with ' ' in original dataframe
#df.token = df.token.apply(lambda text: str(text).replace('@',' '))

  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)


### Import Words Dictionary

In [217]:
# dictionary
# dict_path = './dictionary'
# legal_name_file = dict_path + '/name_of_legal.txt'
# word_file = dict_path + '/oth_words.txt'
# split_rule_kw_file = dict_path + '/split_rule_words.txt'

# with open(legal_name_file, 'r', encoding='big5') as k1, open(word_file, 'r', encoding='big5') as k2:
#     k = k1.read().split('\n') + k2.read().split('\n')
#     word_to_weight = dict([(_, 1) for _ in k])
#word_dict = construct_dictionary(word_to_weight)

### Read Pretrained Words Embedding
詞向量訓練文本來源為中文維基百科，全部的訓練文本可於[此](https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2)下載最新版的中文維基百科。<br>
維基百科2014（總詞彙數：655K，400維詞向量，下載大小為2.5G）<br>
來源：[元智大學自然語言處理實驗室](http://nlp.innobic.yzu.edu.tw/demo/word-embedding.html)

In [218]:
# https://ithelp.ithome.com.tw/articles/10194633
embeddings = {}
f = open('wiki.zh.vector', encoding = 'utf8') 
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs
f.close()

In [219]:
course_df_original = pd.read_csv('../data/courses.csv').fillna('')
course_df = course_df_original.drop(['course_id', 'teacher_id', 'course_published_at_local', 'course_published_at_local', 'course_price'], axis=1)
course_df['description'] = course_df['description'].replace('([\<]).*?([\>])','',regex=True)
course_df = course_df.replace('\n', '',regex=True)
course_df = course_df.replace('[\d_]', '',regex=True).astype(str)
course_df.insert(0, 'course_id', course_df_original['course_id'])
course_df[:2]
info = []
for i in range(course_df.shape[0]):
    t = course_df.loc[i, :].values.flatten().tolist()
    info.append([t[0], ' '.join(t[:])])
info = pd.DataFrame(info, columns = ['course_id', 'text'])
info[:5]

Unnamed: 0,course_id,text
0,61888e868f154b000781b45a,61888e868f154b000781b45a 少女人妻華麗變身：七大妝容七彩的夢幻樂園 ...
1,54d5a117065a7e0e00725ac0,54d5a117065a7e0e00725ac0 幾何圖形分割 X 色塊組合 從學生時代開始...
2,54d5d9952246e60a009ec571,54d5d9952246e60a009ec571 數位拼貼的手感 自由工作者，致力於品牌視覺...
3,54d7148a2246e60a009ec588,54d7148a2246e60a009ec588 Line 的貼圖自己動手做！ 我是Dann...
4,5513e92b38239d10005778e1,5513e92b38239d10005778e1 為申請學校或工作寫好英文自傳 在北美長大，...


### Tf-idf for Tokenized Text in Dataframe

In [220]:
# TF-IDF Model
tfidf_ml = TfidfVectorizer(tokenizer=jieba.lcut)
tfidf_ml.fit(info.text)

# TF-IDF Dicitonary
dictionary = dict(zip(tfidf_ml.get_feature_names(), list(tfidf_ml.idf_)))

# feature name
tfidf_feature = tfidf_ml.get_feature_names()
'|'.join(tfidf_feature[5000:5050])



'oo|ooad|ooo|ooxx|opec|open|openapi|opencl|opencv|opendata|opener|openers|openframeworks|opengl|opening|openmindenjoylife|opensea|openstreetmap|openzeppelin|operability|operating|operation|operations|opi|opp|oprah|ops|optical|opticalflares|optimization|optimize|or|oracle|orbit|ord|order|oren|oreo|org|organise|organization|organize|oriented|origin|original|orm|os|osaka|oscar|oscow'

### Newly Entered Text Preprocess function
- Remove Punctuation
- Remove Spaces
- Sentence Segment
- turn into list

In [221]:
def Preprocess(text):
    rule = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5]')
    text = rule.sub(' ',str(text))
    text = re.sub(' +', '',text)
    #text = ws([text],sentence_segmentation=True, recommend_dictionary=word_dict)
    text = ws([text],sentence_segmentation=True)
    text = [x for l in text for x in l]
    return(text)

### Calculate TF-IDF Weighted Word Embedding

In [222]:
starttime = datetime.datetime.now()

# TF-IDF weighted Word2Vec
tfidf_text_vect = [] # tfidf-w2v is stored in this list
row = 0

for text in info.text.apply(lambda text: text.split()):
    text_vect = np.zeros(400)
    weight_sum = 0
    for word in text:
        if word in embeddings.keys() and word in tfidf_feature:
            vec = embeddings[word]
            tf_idf = dictionary[word]*(text.count(word)/len(text))
            text_vect += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        text_vect /= weight_sum
    tfidf_text_vect.append(text_vect)
    row += 1

# calculate running time
endtime = datetime.datetime.now()
print("建立模型時間: ",endtime - starttime)

建立模型時間:  0:00:06.522007


### Law Recommendation Function
輸入內文 --> 跑出推薦的前十個相近內文對應的法律

In [223]:
def recommend_law(text, tfidf_text_vect = tfidf_text_vect):
    text = Preprocess(text)
    #print(text)
    text_vect = np.zeros(400) # w2v size
    weight_sum = 0
    for word in text:
        if word in embeddings.keys() and word in tfidf_feature:
            vec = embeddings[word]
            tf_idf = dictionary[word]*(text.count(word)/len(text))
            text_vect += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        text_vect /= weight_sum
    tmp_vect = [*tfidf_text_vect,text_vect]
    new_cos_sim = cosine_similarity(tmp_vect, tmp_vect)
    sim_score = np.sort(new_cos_sim[new_cos_sim.shape[0]-1])[::-1][1:51]
    tmp_top_10_law = info[['text']].iloc[np.argsort(new_cos_sim[new_cos_sim.shape[0]-1])[::-1][1:51]]
    tmp_top_10_law['similarity_score'] = [round(score*100,1) for score in sim_score]
    return tmp_top_10_law

In [224]:
#tfidf_feature

### Try an Example
輸入內容便可以推薦出適合的法律<br>
(這邊列出的CE_Comment純粹是用來比對「輸入的內容」跟「原本內文」是否真的相近)

In [225]:
###### starttime = datetime.datetime.now()

newtext = '手工'
result = recommend_law(newtext)

# calculate running time
endtime = datetime.datetime.now()
print("搜尋推薦時間: ",endtime - starttime)
result


搜尋推薦時間:  0:00:06.880243


Unnamed: 0,text,similarity_score
484,5ebc0e34df21684ebcd7637b 韓式香氛蠟燭 - 證照級蠟燭技法 HUE ...,47.0
105,57dd2673d5766a0700c6be33 怪獸工場 - 基礎角色造型篇 跟創作成為戀...,34.8
573,600538ff0cf6c91168243a88 電商人妻 IG 增粉攻略！三大領域經營術 ...,30.5
593,601d03e07e6747ef11bba066 Blender 初學全攻略 - 純手作療癒...,27.8
648,60792df91c463c4c395baa11 輕鬆發聲！質感說話！ 堂課讓您說話不疲勞 ...,27.7
657,609ca403b7b8b8ff4da52037 流轉的光景｜王建傑老師的風景水彩畫 每天都...,27.3
29,5639fe25423bdd0a00103d55 臉部素描－輕鬆打好人物畫基礎 專職插畫創作...,27.2
700,6135374d94b8350007f7fe43 植物水彩繪 - 石斛蘭的觀察紀錄 「植物藝...,26.8
408,5cc0f183f22cad0020777339 ZBrush 從初階到高階 - 隱藏密技...,26.2
190,5a3bc9bf110d8c001e577e75 穿越影像的魔幻世界 - PS 超現實合成後...,25.9


In [226]:
users = pd.read_csv('../data/users.csv').fillna('')
#users

In [227]:
def generate_phrases(txt):
    lst = []
    for i in range(1, len(txt)+1):
        for j in range(0, len(txt) + 1 - i):
            lst.append(txt[j:i+j])
    return lst

In [228]:
keywords = []
for title, interest, recreation in zip(users['occupation_titles'], users['interests'], users['recreation_names']):
    keywords.extend(title.split(','))
    keywords.extend(interest.split(','))
    keywords.extend(recreation.split(','))
keywords = list(set(keywords))
key_dict = {}
for keyword in keywords:
    score_not_zero = []
    for key in keyword.split('_'):
        check = generate_phrases(key) 
        for k in check:
            if(recommend_law(k).iloc[0]['similarity_score'] > 0):
                score_not_zero.append(k)
    key_dict[keyword] = score_not_zero
key_dict
        
    

{'': [],
 '手寫字': ['手', '寫', '字', '手寫', '寫字'],
 '藝術_角色設計': ['藝',
  '術',
  '藝術',
  '角',
  '色',
  '設',
  '計',
  '角色',
  '色設',
  '設計',
  '角色設',
  '色設計',
  '角色設計'],
 '語言_翻譯': ['語', '言', '語言', '翻', '譯', '翻譯'],
 '投資理財_金融商品': ['投',
  '資',
  '理',
  '財',
  '投資',
  '理財',
  '投資理',
  '資理財',
  '投資理財',
  '金',
  '商',
  '品',
  '金融',
  '商品',
  '融商品',
  '金融商品'],
 '語言_韓文': ['語', '言', '語言', '韓', '文', '韓文'],
 '營建工程': ['營', '建', '工', '程', '營建', '工程', '營建工', '建工程', '營建工程'],
 '手作_刺繡': ['手', '作', '刺', '繡'],
 '程式_更多程式': ['程',
  '式',
  '程式',
  '更',
  '多',
  '程',
  '式',
  '多程',
  '程式',
  '更多程',
  '多程式',
  '更多程式'],
 '瑜珈': ['瑜', '瑜珈'],
 '非營利組織': ['非',
  '營',
  '利',
  '組',
  '織',
  '非營',
  '營利',
  '組織',
  '非營利',
  '利組織',
  '非營利組',
  '營利組織',
  '非營利組織'],
 '插畫': ['插', '畫', '插畫'],
 '行銷_數位行銷': ['行', '行銷', '數', '位', '行', '數位', '行銷', '數位行', '位行銷', '數位行銷'],
 '設計_介面設計': ['設',
  '計',
  '設計',
  '介',
  '面',
  '設',
  '計',
  '介面',
  '設計',
  '介面設',
  '面設計',
  '介面設計'],
 '程式_遊戲開發': ['程',
  '式',
  '程式',
  '遊',
  '戲',
  '開',
  '發',
  '