[Python programming: using gensim to calculate Chinese characters for similarity](https://www.programmersought.com/article/5674656908/)

In [87]:
pip install gensim




In [88]:
# Project root on the mounted Google Drive; data files are addressed relative to it.
path =  "/content/drive/Shareddrives/AI_project/"

In [127]:
# -*- coding: utf-8 -*-

import logging

import jieba
from gensim import corpora, models, similarities

logging.basicConfig(level=logging.DEBUG)
jieba.setLogLevel(logging.INFO)


class DocumentSimilar(object):
    """
    Similarity index over a list of documents.

    Every document is segmented with jieba, stop words are dropped, and the
    resulting bag-of-words corpus is transformed with a gensim ``LsiModel``.
    ``get_similar`` scores a new document against every indexed document.
    """

    # Stop-word list, one word per line.
    STOP_WORDS_PATH = "/content/drive/Shareddrives/AI_project/stop_words_chinese.txt"
    # Lazily filled cache so the file is read once instead of once per document.
    _stop_words = None

    def __init__(self, documents, num_topics=200):
        """
        Build the index immediately.

        :param documents: sequence of raw document strings to index
        :param num_topics: number of LSI topics requested from the model
        """
        self.documents = documents
        self.num_topics = num_topics
        self.dictionary = None
        # Historical attribute name: this holds the LSI model, not a TF-IDF model.
        self.tfidf = None
        self.similar_matrix = None
        self.calculate_similar_matrix()

    @classmethod
    def _get_stop_words(cls):
        """
        Return the stop words as a frozenset, reading the file on first use only.
        """
        if cls._stop_words is None:
            # Explicit encoding: the list contains Chinese text and must not
            # depend on the platform's default codec.
            with open(cls.STOP_WORDS_PATH, "r", encoding="utf-8") as text_file:
                cls._stop_words = frozenset(text_file.read().split("\n"))
        return cls._stop_words

    @staticmethod
    def split_word(document):
        """
        Word segmentation, remove stop words

        :param document: raw document string
        :return: list of tokens produced by jieba that are not stop words
        """
        stop_words = DocumentSimilar._get_stop_words()
        return [word for word in jieba.cut(document) if word not in stop_words]

    def calculate_similar_matrix(self):
        """
        Calculate the similarity matrix and some necessary data
        """
        words = [self.split_word(document) for document in self.documents]

        self.dictionary = corpora.Dictionary(words)
        corpus = [self.dictionary.doc2bow(word) for word in words]
        # Creating a transformation
        self.tfidf = models.LsiModel(corpus, id2word=self.dictionary, num_topics=self.num_topics)
        self.tfidf.print_topics(self.num_topics)
        # apply a transformation to a whole corpus:
        corpus_tfidf = self.tfidf[corpus]
        # Passing num_features spares MatrixSimilarity an extra pass over the
        # transformed corpus just to discover the vector width.
        self.similar_matrix = similarities.MatrixSimilarity(corpus_tfidf, num_features=self.num_topics)

    def get_similar(self, document):
        """
        Calculate the similarity between the document to be compared and each document in the corpus

        :param document: raw document string to compare
        :return: similarity scores, one per indexed document, in index order
        """
        words = self.split_word(document)
        # doc2bow returns a sparse list of (token id, token count) tuples.
        corpus = self.dictionary.doc2bow(words)
        # Project the query into the same LSI space as the indexed corpus.
        corpus_tfidf = self.tfidf[corpus]
        return self.similar_matrix[corpus_tfidf]



In [90]:
# Mount Google Drive at /content/drive; the project path used by this notebook lives under it.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [91]:
pip install opencc-python-reimplemented



In [92]:
from opencc import OpenCC
# 't2s' selects OpenCC's Traditional-to-Simplified Chinese conversion.
cc = OpenCC('t2s')

In [93]:
import pandas as pd
import numpy as np
import csv
import os
import jieba


In [94]:

# Load the news dataset from the project folder; the first CSV column becomes the row index.
train = pd.read_csv(path + 'data/all_sim.csv', index_col=0)
# train = pd.read_csv(path + 'data/export.csv', index_col=0)
train.head(5)

Unnamed: 0_level_0,title,media,content_sim
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,快讯／东石确诊超商店员「感染源找呒」279人筛检全阴性,setn,记者洪正达／嘉义报导嘉义县东石乡15日出现一起确诊个案，该案为40岁超商女店员，但尚未找到感...
1.0,郭采洁又崩坏？「括号眉」造型曝光 网：阴间妆容,ltn,郭采洁出道多年，早期多以甜美形象示人。（报社资料照）〔即时新闻／综合报导〕女星郭采洁出道多年...
2.0,反击乔丹《最后一舞》？皮朋推出回忆录　揭「不曾公开的故事」,ettoday,记者游郁香／综合报导乔丹(Michael Jordan)纪录片《最后一舞》去年取得巨大成功，...
3.0,电视台摄影师猝逝后才发现确诊新冠肺炎！壹电视宣布：已安排全体员工紧急快筛,storm,台北市内湖区壹电视大楼于6月7日早上发现一名摄影师猝死厕所，8日死者PCR检测结果出炉，证实...
4.0,美参院通过FTC主席任命 反托拉斯大将掌旗,cna,（中央社华盛顿15日综合外电报导）反托拉斯大将、哥伦比亚大学法学院教授丽娜汗（Lina Kh...


In [95]:
# Tokenize the titles
# train['cut_title'] = train.title.apply(lambda sen:jieba.lcut(sen))
# train.head(3)

In [96]:
train.shape

(57539, 3)

In [97]:
# Drop rows whose title is not a string
# for my_index, row in train.iterrows():
#     # if(type(train['title'][my_index])!=type("hello")):
#     #     print(train['title'][my_index])
#         # train.drop(my_index, inplace=True)
# train.shape
# train.head(100)

In [98]:
# Convert content to Simplified Chinese
# train['content_sim'] = train.content.apply(lambda sen:cc.convert(sen))
# train.head(5)

In [99]:
# Tokenize the content
# train['cut_content'] = train.content_sim.apply(lambda sen:jieba.lcut(sen))
# train.head(3)

In [100]:
# Keep only the title and media columns.
# NOTE: train2 is rebound to a title-only frame in a later cell.
train2 = train[['title', 'media']]
# train2 = train[['title', 'media', 'content_sim']]
# train2.to_csv(path + "/data/all_sim_limit.csv", index = True, header = True)

In [101]:
# train2.shape
# train2.head

In [102]:
from gensim.models import Word2Vec

In [103]:
# myWord2Vec = Word2Vec(train2.cut_title, size=250, iter = 10, sg=1)

In [104]:
# print(myWord2Vec)

In [105]:
# myWord2Vec.wv.vocab

In [106]:
# myWord2Vec.wv.get_vector('快讯')

In [107]:
# myWord2Vec.wv.similar_by_word('赖清德')

In [108]:
# Narrow the dataset to the title column and flatten it into a plain Python list.
cols = ['title']
train2 = train.loc[:, cols]
title_list = train2['title'].tolist()
print(len(title_list))

57539


In [109]:
# dictionary = corpora.Dictionary(texts)
# corpus = [dictionary.doc2bow(text) for text in texts]

In [110]:
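# # 'index' is only the file name used when the index is saved locally, so it can be named freely; the saved file is index.0, which is not a text file and cannot be viewed directly
# index = similarities.Similarity('index', lsi[corpus], num_features=lsi.num_topics)
# for i in enumerate(index):
#     print(i)   # print the similarities against the whole corpus
# # Alternatively, print the article ids grouped directly
# # percentage is the similarity threshold; e.g. set it to 0.9 to group articles that are at least 90% similar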
# for l, degrees in enumerate(index):
#     print(contents[l][0], [contents[i][0] for i, similarity in enumerate(degrees) if similarity >= percentage])

In [128]:
# Build the first index from title_list positions 0-24999.
doc_similar = DocumentSimilar(title_list[:25000])

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(25918 unique tokens: ['279', '「', '」', '东', '呒']...)
INFO:gensim.corpora.dictionary:adding document #20000 to Dictionary(37950 unique tokens: ['279', '「', '」', '东', '呒']...)
INFO:gensim.corpora.dictionary:built Dictionary(42511 unique tokens: ['279', '「', '」', '东', '呒']...) from 25000 documents (total 341163 corpus positions)
INFO:gensim.models.lsimodel:using serial LSI version on this node
INFO:gensim.models.lsimodel:updating model with new documents
INFO:gensim.models.lsimodel:preparing a new chunk of documents
DEBUG:gensim.models.lsimodel:converting corpus to csc format
INFO:gensim.models.lsimodel:using 100 extra samples and 2 power iterations
INFO:gensim.models.lsimodel:1st phase: constructing (42511, 300) action matrix
INFO:gensim.models.lsimodel:orthonormalizing (42511, 300) action matrix
DEBUG:gensim.matutils:computing QR of (42

In [129]:
# Build the second index from title_list positions 25001 onwards.
# NOTE(review): together with the [:25000] slice used for doc_similar, this leaves
# title_list[25000] out of both indexes. If this slice is changed, every cell that
# pairs similarity scores with titles must stay aligned with it.
doc_similar2 = DocumentSimilar(title_list[25001:])

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(26047 unique tokens: [' ', '1.37', '5', '下半年', '亿元']...)
INFO:gensim.corpora.dictionary:adding document #20000 to Dictionary(37997 unique tokens: [' ', '1.37', '5', '下半年', '亿元']...)
INFO:gensim.corpora.dictionary:adding document #30000 to Dictionary(46860 unique tokens: [' ', '1.37', '5', '下半年', '亿元']...)
INFO:gensim.corpora.dictionary:built Dictionary(48795 unique tokens: [' ', '1.37', '5', '下半年', '亿元']...) from 32538 documents (total 443743 corpus positions)
INFO:gensim.models.lsimodel:using serial LSI version on this node
INFO:gensim.models.lsimodel:updating model with new documents
INFO:gensim.models.lsimodel:preparing a new chunk of documents
DEBUG:gensim.models.lsimodel:converting corpus to csc format
INFO:gensim.models.lsimodel:using 100 extra samples and 2 power iterations
INFO:gensim.models.lsimodel:1st phase: constructing (48

In [113]:
# doc_similar_3 = DocumentSimilar(title_list[20001:30000])

In [114]:
# doc_similar_4 = DocumentSimilar(title_list[30001:40000])

In [115]:
# doc_similar_5 = DocumentSimilar(title_list[40001:50000])

In [116]:
# doc_similar_6 = DocumentSimilar(title_list[50001:])

In [134]:
def collect_similar(index, query, threshold):
    """
    Return the (score, document) pairs of ``index`` that score above ``threshold``.

    Scores are paired with ``index.documents``, the very list the index was
    built from, rather than with a re-sliced copy of the full title list, so
    the pairing cannot drift out of step with how the index was constructed.

    :param index: object exposing ``get_similar(query)`` and ``documents``
    :param query: document string to compare against the index
    :param threshold: only scores strictly greater than this are kept
    :return: list of (score, document) tuples in index order
    """
    scores = index.get_similar(query)
    return [
        (score, document)
        for score, document in zip(scores, index.documents)
        if score > threshold
    ]


target_list = []
if __name__ == '__main__':
    # Document to compare
    new_doc = "美赠250万剂疫苗 外交部：展现对台坚定支持与高度重视"
    count = 0
    title_threshold = 0.7
    # Not used by the title comparison below.
    content_threshold = 0.1

    # Query every index with the same document and pool the matches.
    for index in (doc_similar, doc_similar2):
        target_list.extend(collect_similar(index, new_doc, title_threshold))
        # Number of indexed documents compared so far.
        count += len(index.documents)
    print("finish")

finish


In [118]:
# with open("news_selected.csv", 'a', newline='', encoding='utf-8-sig') as csvfile:
#     writer = csv.writer(csvfile)
#     # if i ==0:
#     writer.writerow(["index", "title", "media", "point"])
#     # writer.writerow(news_list)

In [119]:
# Add a 'point' column set to 0 for every row, then preview the frame.
train2["point"] = 0
train2.head(3)

Unnamed: 0_level_0,title,point
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,快讯／东石确诊超商店员「感染源找呒」279人筛检全阴性,0
1.0,郭采洁又崩坏？「括号眉」造型曝光 网：阴间妆容,0
2.0,反击乔丹《最后一舞》？皮朋推出回忆录　揭「不曾公开的故事」,0


In [131]:
# Rank the matches from the highest to the lowest similarity score and print them.
# (Despite its name, sorted_by_second is ordered by the first tuple element, the score.)
sorted_by_second = sorted(target_list, key=lambda pair: pair[0], reverse=True)
# Placeholder for the per-title rows; writing the matches to CSV is currently disabled.
appended_data = []
for scored_title in sorted_by_second:
    print(scored_title)

(0.99999994, '美赠250万剂疫苗 外交部：展现对台坚定支持与高度重视')
(0.98957336, '美赠250万剂疫苗 总统府证实：萧美琴已前往代表接收')
(0.9833814, '美赠250万剂疫苗抵台 陈其迈：台美真朋友真进展')
(0.9721608, '娇生疫苗爆产线污染 外媒：美再准1500万剂出口海外')
(0.9500799, '传美捐赠75万剂疫苗明抵台 谢长廷：令人安心的消息')
(0.93906015, '白宫：本月完成分配8000万剂疫苗 即刻出货')
(0.93839216, '美赠250万剂疫苗  吴钊燮：台湾人永远铭记在心')
(0.9068061, '美赠250万剂疫苗 萧美琴：加倍的爱启程运往台湾')
(0.90183246, '美赠250万剂疫苗将抵台 吴钊燮：台美关系坚若磐石')
(0.90143716, '美捐赠疫苗250万剂 江启臣：印证台美情谊')
(0.90097654, '美国加码捐赠250万剂疫苗 行政院：患难之际相助心存感激')
(0.8996077, '纽时：美捐台250万剂疫苗 美中关系恐更紧绷')
(0.8996077, '纽时：美捐台250万剂疫苗 美中关系恐更紧绷')
(0.8996077, '纽时：美捐台250万剂疫苗 美中关系恐更紧绷')
(0.89818454, '赠台250万剂疫苗 AIT：不会忘记台湾先前对美医疗援助')
(0.8973137, '美国赠台250万剂疫苗 谢长廷：可体会法律程序复杂和辛劳')
(0.89374685, '美国加码赠台250万剂疫苗 民进党：非常温暖')
(0.89294803, '美捐赠我国疫苗加码至250万剂 政院：患难见真情')
(0.8892673, '美250万剂疫苗今抵台  郑文灿：台美真朋友')
(0.8884783, '美赠250万剂疫苗蓝营首长噤声？ 许智杰：党中央有感谢')
(0.8870296, '美国赠250万剂疫苗 苏贞昌：真的非常感动')
(0.88411915, '美国赠台250万剂  张丽善：及时雨解决疫苗荒')
(0.8832923, '美方再赠250万剂疫苗 江启臣：疫苗不能单靠国际援助')
(0.88274544, '美国赠台250万剂疫苗 绿营批：蓝营百般无奈口头感谢')
(0.8809268, '250万剂疫

In [121]:
# train2.to_csv(path + "/data/all_sim_limit.csv", index = True, header = True)