In [1]:
# Path to the training corpus that the training cell reads through LineSentence
# (presumably a word-segmented Wikipedia dump — confirm against the preprocessing step).
WIKI_SEG_TXT = "wiki_seg.txt"

In [2]:
import multiprocessing
from pathlib import Path

from gensim.models import FastText
from gensim.models.word2vec import LineSentence

# Train (or reuse) the FastText model.
#
# Training over the whole corpus is expensive, so the cell is guarded by a
# cache: if the saved model file already exists it is loaded instead of being
# retrained. This keeps "Restart Kernel -> Run All" feasible. Delete the model
# file to force a fresh training run (e.g. after the corpus has changed).
#
# The corpus path comes from WIKI_SEG_TXT, defined once in the first cell.
max_cpu_counts = multiprocessing.cpu_count()
word_dim_size = 300  # dimensionality of each word vector

# The vector size is part of the file name, so changing `word_dim_size`
# automatically misses the cache and triggers a new training run.
output_model = f"fasttext.zh.{word_dim_size}.model"

# Training sentences are read from the corpus file; kept outside the guard so
# the `sentences` name is defined on both paths.
sentences = LineSentence(WIKI_SEG_TXT)

if Path(output_model).exists():
    print(f"Found {output_model}, loading it instead of retraining")
    model = FastText.load(output_model)
else:
    print(f"Use {max_cpu_counts} workers to train FastText (dim={word_dim_size})")

    # Train the model.
    model = FastText(sentences, vector_size=word_dim_size, workers=max_cpu_counts)

    # Persist the model so later cells (and later sessions) can reload it.
    model.save(output_model)

Use 12 workers to train FastText (dim=300)


In [3]:
# Shape of the whole-word embedding matrix; the recorded output shows
# (number of vocabulary entries, vector dimension).
print(model.wv.vectors.shape)
# Bare last expression so the notebook renders the array itself.
model.wv.vectors

(1281108, 300)


array([[-2.1143034 , -3.5834088 ,  0.9235744 , ..., -2.9513686 ,
         0.85165715,  0.7485893 ],
       [ 0.44912398,  1.157038  ,  0.17390107, ..., -1.5394628 ,
        -2.0758872 ,  2.3475122 ],
       [-4.7415714 ,  4.1082935 ,  3.0406713 , ..., -0.18160735,
         0.97919846, -3.1430063 ],
       ...,
       [-0.42775387, -0.4923242 ,  0.3848211 , ...,  0.34394506,
         0.08799452, -0.6257678 ],
       [ 0.3650192 , -0.02768007,  0.23861456, ...,  0.26748458,
         0.39309955,  1.2193569 ],
       [-0.14890154, -0.08819858, -0.0111459 , ...,  0.13148315,
        -0.0556561 , -0.13519895]], dtype=float32)

In [5]:
# Load the previously saved FastText model from disk (nothing is trained here).
# NOTE(review): this literal path duplicates `output_model` from the training
# cell; replace it with your own model path if it differs.
model = FastText.load('fasttext.zh.300.model')
vocab = model.wv.key_to_index

# Report the vocabulary size, then show the first 20 vocabulary entries.
print(f"總共收錄了 {len(vocab)} 個詞彙")
print("印出 20 個收錄詞彙:")
print(list(vocab)[:20])

總共收錄了 1281108 個詞彙
印出 20 個收錄詞彙:
['年', '月', '日', '中', '10', '12', '11', '小行星', '中國', '時', '–', '日本', '美國', '20', '香港', '臺灣', '15', '位於', '30', '站']


In [6]:
# Fetch the embedding of one word and inspect it.
vec = model.wv['數學家']
print(vec.shape)  # a single 1-D vector; the recorded output is (300,)
# Bare last expression so the notebook renders the vector values.
vec 

(300,)


array([-0.36248013,  0.98012364,  0.55402476,  0.4230872 , -0.08536739,
        0.37592587, -1.7252742 ,  0.35973006,  1.0355555 ,  0.20347224,
        0.89693195,  2.3107479 ,  1.1560698 ,  0.43550915, -1.8510116 ,
        1.8772343 ,  2.1570227 , -0.8266401 , -0.4693025 , -2.9896805 ,
       -0.76506203,  0.55268776, -0.6439924 , -2.1026824 ,  0.82134354,
       -0.03820011,  1.9603138 ,  3.5589116 ,  1.4944109 , -1.1495035 ,
       -2.2098894 , -2.3953311 , -0.26409993,  0.7407733 ,  0.37456003,
       -3.1395462 ,  0.02966588,  1.4000149 , -0.7277578 ,  0.29956517,
        0.23028906, -3.5200157 , -1.7080253 ,  1.3862095 , -0.80813515,
       -2.0764017 ,  1.3242884 ,  0.78814757,  0.70669633,  0.8410793 ,
       -2.373359  ,  0.17978494, -1.7763336 , -0.5197104 ,  0.02317583,
       -0.61163217, -0.24958536,  1.333366  ,  0.6208196 ,  0.62406814,
        0.20208071, -0.8206698 , -1.3902856 , -1.7098118 ,  0.6752405 ,
        2.733623  ,  1.2758677 , -0.6498876 ,  0.2879006 ,  0.38

In [7]:
word = "這肯定沒見過 "

# Unlike Word2Vec, FastText does not raise KeyError for a word that is missing
# from the vocabulary: it composes a vector from the word's character n-grams.
# Check vocabulary membership explicitly rather than relying on an exception,
# which is why the original try/except printed nothing.
print(f"{word!r} in vocabulary: {word in model.wv.key_to_index}")

try:
    vec = model.wv[word]
    # Reached for out-of-vocabulary words too: the vector is synthesized.
    print(vec.shape)
except KeyError as e:
    # Only expected when the model has no n-gram buckets (bucket=0), in which
    # case an out-of-vocabulary vector cannot be computed.
    print(e)

In [8]:
# Ten nearest neighbours of "飲料", returned as (word, similarity) pairs.
# Several recorded neighbours share characters with the query, others look
# unrelated — presumably an effect of shared character n-grams; verify.
model.wv.most_similar("飲料", topn=10)

[('輝劍', 0.9710693359375),
 ('名松', 0.9501043558120728),
 ('飲料類', 0.9315357208251953),
 ('飲料機', 0.9279479384422302),
 ('飲料罐', 0.8953368663787842),
 ('軟飲料', 0.8831291198730469),
 ('茶飲料', 0.8725141882896423),
 ('經米濱', 0.8699420690536499),
 ('飲品', 0.8453800082206726),
 ('飲料瓶', 0.7923927307128906)]

In [9]:
# Nearest neighbours of an English token; `topn` is left at the library default.
# The recorded neighbours all contain the substring "car".
model.wv.most_similar("car")

[('hcar', 0.8572422862052917),
 ('carcar', 0.8509683012962341),
 ('ccar', 0.849014401435852),
 ('jetcar', 0.8113610148429871),
 ('tramcar', 0.8039993643760681),
 ('zipcar', 0.8031772971153259),
 ('motorcar', 0.8004679083824158),
 ('boxcar', 0.8001610636711121),
 ('indycar', 0.7994945645332336),
 ('cars', 0.7986459136009216)]

In [10]:
# Nearest neighbours of "facebook"; the recorded output mixes surface-form
# variants (e.g. "thefacebook") with related names (e.g. "instagram", "twitter").
model.wv.most_similar("facebook")

[('youtubefacebook', 0.927127480506897),
 ('thefacebook', 0.8969534635543823),
 ('facebookpage', 0.8882699012756348),
 ('facebox', 0.8686223030090332),
 ('instagram', 0.7984750270843506),
 ('twitteryoutube', 0.7682653069496155),
 ('googleyoutube', 0.7594155669212341),
 ('twitter', 0.7524656057357788),
 ('youtube', 0.7465772032737732),
 ('lnstagram', 0.7246598601341248)]

In [11]:
# Nearest neighbours of "詐欺" (fraud), as (word, similarity) pairs.
model.wv.most_similar("詐欺")

[('賈邱', 0.8884372115135193),
 ('赤坑鎮', 0.831690788269043),
 ('中境', 0.8139835000038147),
 ('越中境', 0.7967145442962646),
 ('詐欺罪', 0.7719191908836365),
 ('他魚', 0.7685927152633667),
 ('欺詐', 0.7375842332839966),
 ('抱出', 0.7236839532852173),
 ('欺詐案', 0.6801178455352783),
 ('義德堂', 0.6536234021186829)]

In [12]:
# Nearest neighbours of "合約" (contract), as (word, similarity) pairs.
model.wv.most_similar("合約")

[('德康', 0.9192003607749939),
 ('合同', 0.8019339442253113),
 ('綠蠅', 0.749859631061554),
 ('合同期', 0.7271698713302612),
 ('合同額', 0.717912495136261),
 ('合同商', 0.7071802616119385),
 ('簽約', 0.7068753242492676),
 ('續約', 0.7045730948448181),
 ('籤合同', 0.6919059753417969),
 ('合同制', 0.6848821640014648)]

In [13]:
# Similarity score between two words expected to be related.
# NOTE(review): "鍵接" may be a typo for "鏈接" (link) — confirm. FastText can
# still score a word it never saw, so a typo here would not raise an error.
model.wv.similarity("連結", "鍵接")

0.4270057

In [14]:
# Similarity score between two unrelated words ("連結" vs. "陰天", overcast);
# the recorded score is close to zero.
model.wv.similarity("連結", "陰天")

-0.015750855

In [17]:
# Reload the model from the path the training cell saved it to, under a new
# name so it can be compared with the model already in memory.
print(f"Loading {output_model}...")
new_model = FastText.load(output_model)

Loading fasttext.zh.300.model...


In [18]:
# Sanity check: the reloaded model must give exactly the same similarity score.
# NOTE(review): `model` was itself re-loaded from 'fasttext.zh.300.model' in an
# earlier cell, so this compares two loads of the same file.
model.wv.similarity("連結", "陰天") == new_model.wv.similarity("連結", "陰天")

True