In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Zero-based position of the reference case inside each corpus.
base = 0

# Subject line of every case.
subject = [
    '[OSCE XG 1876] windows 10 update 1809 asking us to uninstall trend micro software',
    'Clients Not showing online on Console',
    '[OSCE XG] Windows feature pack 1809',
    'I have one cute cat',
]

# Term counts for each subject line.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(subject)

# Show the vocabulary extracted from the subjects.
word = vectorizer.get_feature_names_out()
print(word)
print('-' * 50)

# Re-weight the counts with TF-IDF.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

# Cosine similarity of the base subject against every subject (the base
# itself included), flattened to a 1-D array and displayed.
similarity_subject = np.ravel(np.array(cosine_similarity(tfidf[base], tfidf)))
print(similarity_subject)

['10' '1809' '1876' 'asking' 'cat' 'clients' 'console' 'cute' 'feature'
 'have' 'micro' 'not' 'on' 'one' 'online' 'osce' 'pack' 'showing'
 'software' 'to' 'trend' 'uninstall' 'update' 'us' 'windows' 'xg']
--------------------------------------------------
[1.         0.         0.33220043 0.        ]


In [3]:
# Body text of every case, in the same order as the subjects.
body = [
    'Altaan called in regarding recent windows 10 update 1809 asking us to uninstall trend micro software. Unloading agents will allow the installation of 1809 upgrade',
    'Clients Not showing online on Console \\Suddenly clients on the server went 0',
    'Windows feature pack 1809 reports an incompatibility with Trend. Forces an uninstall in order to update Windows. I then manually reinstall Trend without any issues but when manually scanning files Office scan hangs and i have to run an end task.',
    'How is weather today in Taipei',
]

# Term counts for each body.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(body)

# Show the vocabulary extracted from the bodies.
word = vectorizer.get_feature_names_out()
print(word)
print('-' * 50)

# Re-weight the counts with TF-IDF.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

# Cosine similarity of the base body against every body (the base itself
# included), flattened to a 1-D array and displayed.
body_subject = np.ravel(np.array(cosine_similarity(tfidf[base], tfidf)))
print(body_subject)

['10' '1809' 'agents' 'allow' 'altaan' 'an' 'and' 'any' 'asking' 'but'
 'called' 'clients' 'console' 'end' 'feature' 'files' 'forces' 'hangs'
 'have' 'how' 'in' 'incompatibility' 'installation' 'is' 'issues'
 'manually' 'micro' 'not' 'of' 'office' 'on' 'online' 'order' 'pack'
 'recent' 'regarding' 'reinstall' 'reports' 'run' 'scan' 'scanning'
 'server' 'showing' 'software' 'suddenly' 'taipei' 'task' 'the' 'then'
 'to' 'today' 'trend' 'uninstall' 'unloading' 'update' 'upgrade' 'us'
 'weather' 'went' 'when' 'will' 'windows' 'with' 'without']
--------------------------------------------------
[1.         0.03306472 0.20370167 0.03683486]


In [4]:
# Equal-weight blend of the subject and body similarity vectors.
values = 0.5 * similarity_subject + 0.5 * body_subject
values

array([1.        , 0.01653236, 0.26795105, 0.01841743])

In [6]:
# Equal-weight (50 : 50) blend of subject and body similarity.
values = 0.5 * similarity_subject + 0.5 * body_subject
# Zero the base case's own score so it is not picked as its own match.
values[base] = 0

# Position of the highest remaining score.
index = values.argmax()

# The score is printed in both outcomes; the suggestion details are printed
# only when the score is above 0.1.
print('Similarity {}'.format(values[index]))
if values[index] > 0.1:
    rule = '---------------------------------------------------------------'
    print(rule)
    print('Src Subject : ','\n', subject[base])
    print('Src Body :','\n', body[base])
    print(rule)
    print('Prd Subject : ','\n', subject[index])
    print('Prd Body :','\n', body[index])
else:
    print('NO_DATA_FOUND')

Similarity 0.26795104939620223
---------------------------------------------------------------
Src Subject :  
 [OSCE XG 1876] windows 10 update 1809 asking us to uninstall trend micro software
Src Body : 
 Altaan called in regarding recent windows 10 update 1809 asking us to uninstall trend micro software. Unloading agents will allow the installation of 1809 upgrade
---------------------------------------------------------------
Prd Subject :  
 [OSCE XG] Windows feature pack 1809
Prd Body : 
 Windows feature pack 1809 reports an incompatibility with Trend. Forces an uninstall in order to update Windows. I then manually reinstall Trend without any issues but when manually scanning files Office scan hangs and i have to run an end task.
