In [1]:
import numpy as np
import csv
import random
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from pyvi import ViTokenizer, ViPosTagger
from bs4 import BeautifulSoup
import collections
import math
import os
from gensim import corpora, matutils,models
import sys
import codecs
import json
import numpy as np
import random,time
from sklearn import datasets
import operator
from scipy import sparse

In [2]:
SPECIAL_CHARACTER = '0123456789?…“”–%@$.,=+-!;/()*"&^:#|\n\t\'.{}<>[]'
DIR_PATH='/Users/mac/Desktop/Internship/data/'
STOP_WORDS = os.path.join(DIR_PATH,'stopwords-nlp-vi.txt')
DICTIONARY_PATH = os.path.join(DIR_PATH,'dictionary.txt')

In [3]:
class ReadData(object):
    def __init__(self,dataPath,encoder=None):
        self.dataPath = dataPath
        self.encoder = encoder if encoder != None else 'utf-16le'
        
    def get_name_files(self):
        class_file = os.listdir(self.dataPath)        
        class_label=[]
        folders=[]
        for file in os.listdir(self.dataPath):
            class_label.append(file)
            folders.append(os.path.join(self.dataPath,file))
        self.folders = folders
        self.class_label=class_label
        
    def get_data(self):
        data=[]
        i=0
        with open(DIR_PATH+'data_news_vcc.csv',encoding='utf-8') as csvfile:
            e = csvfile.readline()
            d = csv.reader(csvfile,delimiter=",")
            for row in d:
                soup = BeautifulSoup(row[4],"html5lib")
                text = soup.get_text()
                if(len(text)>20):
                    data.append({
                        "newsId":row[0],
                        "sourceNews":row[1],
                        "tilte":row[2],
                        "sapo":row[3],
                        "content":text
                    })
                    i=i+1
                if i==2000:
                    break
            csvfile.close()
        self.data=data
        print(len(data))
        return data
    
    def read_stopwords(self):
        with open(self.dataPath, 'r',encoding="utf-8") as f:
            stopwords = set([w.strip().replace(' ', '_') for w in f.readlines()])
        return stopwords
    
    def load_dictionary(self):
        return corpora.Dictionary.load_from_text(self.dataPath) 
    
class FileStore(object):
    def __init__(self, filePath, data = None):
        self.filePath = filePath
        self.data = data

    def store_dictionary(self, dict_words):
        dictionary = corpora.Dictionary(dict_words)
        dictionary.filter_extremes()
        #dictionary.filter_extremes(no_below=20, no_above=0.3)
        dictionary.save_as_text(self.filePath)

In [4]:
class FeatureExtraction(object):
    def __init__(self, data):
        self.data = data
        self.dictionary = None

    def __build_dictionary(self):
        print ('Building dictionary')
        dict_words = []
        i = 0
        for text in self.data:
            i+=1
            #print ("Step {} / {}".format(i, len(self.data)))
            words = NLP(text=text['content']).get_words_feature()
            dict_words.append(words)
        FileStore(filePath=DICTIONARY_PATH).store_dictionary(dict_words)

    def __load_dictionary(self):
        if self.dictionary is not None:
            return
        if os.path.exists(DICTIONARY_PATH) == False:
            self.__build_dictionary()
        self.dictionary = ReadData(DICTIONARY_PATH).load_dictionary()

    def build_dataset(self):
        self.features = []
        self.labels = []
        i = 0
        for d in self.data:
            i += 1
            #print ("Step {} / {}".format(i, len(self.data)))
            self.features.append(self.get_dense(d['content']))
            #self.features.append((d['content']))
            self.labels.append(d['newsId'])

    def get_dense(self, text):
        self.__load_dictionary()
        words = NLP(text).get_words_feature()
        # Bag of words
        vec = self.dictionary.doc2bow(words)
        #dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
        self.len_dict=len(self.dictionary)
        #model = models.TfidfModel(corpus)
        #vector = model[corpus]
        return vec

    def get_data_and_label(self):
        self.build_dataset()
        model = models.TfidfModel(self.features,normalize=True)
        self.vector = model[self.features]
        return self.vector, self.labels,self.len_dict

In [5]:
class NLP(object):
    def __init__(self, text = None):
        self.text = text
        self.__set_stopwords()

    def segmentation(self):
        return ViTokenizer.tokenize(self.text)    
        
    def __set_stopwords(self):
        self.stopwords = ReadData(STOP_WORDS).read_stopwords()
        
    def split_words(self):
        text = self.segmentation()
        try:
            return list(filter(None,[x.strip(SPECIAL_CHARACTER).lower() for x in text.split()]))
        except TypeError:
            return []
        
    def get_words_feature(self):
        split_words = self.split_words()
        return [word for word in split_words if word.encode('utf-8') not in self.stopwords] 

In [6]:
data = ReadData(DIR_PATH).get_data()
x_train,y_train,len_dict = FeatureExtraction(data=data).get_data_and_label()
print(len_dict)

2000
Building dictionary
8203


In [7]:
tfidf_matrix=[]
count=0
for x in x_train:
    row=[]
    number=[]
    col = np.zeros(len(x))
    for i in range(len(x)):
        row.append(x[i][0])
        number.append(x[i][1])
    A=sparse.csr_matrix((number, (row, col)), shape=(len_dict,1))
    B = A.toarray()
    #A = A.reshape(1,-1)
    tfidf_matrix.append(B)
    count+=1

In [8]:
tfidf_matrix = np.asarray(tfidf_matrix).reshape(len(x_train),len_dict)
print(tfidf_matrix.shape)

(2000, 8203)


In [9]:
def sketch(sig,dim):
    vector =[]
    for i in range(sig):
        x = np.random.randint(2,size=dim)
        for v in range(len(x)):
            if x[v]==0:
                x[v]=-1
        vector.append(x)
    vector = np.asanyarray(vector)
    return vector

In [10]:
t = time.time()
s = sketch(500,len_dict)
#matrix_sig = np.dot(s,tfidf_matrix.T)
matrix_sig = np.dot(s,tfidf_matrix.T)    
print(time.time()-t)
print(matrix_sig.shape)

1.6163980960845947
(500, 2000)


In [11]:
#Chuan hoa vector
matrix_sig_norm = np.copy(matrix_sig)
for i in range(matrix_sig_norm.shape[1]):
    temp = np.linalg.norm(matrix_sig_norm[:,i])
    #print(matrix_sig_norm[:,i].shape)
    for j in range(matrix_sig_norm.shape[0]):
        matrix_sig_norm[j][i]/=temp 

In [12]:
sum(matrix_sig_norm[:,:]**2)

array([1., 1., 1., ..., 1., 1., 1.])

In [13]:
# for m in matrix_sig_norm:
#     break
#     temp = np.linalg.norm(m)
#     for i in range(len(matrix_sig_norm)):
#         m[i]/=temp
#     break
# A=[-4, -3, -2, -1,  0,  1,  2,  3,  4]
# temp = np.linalg.norm(A,2)
# for i in range(len(A)):
#     A[i]/=temp
# print(temp)
# s=0
# print(A)
# for j in range(len(A)):
#     s+=A[j]**2
# print(s)
s=0
for j in range(matrix_sig_norm.shape[0]):
    s+=matrix_sig_norm[j][99]**2
print(s)

1.0


In [14]:
len_buckets = 101
hash_table = [[] for i in range(len_buckets)]

def initialize_array_bucket(bands):
    global len_buckets
    array_buckets = []
    for band in range(bands):
        array_buckets.append([[] for i in range(len_buckets)])
    return array_buckets

def apply_LSH_technique(matrix_sig,t,bands,rows):
    t_s = 0
    t_d = 0
    if (bands*rows) != len(matrix_sig):
        print("Error")
        return {},[]
    else:      
        candidates_sig = {}
        candidates_doc = {}
        candidates_list = {}
        dict_item={}
        array_buckets =initialize_array_bucket(bands)
        i = 0
        it = 0
        for b in range(bands):
            buckets = array_buckets[b]        
            band = matrix_sig[i:i+rows,:]
            for col in range(band.shape[1]):
                #hash S1,S2,..,Sn to buckets
                key = int(sum(band[:,col]) % len(buckets))            
                buckets[key].append(col)
            for item in buckets:
                if len(item) > 1:
                    # chuẩn hoá vector có độ dài =1 (Thư viện Ok)
                    #Sau đó [a,b,c]*[a;b;c] => giảm thời gian (Ok)
                    # Thưa hoá vector để giảm thời gian (Ok)
                    doc_m = tfidf_matrix.T[:,item]
                    _t = time.time()
                    similarity_matrix_doc = doc_m.T.dot(doc_m)
                    t_d += time.time() - _t
                    #index_doc1 = np.argwhere((similarity_matrix_doc-np.triu(similarity_matrix_doc))>=t)   
                    index_doc = np.argwhere(similarity_matrix_doc>=t)
                    #print(index_doc.shape)
#                     for i in range(index_doc.shape[0]):
#                         if index_doc[i][0] == index_doc[i][1]:
#                             continue
#                         if index_doc[i][0] in dict_item.keys():
#                             value = dict_item[index_doc[i][0]]
#                             if index_doc[i][1] not in value.keys():
#                                 value[index_doc[i][1]] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
#                         else:
#                             value={}
#                             value[index_doc[i][1]] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
#                             dict_item[index_doc[i][0]] = value
                            
                    for i in range(index_doc.shape[0]):
                        if index_doc[i][0] == index_doc[i][1]:
                            continue
                        #print(item[index_doc[i][0]],item[index_doc[i][1]])
                        if item[index_doc[i][0]] in dict_item.keys():
                            value = dict_item[item[index_doc[i][0]]]
                            if item[index_doc[i][1]] not in value.keys():
                                value[item[index_doc[i][1]]] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
                        else:
                            value={}
                            value[item[index_doc[i][1]]]= similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
                            dict_item[item[index_doc[i][0]]] = value

#                     for i in range(index_doc.shape[0]):                            
#                         pair = (item[index_doc[i][0]],item[index_doc[i][1]])
#                         print("Pair: {},{}".format(pair[0],pair[1]))
#                         print(similarity_matrix_doc[index[i][0],index[i][1]])
#                         if pair not in candidates_doc:
#                             candidates_doc[pair] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]                   
            i = i+rows
        #sort_sig = sorted(candidates_sig.items(),key=operator.itemgetter(1), reverse=True)
        #sort_doc=sort_doc = sorted(candidates_doc.items(),key=operator.itemgetter(1), reverse=True)
        #sort_list=None
        #sorted(candidates_list.items(),key=operator.itemgetter(1), reverse=True)
        #print(sort)
        print("t_s = %s" % t_s)
        print("t_d = %s" % t_d)
        print(it)
        return dict_item

def cosine_distance(v1,v2):
    s=0;x=0;y=0
    for i in range(len(v1)):
        s+= v1[i]*v2[i]
        x+= v1[i]**2
        y+= v2[i]**2
    x=math.sqrt(x)
    y=math.sqrt(y)
    cosine = s/(x*y)
    return cosine

In [15]:
d= apply_LSH_technique(matrix_sig,t=0.8,bands=20,rows=25)
print(d)

t_s = 0
t_d = 0.8077716827392578
0
{144: {1135: 1.0}, 257: {626: 0.9789131306651371}, 626: {257: 0.9789131306651371}, 851: {936: 0.9670155816649704}, 936: {851: 0.9670155816649704}, 1135: {144: 1.0}, 1220: {1302: 0.998049263386743}, 1298: {1976: 0.9532449530230064}, 1302: {1220: 0.998049263386743}, 1361: {1522: 0.9924395495449743}, 1459: {1584: 0.9922937299034787}, 1522: {1361: 0.9924395495449743}, 1584: {1459: 0.9922937299034787}, 1653: {1674: 0.97524059469608}, 1674: {1653: 0.97524059469608}, 1701: {1881: 0.9990875260416507}, 1813: {1857: 0.9694043850724383}, 1857: {1813: 0.9694043850724383}, 1881: {1701: 0.9990875260416507}, 1976: {1298: 0.9532449530230064}, 93: {161: 0.9936480672503629}, 161: {93: 0.9936480672503629}, 218: {245: 0.9646681969565332, 128: 0.9620007593992215}, 245: {218: 0.9646681969565332, 128: 0.9571575271445143}, 887: {981: 0.9945148056934718}, 946: {1064: 0.9935818484134968}, 981: {887: 0.9945148056934718}, 1064: {946: 0.9935818484134968}, 1650: {1668: 0.996635056

In [35]:
e_s = [e[0] for e in sort_s]
e_d = [e[0] for e in sort_d]
print("E_DOC: {}".format(len(e_d)))
for e in e_d:
    if e not in e_s:
        print(e)
print("E_SIG: {}".format(len(e_s)))
for e in e_s:
    if e not in e_d:
        print(e)

E_DOC: 185
(1461, 519)
E_SIG: 184


In [25]:
def print_doc(x1,x2):
    print("{} \n{} \n---------------------- \n{} \n{}"
          .format(y_train[x1],data[x1]["content"],
                  y_train[x2],data[x2]["content"]))


In [36]:
print_doc(1461, 519)

Tuoitre 
Đoạn video bà Trang phát ngôn "con người không quan trọng" khiến nhiều người bức xúc - Video: HOÀNG NĂNGNgày 5-5, lãnh đạo Ban quản lý khu kinh tế Hải Phòng, TP Hải Phòng cho biết đang xem xét bản tường trình của bà Dương Thị Thuỳ Trang - nữ tài xế đang gây bất bình trong dư luận về phát ngôn "con người không quan trọng" khi được đề nghị đưa nạn nhân đi bệnh viện kiểm tra sau vụ va chạm giao thông chiều 2-5.Bức xúc vì công an thiếu khách quan?Trao đổi với Tuổi Trẻ Online ngày 5-5, ông Nguyễn Công Thành - bí thư Đảng uỷ Ban quản lý khu kinh tế Hải Phòng - cho biết ngay sau khi nghe được thông tin báo chí phản ánh về vụ việc lãnh đạo ban quản lý đã nắm bắt và đề nghị bà Trang làm báo cáo tường trình.  Theo ông Thành, tập thể lãnh đạo trong ban cũng đã nghe trực tiếp tường trình của bà Trang và đang có những bước xem xét, đánh giá cụ thể.  Trong nội dung tường trình bước đầu, bà Trang cho rằng thời điểm đó chỉ đang "tranh luận" với công an trong yêu cầu về việc lập biên bản và cá

In [46]:
item = np.random.randint(10,size=10)
similarity_matrix_doc =np.array([[1,0.77,0.49,0.89,0.52],[0.77,1,0,0.64,0.14],
                                 [0.49, 0,1,0.55,0.88], [0.89,0.64,0.55,1,0.68],
                                 [0.52,0.14,0.88,0.68,1]])
#index_doc=np.argwhere(similarity_matrix_doc>=-0.4)
index_doc = np.argwhere((similarity_matrix_doc-np.triu(similarity_matrix_doc))>=0.3)

In [47]:
print(similarity_matrix_doc)
print(index_doc)

[[1.   0.77 0.49 0.89 0.52]
 [0.77 1.   0.   0.64 0.14]
 [0.49 0.   1.   0.55 0.88]
 [0.89 0.64 0.55 1.   0.68]
 [0.52 0.14 0.88 0.68 1.  ]]
[[1 0]
 [2 0]
 [3 0]
 [3 1]
 [3 2]
 [4 0]
 [4 2]
 [4 3]]


In [89]:
dict_item={}
for i in range(index_doc.shape[0]):
    if index_doc[i][0] == index_doc[i][1]:
        continue
    if index_doc[i][0] in dict_item.keys():
        value = dict_item[index_doc[i][0]]
        if index_doc[i][1] not in value.keys():
            value[index_doc[i][1]] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
    else:
        value={}
        value[index_doc[i][1]] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
        dict_item[index_doc[i][0]] = value
        
    if index_doc[i][1] in dict_item.keys():
        value = dict_item[index_doc[i][1]]
        if index_doc[i][0] not in value.keys():
            value[index_doc[i][0]] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
    else:
        value={}
        value[index_doc[i][0]] = similarity_matrix_doc[index_doc[i][0],index_doc[i][1]]  
        dict_item[index_doc[i][1]] = value

In [90]:
print(dict_item)

{1: {0: 0.77, 3: 0.64}, 0: {1: 0.77, 2: 0.49, 3: 0.89, 4: 0.52}, 2: {0: 0.49, 3: 0.55, 4: 0.88}, 3: {0: 0.89, 1: 0.64, 2: 0.55, 4: 0.68}, 4: {0: 0.52, 2: 0.88, 3: 0.68}}


In [66]:
#sorted_x = sorted(dict_item.items(), key=operator.itemgetter(1))
s=sorted(dict_item.keys())

In [67]:
s[0]

0

In [110]:
for k in dict_item.keys():
    #dict_item[k] =OrderedDict(sorted(dict_item[k].items(), key=lambda x: x[1],reverse=True))
    dict_item[k] =sorted(dict_item[k].items(), key=lambda x: x[1],reverse=True)

In [111]:
type(dict_item)

dict

In [92]:
from collections import OrderedDict
from operator import itemgetter
d = OrderedDict(sorted(dict_item[0].items(), key=lambda x: x[1],reverse=True))

In [103]:
print(d)

OrderedDict([(3, 0.89), (1, 0.77), (4, 0.52), (2, 0.49)])


In [116]:
dict_item[0][1][0]

1