In [2]:
import json
import pandas as pd
from sklearn import preprocessing, model_selection, feature_extraction, naive_bayes, metrics

In [3]:
# Json to csv
def json2df(json_obj):
    
    ls = []
    for row in json_obj:
        ls.append([
            row['title'],
            row['content'],
            row['author'],
            row['brand_id'],
            row['date'],
            row['url']
        ])
    
    col = [
            'title',
            'content',
            'author',
            'date',
            'url',
            'brand_id'
        ]
    
    return pd.DataFrame(ls, columns=col)

In [4]:
# Import dataset

## Read news
with open('./data/real_news_dump.json') as file:
    real_news = json.load(file)

real_news_df = json2df(real_news)

## Fake news
with open('./data/fake_news_dump.json') as file:
    fake_news = json.load(file)
    
fake_news_df = json2df(fake_news)

# Label dataset

In [8]:
# label real dataset
real_label = [True]*len(real_news_df)
real_news_df['label'] = real_label
real_news_df = real_news_df.dropna()

In [9]:
# label fake dataset
fake_label = [False]*len(fake_news_df)
fake_news_df['label'] = fake_label
fake_news_df = fake_news_df.dropna()

# Prepare dataset

In [10]:
compact_df = pd.concat([real_news_df, fake_news_df], ignore_index=True)

In [11]:
compact_df.head()

Unnamed: 0,title,content,author,date,url,brand_id,label
0,謠傳碼頭公園有確診案例 調查局約談網友到案,（中央社記者蕭博文台北4日電）調查局今天表示，網路近期謠傳高雄港63號碼頭、台北青年公園出現...,中央社,9,2020-03-04,https://www.nownews.com/news/20200304/3968815/,True
1,好暖！高醫返國關懷門診　醫護人員進駐機場檢疫站守國門,武漢肺炎國際間疫情不斷擴大，包含韓國、義大利等多個國家被列入我國的旅遊疫情警告地區。高雄醫學...,記者陳美嘉／高雄報導,9,2020-03-04,https://www.nownews.com/news/20200304/3968420/,True
2,湯景華縱火奪6命遭判死 最高法院發回更審,（中央社記者蕭博文台北4日電）男子湯景華因細故縱火，導致翁家6人身亡，一審、二審、更一審皆被...,中央社,9,2020-03-04,https://www.nownews.com/news/20200304/3968749/,True
3,更生人車庫包水餃發跡 實現店面網路販售,（中央社記者劉世怡台北4日電）更生人郭珈瑋出獄後，在車庫包手工水餃起家，再透過圓夢創業小額貸...,中央社,9,2020-03-04,https://www.nownews.com/news/20200304/3968725/,True
4,黑心藥局囤貨　桃園市調處查獲97萬片醫療口罩,武漢肺炎疫情造成全台口罩慌，居然有不肖業者大量囤貨伺機牟利﹔調查局桃園巿調查處日前接獲情資，...,記者李春台 / 桃園報導,9,2020-02-06,https://www.nownews.com/news/20200206/3920699/,True


In [19]:
X = compact_df['title']

# Pipeline

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [39]:
class K_means:
    def __init__(self, n_c=3):
        self.tfidf = feature_extraction.text.TfidfVectorizer()
        self.kMeans = KMeans(n_clusters=n_c, random_state=0)

    def fit_transform(self, X):
        self.X = X
        self.X_tfidf = self.tfidf.fit_transform(X)
#         self.X_PCA = PCA(n_components=2).fit_transform(self.X_tfidf)
        self.kMeans.fit(self.X_tfidf)
        return self.kMeans.labels_
    
    def vis(self):
        pass

In [40]:
def kMeans_statistics(df, clustering_label, n_c=3):
    
    real_news = [0]*n_c
    fake_news = [0]*n_c
    
    label = df['label']
    
    assert len(clustering_label) == len(label), "wrong length"
    
    for c_l, l in zip(clustering_label, label):
        if l == True:
            real_news[c_l] += 1
        if l == False:
            fake_news[c_l] += 1
        
    
    print("="*60)
    print('real news distribution : {:>3d} {:>3d} {:>3d}'.format(real_news[0], real_news[1], real_news[2]))
    print('fake news distribution : {:>3d} {:>3d} {:>3d}'.format(fake_news[0], fake_news[1], fake_news[2]))
        

In [44]:
k = K_means(n_c=2)
c_l = k.fit_transform(X)

In [46]:
kMeans_statistics(compact_df, c_l, n_c=3)

real news distribution : 770 230   0
fake news distribution : 674   0   0


# Cosine similarity

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
tfidf = feature_extraction.text.TfidfVectorizer()
tfidf_x = tfidf.fit_transform(X)
cos_x = cosine_similarity(tfidf_x)

# Clustering

In [62]:
import os
import pandas as pd
import numpy as np
import tensorflow.compat.v2 as tf
try:
    from tensorflow_text import SentencepieceTokenizer
    import tensorflow_hub as hub
except ModuleNotFoundError:
    pass
import sklearn.metrics.pairwise
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

In [None]:
class Clustering:
    def __init__(self):
        pass
    
    def getModel(self):
        model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')
        return model
    
    def embed_text(self, model, text):
        return model(text)
    
    def getEmbed(self, query_set):
        model = self.getModel()
        news_list = []
        for query in query_set:
            content = query.content
            content = self.remove_covid_message(content)
            news_list.append(content)
        news = self.embed_text(model, news_list)
        return news
    
    def cos_sim(self, input_vectors):
        similarity = cosine_similarity(input_vectors)
        return similarity
    
    
    def getNewsCluster(self, similarity):
        