# Setting up environment

In [1]:
!pip install opencc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opencc
  Downloading OpenCC-1.1.4-cp37-cp37m-manylinux1_x86_64.whl (769 kB)
[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 769 kB 8.2 MB/s 
[?25hInstalling collected packages: opencc
Successfully installed opencc-1.1.4


In [None]:
import os
from google.colab import drive

# Where Google Drive is mounted and where the project lives inside it.
MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/My Drive/original'

# Mount Drive, move into the project folder, and remember it as `path`
# (later cells join dataset/model file names onto `path`).
drive.mount(MOUNT_POINT)
os.chdir(PROJECT_DIR)
path = os.getcwd()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import tensorflow as tf

# Ask TensorFlow for the GPU device name and stop unless it is the first GPU.
EXPECTED_GPU = '/device:GPU:0'
device_name = tf.test.gpu_device_name()
if device_name == EXPECTED_GPU:
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

In [2]:
import numpy as np
import pandas as pd
import jieba
from opencc import OpenCC

In [None]:
# Upgrade spaCy and download the small Chinese and English pipelines that
# later cells load with spacy.load("zh_core_web_sm") / spacy.load("en_core_web_sm").
!pip install -U spacy
!python -m spacy download zh_core_web_sm
!python -m spacy download en_core_web_sm

In [4]:
from gensim.models import word2vec

In [None]:
# NOTE: this cell held only the incomplete expression `tf.` (likely a leftover
# from tab-completion). It is a SyntaxError and stops "Run All", so it has been
# removed; the cell is intentionally empty.

# WORD2VEC TRAINING

In [None]:
os.chdir(os.path.join(path, 'dataset'))
!wget "https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2"
os.chdir(os.path.join(path, '..'))

In [None]:
import spacy

# Load the small Chinese and English spaCy pipelines for their stop-word lists.
nlp_zh = spacy.load("zh_core_web_sm")
nlp_en = spacy.load("en_core_web_sm")

# Stop words = Chinese list + English list + whitespace/empty tokens.
WHITESPACE_TOKENS = {"\n", "\r\n", "\t", " ", ""}
STOPWORDS = set().union(
    nlp_zh.Defaults.stop_words,
    nlp_en.Defaults.stop_words,
    WHITESPACE_TOKENS,
)

# Also add the OpenCC 's2t'-converted form of every stop word, so both the
# original and the converted spelling are filtered.
cc = OpenCC('s2t')
STOPWORDS.update({cc.convert(stop_word) for stop_word in STOPWORDS})


In [None]:
def preprocess_and_tokenize( text, token_min_len = 1, token_max_len = 15, lower = True):
    """Tokenize ``text`` with jieba after OpenCC conversion.

    The text is optionally lower-cased, passed through the module-level ``cc``
    converter and segmented with ``jieba.cut`` (``cut_all=False``).

    Args:
        text: Raw text to tokenize.
        token_min_len: Minimum token length (inclusive) to keep.
        token_max_len: Maximum token length (inclusive) to keep.
        lower: When true, lower-case the text before conversion.

    Returns:
        List of tokens whose length lies within the bounds and that are not
        in the module-level ``STOPWORDS`` set.
    """
    converted = cc.convert(text.lower() if lower else text)
    kept = []
    for piece in jieba.cut(converted, cut_all = False):
        # Drop tokens that are too short or too long.
        if len(piece) < token_min_len or len(piece) > token_max_len:
            continue
        if piece in STOPWORDS:
            continue
        kept.append(piece)
    return kept

In [None]:
import gensim
from gensim.corpora import WikiCorpus

# Wikipedia dump to tokenize.
# NOTE(review): this file name differs from the dump fetched by the wget cell
# (zhwiki-latest-pages-articles.xml.bz2) -- confirm which dump is intended.
wiki_dump_path = os.path.join(path, 'dataset', 'zhwiki-latest-pages-meta-history1.xml-p10212p26672.bz2')

# dictionary={} skips WikiCorpus' default vocabulary-building step, which
# otherwise streams and tokenizes the whole dump once inside the constructor.
# The cells shown here only call get_texts(), which does not need a dictionary.
wiki_corpus = WikiCorpus(
    wiki_dump_path,
    dictionary = {},
    tokenizer_func = preprocess_and_tokenize,
    token_min_len = 1,
)

In [None]:
from datetime import datetime as dt

# Stream tokenized articles out of the dump and write one article per line
# (space-separated tokens); the file is read back with word2vec.LineSentence.
generator = wiki_corpus.get_texts()

with open(os.path.join(path, 'dataset', 'wiki_corpused.txt'), "w", encoding = 'utf-8') as output:
  # Count from 1 so the progress message reports the number of articles
  # actually written (it used to print 9999 at the 10000th article).
  for texts_num, tokens in enumerate(generator, start = 1):
    output.write(" ".join(tokens) + "\n")
    if texts_num % 10000 == 0:
      # Message text restored from mis-encoded bytes; it reads
      # "wrote N tokenized articles".
      print(f"[{str(dt.now()):.19}] 已寫入 {texts_num} 篇斷詞文章")

In [None]:
import multiprocessing

# Training configuration: one worker per reported CPU, 300-dimensional vectors.
word_dim_size = 300
max_cpu_counts = multiprocessing.cpu_count()
print(f"Use {max_cpu_counts} workers to train Word2Vec (dim={word_dim_size})")

# Stream the tokenized corpus (one article per line) and train the model.
corpus_file = os.path.join(path, 'dataset', 'wiki_corpused.txt')
sentences = word2vec.LineSentence(corpus_file)
model = word2vec.Word2Vec(sentences, size = word_dim_size, workers = max_cpu_counts)

# Save the trained model under `path`, named after the vector dimension.
output_model = f"word2vec.zh.{word_dim_size}.model"
model.save(os.path.join(path, output_model))

Use 2 workers to train Word2Vec (dim=300)


In [None]:
# Reload the saved Word2Vec model and report its vocabulary size.
w2v_model = word2vec.Word2Vec.load(os.path.join(path, "word2vec.zh.300.model"))
# The message text was mis-encoded (UTF-8 bytes shown as cp1252); restored to
# the intended Chinese, which reads "N words collected in total".
print(f"總共收錄了 {len(w2v_model.wv.vocab)} 個詞彙")

總共收錄了 1138562 個詞彙


In [None]:
# Show the ten vocabulary entries most similar to the query word.
query_word = "kerkhove"
w2v_model.wv.most_similar(query_word, topn=10)

# Combine data

In [None]:
file_path = os.path.join(path, 'dataset')
cc = OpenCC('s2t')

def _read_labeled_texts(directory, label):
  """Return one {'content', 'label'} record per file found under ``directory``.

  Each file is read as text, converted with the module-level ``cc`` converter
  and tagged with ``label``.
  """
  records = []
  for root, _, files in os.walk(directory):
    for file_name in files:
      # Explicit UTF-8 so reading does not depend on the locale default
      # (assumes the article files are UTF-8 -- confirm).
      with open(os.path.join(root, file_name), encoding = 'utf-8') as f:
        records.append({'content' : cc.convert(f.read()), 'label' : label})
  return records

# Label 0 = files under ch_fake, label 1 = files under ch_real.
# Collect all rows first and build the frame once: DataFrame.append inside a
# loop copies the whole frame for every row and was removed in pandas 2.0.
records = _read_labeled_texts(os.path.join(file_path, 'ch_fake'), 0) + \
          _read_labeled_texts(os.path.join(file_path, 'ch_real'), 1)
df = pd.DataFrame(records, columns = ['content', 'label'])

df.to_csv(os.path.join(file_path, 'data.csv'), encoding = 'utf-8_sig')

# DATA PREPROCESSING


In [None]:
import spacy
from opencc import OpenCC

# Rebuild the stop-word set from the spaCy Chinese and English defaults.
nlp_zh = spacy.load("zh_core_web_sm")
nlp_en = spacy.load("en_core_web_sm")

# Start from a copy of the Chinese list so spaCy's own set is not mutated,
# then merge in the English list and the whitespace/empty tokens.
STOPWORDS = set(nlp_zh.Defaults.stop_words)
STOPWORDS |= nlp_en.Defaults.stop_words
STOPWORDS |= {"\n", "\r\n", "\t", " ", ""}

# Add the OpenCC 's2t'-converted form of every stop word as well.
cc = OpenCC('s2t')
converted_stopwords = [cc.convert(stop_word) for stop_word in STOPWORDS]
STOPWORDS.update(converted_stopwords)

In [None]:
def preprocess(df, stop_words):
  """Clean and tokenize ``df['content']`` in place.

  Each entry is lower-cased; ASCII letters, digits, the ideographic space
  (U+3000), the U+1F199 emoji, '-' and '.' are removed; the rest is segmented
  with ``jieba.cut`` (``cut_all=False``) and re-joined with single spaces
  after dropping every token found in ``stop_words``.

  Args:
    df: DataFrame with a 'content' column; that column is overwritten.
    stop_words: Collection of tokens to drop (now actually used; the body
      previously read the global ``STOPWORDS`` and ignored this argument).

  Returns:
    None. ``df`` is modified in place.
  """
  import re
  # Written with escapes so the pattern cannot be damaged by file re-encoding.
  noise = re.compile(r'[a-zA-Z0-9\u3000\U0001F199\-.]')

  cleaned = []
  for data in df['content']:
    # Non-string cells (e.g. NaN from an empty CSV field) become empty text
    # instead of raising AttributeError on .lower().
    text = data.lower() if isinstance(data, str) else ''
    text = noise.sub('', text)
    seg = jieba.cut(text, cut_all = False)
    cleaned.append(' '.join(v for v in seg if v not in stop_words))

  # Whole-column assignment is positional, so it is also correct when the
  # index is not 0..n-1 (df.loc[i, ...] with an enumerate counter was not).
  df['content'] = cleaned

In [None]:
# Load the combined dataset from <path>/dataset/data.csv and clean its
# 'content' column in place.
data_csv = os.path.join(path, 'dataset', 'data.csv')
df = pd.read_csv(data_csv)
preprocess(df, stop_words = STOPWORDS)

In [None]:
from sklearn.utils import shuffle

# Fixed seed so the train/test split (and every metric derived from it) is
# reproducible after a kernel restart.
SPLIT_SEED = 42
# Number of shuffled rows used for training; the remainder is the test set.
TRAIN_SIZE = 150

df = shuffle(df, random_state = SPLIT_SEED)

# Positional split of the shuffled rows.
train_df, test_df = df.iloc[:TRAIN_SIZE], df.iloc[TRAIN_SIZE:]
# 'content' holds space-joined tokens, so str.split() recovers token lists.
train_x, test_x = train_df['content'].str.split(), test_df['content'].str.split()
train_y, test_y = train_df['label'], test_df['label']

In [None]:
EMBEDDING_DIM = 300
EMBEDDING_FILE = 'keras_word2vec.txt'

# Export the vectors first if the text file is missing, so this cell also
# works on a fresh kernel / clean working directory.
if not os.path.exists(EMBEDDING_FILE):
  w2v_model.wv.save_word2vec_format(EMBEDDING_FILE, binary=False)

# Map each token to its float32 vector.
embeddings_index = dict()
# Explicit UTF-8: the vocabulary is Chinese text and reading it must not
# depend on the locale's default encoding.
with open(EMBEDDING_FILE, encoding = 'utf-8') as f:
  for line in f:
    values = line.split()
    # Skip any line that is not "token + EMBEDDING_DIM numbers"
    # (this also drops the header line of the word2vec text format).
    if len(values) != EMBEDDING_DIM + 1:
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
# No explicit f.close(): the with-block already closes the file.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the tokenizer on every document, then encode and pad the full corpus.
tokenizer_obj = Tokenizer()
all_texts = df['content']
tokenizer_obj.fit_on_texts(all_texts)
sequences = tokenizer_obj.texts_to_sequences(all_texts)
review_pad = pad_sequences(sequences, maxlen = 256)

word_index = tokenizer_obj.word_index

# One extra row so that every index in word_index has a row of its own.
vocab_size = 1 + len(word_index)
embedding_matrix = np.zeros((vocab_size, 300))

# Copy the pretrained vector of each known token into its row; tokens without
# a pretrained vector keep their all-zero row.
for token, row in word_index.items():
  vector = embeddings_index.get(token)
  if vector is None:
    continue
  embedding_matrix[row] = vector

In [None]:
# Display the embedding vocabulary size (len(word_index) + 1).
vocab_size

12159

In [None]:
# Encode each split with the fitted tokenizer, then pad every encoded
# sequence with maxlen = 256.
train_sequence, test_sequence = (
    tokenizer_obj.texts_to_sequences(token_lists) for token_lists in (train_x, test_x)
)
train_padded, test_padded = (
    pad_sequences(encoded, maxlen = 256) for encoded in (train_sequence, test_sequence)
)


# GRU


In [None]:
# Export the trained vectors in text form (binary=False) to
# 'keras_word2vec.txt' in the current working directory.
# NOTE(review): the cell that reads this file sits earlier in the notebook --
# confirm the intended execution order.
w2v_model.wv.save_word2vec_format('keras_word2vec.txt', binary=False)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential, Input, Model
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.initializers import Constant

# Embedding layer initialised from embedding_matrix and frozen
# (trainable = False); index 0 is masked.
frozen_embedding = Embedding(
    input_dim = vocab_size,
    output_dim = 300,
    embeddings_initializer = Constant(embedding_matrix),
    input_length = 256,
    mask_zero = True,
    trainable = False,
)

# Frozen embeddings -> GRU(128) with dropout -> single sigmoid unit.
GRU_model = Sequential([
    frozen_embedding,
    GRU(128, dropout = 0.5),
    Dense(1, activation = 'sigmoid'),
])
GRU_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the padded training sequences for 20 epochs with batch size 128.
GRU_model.fit(
    x = train_padded,
    y = train_y.values,
    batch_size = 128,
    epochs = 20,
)

In [None]:
# Run the trained model on the padded test sequences; `pred` is thresholded
# at 0.5 in the following cell.
pred = GRU_model.predict(test_padded)

In [None]:
# Threshold the model outputs at 0.5 to get boolean class predictions.
# A plain NumPy comparison (instead of tf.greater) also accepts an already
# boolean `pred`, so this cell can be re-run without feeding booleans back
# into TensorFlow.
pred = (np.asarray(pred) > .5).flatten()
# Test accuracy: fraction of predictions equal to the true labels, compared by
# position (test_y keeps its shuffled index, so .values is used).
np.mean(pred == test_y.values)