<a href="https://colab.research.google.com/github/hogch/masterproject_gan/blob/master/SeqGAN_headlines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation using SeqGAN

This notebook generates news headlines with SeqGAN, a Generative Adversarial Network (GAN) for text in which a generator and a discriminator are trained against each other.

## Import dependencies



In [1]:
# Mount Google Drive into the Colab VM; the next cell changes into a project
# folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Work from the project folder on Drive: later cells use relative paths
# ('headlines.csv', './results/...'), so the working directory matters.
%cd /content/drive/My\ Drive/Colab\ Notebooks/Masterproject
!ls

/content/drive/My Drive/Colab Notebooks/Masterproject
abcnews-date-text.csv  screenshots
Graph		       SeqGAN
headlines.csv	       SeqGAN_headlines_dataloading_experiments.ipynb
news-headlines.db      SeqGAN_headlines.ipynb
results


**Install required dependencies manually**

In [3]:
# Install the packages that the import cell below needs.
# NOTE(review): none of these installs is version-pinned, so a fresh runtime
# may resolve different versions than the ones this notebook was written for.
!pip install tflearn
!pip install tqdm
# Force-reinstalls tqdm from the `colab` archive of the chengs/tqdm fork,
# replacing the tqdm installed by the line above.
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip
!pip install tensorboardcolab

Collecting https://github.com/chengs/tqdm/archive/colab.zip
  Downloading https://github.com/chengs/tqdm/archive/colab.zip
[K     / 614kB 576kB/s
Building wheels for collected packages: tqdm
  Running setup.py bdist_wheel for tqdm ... [?25l- done
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-8kf29u8t/wheels/41/18/ee/d5dd158441b27965855b1bbae03fa2d8a91fe645c01b419896
Successfully built tqdm
[31mspacy 2.0.18 has requirement numpy>=1.15.0, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mpymc3 3.6 has requirement joblib<0.13.0, but you'll have joblib 0.13.0 which is incompatible.[0m
[31mfeaturetools 0.4.1 has requirement pandas>=0.23.0, but you'll have pandas 0.22.0 which is incompatible.[0m
Installing collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.28.1


**Import required modules**

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import datetime

from keras.preprocessing.text import Tokenizer
from tflearn.data_utils import pad_sequences, to_categorical
from tensorflow.contrib import slim
from tqdm import tqdm, tnrange

from tensorboardcolab import *

Using TensorFlow backend.


In [5]:
# Abort early unless TensorFlow reports the first GPU; the discriminator pins
# its embedding layer to 'gpu:0'.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


## Load Dataset

In [6]:
# Read only the `text` and `fake` columns, shuffle all rows, and keep the
# first 50,000 of the shuffled frame as the real-headline corpus.
df_real = (
    pd.read_csv('headlines.csv', sep=',', usecols=['text', 'fake'])
      .sample(frac=1)[:50000]
)
print(df_real.shape)
df_real.head()

(50000, 2)


Unnamed: 0,text,fake
774512,fire destroys truck in north qld,0
515685,missing millionaire active on facebook,0
58436,support for derby wharf upgrade,0
169780,full text pm announces woods release,0
955724,real madrid sacks manager ancelotti,0


In [7]:
# Empty frame with the same columns as df_real; add_fake_samples() fills it
# with generated headlines (fake == 1) during adversarial training.
df_fake = pd.DataFrame(columns=['text', 'fake'])
print(df_fake.shape)
df_fake.head()

(0, 2)


Unnamed: 0,text,fake


## Define Training Set, Test Set and Hyperparameters

In [0]:
# General hyper-parameters
BATCH_SIZE = 32 # fixed batch dimension of the input placeholders of both models
SEQ_LENGTH = 8 # average sequence length of the given sentences in the dataset; sequences are padded/truncated to it
TRAINING_SPLIT = 0.2 # fraction of the shuffled samples that get_datasets() puts into the training split
TOTAL_EPOCH = 50 # number of adversarial training epochs (200 was noted as an alternative value)

# Discriminator hyper-parameters
D_PRETRAIN_EPOCHS = 50 # only referenced by the disabled pretraining code in the training cell
D_EMB_SIZE = 100 # width of the discriminator's embedding vectors
D_EMB_DIM = 64 # NOTE(review): not referenced anywhere else in this notebook
D_FILTER_SIZES = [2,3] # convolution filter heights, i.e. number of consecutive tokens each filter spans
D_NUM_CLASSES = 2 # one-hot classes built from the `fake` column (0 = real, 1 = fake)
D_NUM_FILTERS = 50 # number of convolution filters per filter size

# Generator hyper-parameters
G_PRETRAIN_EPOCHS = 1000 # only referenced by the disabled pretraining code in the training cell
G_START_TOKEN = 0 # NOTE(review): passed to Generator, which currently does not use it
G_EMB_SIZE = 300 # width of the generator's embedding vectors
G_EMB_DIM = 32 # NOTE(review): stored by Generator as emb_dim but not used when building the graph
G_HIDDEN_DIM = 32 # hidden state dimension of lstm cell; NOTE(review): Generator ignores it and hard-codes 128

In [0]:
import random

# Tokenizer shared by all dataset helpers. It is fitted lazily by
# get_tokenizer() and reset whenever this cell is re-run.
_SHARED_TOKENIZER = None

def get_datasets(texts, labels, tokenizer=None):
  """Turn raw texts and labels into shuffled, padded train/test arrays.

  Args:
    texts: list of headline strings.
    labels: list of integer class labels aligned with `texts`.
    tokenizer: optional, already fitted tokenizer used to map words to ids.
      When omitted, a new tokenizer is fitted on `texts` (the previous
      behaviour). Pass a shared tokenizer whenever the resulting ids must be
      comparable between calls: a tokenizer fitted on a different corpus
      assigns different ids to the same words.

  Returns:
    (X_train, y_train, X_test, y_test) where X_* are integer arrays of shape
    (n, SEQ_LENGTH), y_train is one-hot encoded with two classes, and y_test
    holds the raw labels (it is not one-hot encoded).
  """
  if tokenizer is None:
    tokenizer = init_tokenizer(texts)

  sequences = tokenizer.texts_to_sequences(texts)
  text_seq = pad_sequences(sequences, maxlen=SEQ_LENGTH)

  labels = np.asarray(labels)

  # Shuffle texts and labels with the same permutation.
  indices = np.arange(text_seq.shape[0])
  np.random.shuffle(indices)
  text_seq = text_seq[indices]
  labels = labels[indices]

  # TRAINING_SPLIT is the share of samples that goes into the training split;
  # the remainder becomes the test split.
  train_size = int(TRAINING_SPLIT * text_seq.shape[0])

  X_train = text_seq[:train_size]
  y_train = to_categorical(labels[:train_size], 2)
  X_test = text_seq[train_size:]
  y_test = labels[train_size:]

  return X_train, y_train, X_test, y_test

def init_tokenizer(texts):
  """Return a new Keras Tokenizer fitted on `texts`."""
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(texts)

  return tokenizer

def load_data(df):
  """Extract the stripped `text` values and the `fake` labels of `df` as two lists."""
  texts = []
  labels = []

  for text, label in zip(df['text'], df['fake']):
    texts.append(text.strip())
    labels.append(label)

  return texts, labels

def get_tokenizer():
  """Return the tokenizer shared by all dataset helpers.

  The tokenizer is fitted on the real headlines the first time it is needed
  and cached afterwards, so every dataset (real, fake, mixed) and the
  vocabulary derived from it use the same word ids.
  """
  global _SHARED_TOKENIZER
  if _SHARED_TOKENIZER is None:
    texts, _ = load_data(df_real)
    _SHARED_TOKENIZER = init_tokenizer(texts)

  return _SHARED_TOKENIZER

def get_fake_data():
  """Reshuffle the generated headlines and return their train/test arrays."""
  global df_fake
  df_fake = df_fake.sample(frac=1)

  texts, labels = load_data(df_fake)
  return get_datasets(texts, labels, tokenizer=get_tokenizer())

def get_real_data():
  """Reshuffle the real headlines and return their train/test arrays."""
  global df_real
  df_real = df_real.sample(frac=1)

  texts, labels = load_data(df_real)
  return get_datasets(texts, labels, tokenizer=get_tokenizer())

def get_mixed_data():
  """Return train/test arrays built from real and generated headlines together."""
  # No `global` statement is needed here: df_real and df_fake are only read.
  df = pd.concat([df_real, df_fake])
  df = df.sample(frac=1)

  texts, labels = load_data(df)
  return get_datasets(texts, labels, tokenizer=get_tokenizer())


In [0]:
tokenizer = get_tokenizer()

# The Keras Tokenizer numbers words from 1 and never assigns id 0, which the
# padded sequences use as filler. WORD_INDEX therefore gets an empty
# placeholder at position 0 so that WORD_INDEX[token_id] is the word with that
# id, and VOCAB_SIZE counts the padding id so that the largest word id is still
# a valid row of the embedding matrices.
WORD_INDEX = [''] + sorted(tokenizer.word_index, key=tokenizer.word_index.get)
VOCAB_SIZE = len(WORD_INDEX) # number of different token ids (all words + padding id 0)

## Discriminator
A model for classifying sequences (here headlines) as real or fake.
In this implementation the discriminative model uses the following layers:
1.   embedding layer
2.   convolution layer
3.   max-pooling layer
4.   highway layer followed by dropout
5.   softmax layer

In [0]:
class Discriminator:
  """Convolutional text classifier that separates real from generated headlines.

  Graph: embedding -> one convolution + max-pool branch per filter size ->
  highway layer -> dropout -> linear layer with softmax. The labels fed to
  `y_input` are one-hot vectors built from the dataset's `fake` column, so
  class 0 means "real" and class 1 means "fake".
  """

  def __init__(self, batch_size, vocab_size, seq_length, emb_size, filter_sizes, num_classes, num_filters, learning_rate):
    """Store the hyper-parameters and create the input placeholders.

    Args:
      batch_size: fixed number of sequences per batch.
      vocab_size: number of rows of the embedding matrix.
      seq_length: number of tokens per (padded) sequence.
      emb_size: width of each embedding vector.
      filter_sizes: list of convolution filter heights (tokens per filter).
      num_classes: number of output classes.
      num_filters: number of filters per filter size.
      learning_rate: learning rate of the Adam optimizer.
    """
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.seq_length = seq_length
    self.filter_sizes = filter_sizes
    self.num_classes = num_classes
    self.num_filters = num_filters
    self.learning_rate = learning_rate

    self.X_input = tf.placeholder(tf.int32, shape=[self.batch_size, self.seq_length], name='d_X_input')
    self.y_input = tf.placeholder(tf.int32, shape=[self.batch_size, self.num_classes], name='d_y_input')
    # Probability of KEEPING a unit in the dropout layer (1.0 disables dropout).
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='d_dropout_keep_prob')

    # L2 penalty on the softmax layer; accumulated in build_softmax_layer().
    self.l2_reg_lambda = 0.2
    self.l2_loss = tf.constant(0.0)

  def build_model(self):
    """Build the whole graph: layers, loss, accuracy, optimizer and summaries."""
    with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE):
      self.embedding_layer = self.build_embedding_layer()
      self.convolution_maxpool_layer = self.build_convolution_maxpool_layer()
      self.scores, self.predictions = self.build_softmax_layer()

      self.calc_mean_cross_entropy_loss()
      self.calc_accuracy_and_cost()

      # Minimize the regularized loss. Minimizing `self.cost` instead would
      # leave the L2 term out of training even though it is part of the
      # `d_loss` value reported to TensorBoard.
      self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

      self.d_summary = tf.summary.merge([
          tf.summary.scalar('d_loss', self.loss),
          tf.summary.scalar('d_accuracy', self.accuracy)
      ])

  def build_embedding_layer(self):
    """Look up the token embeddings and append a channel axis for conv2d."""
    with tf.device('gpu:0'), tf.name_scope('d_embedding_layer'):
      emb_matrix = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0))
      emb_lookup = tf.nn.embedding_lookup(emb_matrix, self.X_input)
      # conv2d expects a trailing channel dimension.
      emb_lookup_expand = tf.expand_dims(emb_lookup, -1)

      return emb_lookup_expand

  def build_convolution_maxpool_layer(self):
    """Apply one convolution + max-pool branch per filter size and concatenate them.

    Returns:
      Tensor of shape [-1, num_filters * len(filter_sizes)].
    """
    with tf.name_scope('d_convolution_maxpool_layer'):
      pooled_outputs = []
      for filter_size in self.filter_sizes:
        with tf.name_scope('conv-maxpool-%s' % filter_size):
          # Each filter spans `filter_size` tokens and the full embedding width.
          filter_shape = [filter_size, self.emb_size, 1, self.num_filters]
          W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='d_W')
          b = tf.Variable(tf.constant(0.1, shape=[self.num_filters]), name='d_b')
          conv = tf.nn.conv2d(
              input=self.embedding_layer,
              filter=W,
              strides=[1,1,1,1],
              padding='VALID',
              name='d_conv'
          )
          activation = tf.nn.relu(tf.nn.bias_add(conv, b), name='d_relu')
          # Pool over every position the filter could be applied to, leaving
          # one value per filter.
          max_pooling = tf.nn.max_pool(
              value=activation,
              ksize=[1, self.seq_length-filter_size+1, 1, 1],
              strides=[1,1,1,1],
              padding='VALID',
              name='max_pooling'
          )
          pooled_outputs.append(max_pooling)

      # Combine the pooled features of all filter sizes.
      self.num_filter_total = self.num_filters * len(self.filter_sizes)
      h_pool = tf.concat(pooled_outputs, axis=3)

      return tf.reshape(h_pool, [-1, self.num_filter_total])

  def build_softmax_layer(self):
    """Add the highway, dropout and output layers.

    Sets `self.scores` (logits), `self.truth_prob` (probability that a
    sequence is real) and accumulates the L2 penalty of the output layer.

    Returns:
      (scores, predictions): the logits and the arg-max class per sequence.
    """
    with tf.name_scope('highway'):
      self.h_highway = self.highway(
          self.convolution_maxpool_layer, self.convolution_maxpool_layer.get_shape()[1], 1, 0
      )

    with tf.name_scope('dropout'):
      self.h_drop = tf.nn.dropout(self.h_highway, self.dropout_keep_prob)

    with tf.name_scope('softmax_output'):
      W_softmax = tf.Variable(
          tf.truncated_normal(
              [self.num_filter_total, self.num_classes],
              stddev=0.1
          ), name='d_W_softmax'
      )
      b_softmax = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name='d_b_softmax')

      self.l2_loss += tf.nn.l2_loss(W_softmax)
      self.l2_loss += tf.nn.l2_loss(b_softmax)

      self.scores = tf.nn.xw_plus_b(self.h_drop, W_softmax, b_softmax, name='d_scores')
      # The one-hot labels come from the `fake` column (0 = real, 1 = fake),
      # so the probability of a sequence being real is the softmax output of
      # class 0. Taking class 1 would reward the generator for sequences that
      # the discriminator recognizes as fake.
      self.truth_prob = tf.nn.softmax(self.scores, -1)[:, 0]
      predictions = tf.argmax(self.scores, 1, name='d_predictions')

    return self.scores, predictions

  def calc_mean_cross_entropy_loss(self):
    """Set `self.loss` to the mean softmax cross-entropy plus the weighted L2 penalty."""
    # The scope is named 'mse_loss' although the loss is a cross-entropy.
    with tf.name_scope('mse_loss'):
      losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores, labels=self.y_input)
      self.loss = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss

  def calc_accuracy_and_cost(self):
    """Set `self.accuracy` and `self.cost` (mean cross-entropy without the L2 penalty)."""
    with tf.name_scope('accuracy'):
      correct_predictions = tf.equal(self.predictions, tf.argmax(self.y_input, 1))
      self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')
      self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores, labels=self.y_input))

  def highway(self, input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).
    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.

    Args:
      input_: 2-D input tensor.
      size: width of the input (and output) features.
      num_layers: number of stacked highway layers; 0 returns the input unchanged.
      bias: constant added to the gate's pre-activation.
      f: non-linearity applied to the transformed input.
    """
    with tf.variable_scope('highway'):
      size = int(size)
      # Start from the input so that num_layers == 0 is a valid pass-through
      # instead of leaving `output` undefined.
      output = input_
      for idx in range(num_layers):
        g = f(slim.fully_connected(output, size, scope='highway_lin_%d' % idx, activation_fn=None))
        t = tf.sigmoid(slim.fully_connected(output, size, scope='highway_gate_%d' % idx, activation_fn=None) + bias)

        output = t * g + (1. - t) * output

    return output

  def get_truth_prob(self, sess, X):
    """Return, for each sequence in batch `X`, the probability that it is real.

    Dropout is disabled (keep probability 1.0) for this evaluation.
    """
    feed_dict = { self.X_input: X, self.dropout_keep_prob: 1.0 }

    return sess.run(self.truth_prob, feed_dict=feed_dict)

  def train(self, sess, X, y, dropout):
    """Run one optimizer step on batch (X, y) and return the merged summary.

    Args:
      sess: TensorFlow session holding the initialized graph.
      X: integer token ids of shape [batch_size, seq_length].
      y: one-hot labels of shape [batch_size, num_classes].
      dropout: fed to the dropout layer as its KEEP probability.
    """
    feed_dict = {
        self.X_input: X,
        self.y_input: y,
        self.dropout_keep_prob: dropout
    }
    _, summary = sess.run([self.optimizer, self.d_summary], feed_dict=feed_dict)

    return summary

## Build the Discriminator model (it is trained in the adversarial loop below)

In [0]:
# Start from an empty default graph so that re-running this cell does not
# stack a second copy of the model onto the previous one.
tf.reset_default_graph()

discriminator = Discriminator(
    batch_size=BATCH_SIZE,
    vocab_size=VOCAB_SIZE,
    seq_length=SEQ_LENGTH,
    emb_size=D_EMB_SIZE,
    filter_sizes=D_FILTER_SIZES,
    num_classes=D_NUM_CLASSES,
    num_filters=D_NUM_FILTERS,
    learning_rate=0.001,
)
discriminator.build_model()

## Generator

LSTM with Reinforcement Learning for sequence generation.

In [0]:
from tensorflow.contrib import rnn, layers, seq2seq, slim

# inspired by https://www.oreilly.com/ideas/introduction-to-lstms-with-tensorflow
# Decoder/Encoder approach from https://github.com/LA-JAMES/-Encoder-Decoder-Simple-Deep-LSTM-for-Tensorflow

class Generator:
  """LSTM headline generator trained with rewards from the discriminator.

  The graph embeds the input tokens, runs them through an LSTM and projects
  the LSTM outputs onto the vocabulary. Training minimizes the negative
  log-likelihood of the input tokens weighted by per-token rewards.
  """

  def __init__(self, batch_size, seq_length, vocab_size, emb_size, emb_dim,
               hidden_dim, start_token, word_index, learning_rate):
    """Store the hyper-parameters and create the input placeholders.

    Args:
      batch_size: fixed number of sequences per batch.
      seq_length: number of tokens per sequence.
      vocab_size: number of token ids (rows of the embedding matrix).
      emb_size: width of each embedding vector.
      emb_dim: stored, but not used when building the graph.
      hidden_dim: accepted for interface compatibility; currently unused.
      start_token: accepted for interface compatibility; currently unused.
      word_index: sequence mapping a token id to its word.
      learning_rate: learning rate of the Adam optimizer.
    """
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.emb_dim = emb_dim
    # NOTE(review): hidden_dim and start_token are intentionally not stored;
    # the LSTM width is hard-coded in hidden_layer_sizes below.
    self.word_index = word_index
    self.learning_rate = learning_rate
    self.temperature = 1.0
    self.grad_clip = 5.0
    self.hidden_layer_sizes = [128]
    self.hidden_layer_size = 128

    self.X_input = tf.placeholder(tf.int32, shape=[self.batch_size, self.seq_length], name='g_X_input')
    self.y_input = tf.placeholder(tf.int32, shape=[None, self.vocab_size], name='g_y_input')
    # Probability of KEEPING an LSTM output unit (fed by the `dropout` arguments).
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='g_dropout_keep_prob')
    self.rewards = tf.placeholder(tf.float32, shape=[self.batch_size, self.seq_length], name='rewards')

  def build_model(self):
    """Build the graph: embedding, LSTM outputs, predictions, loss, train op, summaries."""
    with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
      self.embedding_layer = self.build_embedding_layer(self.X_input)

      # One RNN output tensor per rollout step (seq_length + 1 in total).
      # NOTE(review): every iteration unrolls the same embedded input inside
      # an AUTO_REUSE scope, so the outputs presumably share weights and
      # differ only through dropout - confirm this is intended.
      self.outputs = []
      for _ in range(self.seq_length+1):
        output, _final_state = self.build_lstm_layers()
        self.outputs.append(output)

      # PRETRAINING (disabled; pretrain() needs these ops to be restored)
      # self.pretrain_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
      # self.pretrain_calc_loss()
      # self.pretrain_operation = slim.learning.create_train_op(
      #     self.pretrain_loss, self.pretrain_optimizer, clip_gradient_norm=5.0
      # )

      # TRAINING
      self.predictions, loss = self.get_prediction_and_loss()
      optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
      self.train_operation = slim.learning.create_train_op(loss, optimizer, clip_gradient_norm=self.grad_clip)

      # SUMMARY
      # Disabled together with pretraining:
      # self.g_pretrain_summary = tf.summary.scalar('g_pretrain_loss', pretrain_loss)
      self.g_train_summary = tf.summary.merge([
          tf.summary.scalar('g_loss', loss),
          tf.summary.scalar('g_reward', tf.reduce_mean(self.rewards))
      ])

  def build_embedding_layer(self, X_input):
    """Return the embedding vectors of the token ids in `X_input`."""
    with tf.name_scope('g_embedding_layer'):
      emb_matrix = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0))
      emb_lookup = tf.nn.embedding_lookup(emb_matrix, X_input)

    return emb_lookup

  def build_lstm_layers(self):
    """Run the embedded input through the (dropout-wrapped) LSTM stack.

    Returns:
      (outputs, final_state) as produced by tf.nn.dynamic_rnn.
    """
    with tf.name_scope('g_lstm_layers'):
      # Local names avoid shadowing the imported `layers` module.
      cells = [rnn.LSTMCell(layer_size) for layer_size in self.hidden_layer_sizes]
      wrapped_cells = [rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.dropout_keep_prob) for lstm_cell in cells]
      cell = rnn.MultiRNNCell(wrapped_cells)

      initial_state = cell.zero_state(self.batch_size, tf.float32)
      outputs, final_state = tf.nn.dynamic_rnn(cell, self.embedding_layer, initial_state=initial_state)

    return outputs, final_state

  def get_prediction_and_loss(self):
    """Project the LSTM outputs onto the vocabulary and build the training loss.

    Returns:
      (predictions, loss): `predictions` is a list with one softmax tensor of
      shape [batch_size * seq_length, vocab_size] per entry of self.outputs;
      `loss` is the negative sum of reward-weighted log-probabilities that
      predictions[-1] assigns to the tokens of X_input.
    """
    predictions = []
    W2 = tf.Variable(tf.random_normal([self.hidden_layer_sizes[-1], self.vocab_size]), dtype=tf.float32)
    b2 = tf.Variable(tf.zeros([1, self.vocab_size]), dtype=tf.float32)
    for i in range(self.seq_length+1):
      output = tf.reshape(self.outputs[i], [-1, self.hidden_layer_sizes[-1]])

      logits = tf.matmul(output, W2) + b2 # broadcasted addition
      predictions.append(tf.nn.softmax(logits))

    # NOTE(review): the target at each position is the input token at the same
    # position (no shift by one) - confirm this is the intended objective.
    target_one_hot = tf.one_hot(tf.to_int32(tf.reshape(self.X_input, [-1])), self.vocab_size, 1.0, 0.0)
    # Clip before the log so that a zero probability cannot produce -inf.
    log_probs = tf.log(tf.clip_by_value(tf.reshape(predictions[-1], [-1, self.vocab_size]), 1e-20, 1.0))
    loss = -tf.reduce_sum(
        tf.reduce_sum(target_one_hot * log_probs, 1) * tf.reshape(self.rewards, [-1])
    )

    return predictions, loss

  def generate(self, sess, given_tokens, dropout=0.75):
    """Generate one batch of headlines conditioned on `given_tokens`.

    Args:
      sess: TensorFlow session holding the initialized graph.
      given_tokens: token ids of shape [batch_size, seq_length].
      dropout: keep probability fed to the LSTM dropout wrapper.

    Returns:
      (sentences, sequences): the generated strings and their token ids.
    """
    feed_dict = { self.X_input: given_tokens, self.dropout_keep_prob: dropout }
    samples = sess.run([self.predictions[0]], feed_dict=feed_dict)
    sentences, sequences = self.translate_samples(samples)

    return sentences, sequences

  def sample(self, predictions, temperature):
    """Draw one index from `predictions[0]` after rescaling it by `temperature`.

    Currently not called by any other method of this class.
    """
    preds = np.log(predictions[0]) / temperature
    preds = np.exp(preds) / np.sum(np.exp(preds))
    rand = random.random() # range: [0,1)
    total = 0.0
    # Walk the cumulative distribution until it exceeds the random draw.
    for i in range(len(preds)):
      total += preds[i]
      if total > rand:
        return i

    return len(preds)-1

  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
  # REINFORCEMENT LEARNING
  def rollout(self, sess, given_tokens, keep_steps=0, dropout=0.75):
    """Evaluate the prediction tensor of rollout step `keep_steps` for `given_tokens`."""
    feed_dict = { self.X_input: given_tokens, self.dropout_keep_prob: dropout }
    output_tensors = self.predictions[keep_steps]

    return sess.run(output_tensors, feed_dict=feed_dict)

  def get_reward(self, sess, given_tokens, rollout_num, dis):
    """Estimate a reward for every token position by Monte Carlo rollouts.

    Args:
      sess: TensorFlow session holding the initialized graph.
      given_tokens: token ids of shape [batch_size, seq_length].
      rollout_num: number of rollouts averaged per position.
      dis: discriminator providing get_truth_prob(sess, sequences).

    Returns:
      Array of shape [batch_size, seq_length] with the mean discriminator
      score per position.
    """
    rewards = np.zeros((self.batch_size, self.seq_length))
    # Rollout step k scores token position k - 1, so that every position -
    # including the first token - receives a reward. Iterating only over
    # range(1, seq_length) and writing to column k left column 0 at zero.
    for keep_num in range(1, self.seq_length + 1):
      for _ in range(rollout_num):
        # Markov Chain Monte Carlo Sample
        mc_sample = self.rollout(sess, given_tokens, keep_steps=keep_num)
        _sentences, sequence = self.translate_samples(mc_sample)
        rewards[:, keep_num - 1] += dis.get_truth_prob(sess, sequence)

    rewards /= rollout_num
    return rewards
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

  def pretrain(self, sess, given_tokens, dropout=0.75):
    """Run one pretraining step and return its summary.

    Raises:
      AttributeError: if the pretraining ops are not part of the graph (they
        are currently disabled in build_model()).
    """
    if not hasattr(self, 'pretrain_operation') or not hasattr(self, 'g_pretrain_summary'):
      raise AttributeError(
          'Generator has no pretraining ops; enable the pretraining section in build_model() first'
      )

    feed_dict = { self.X_input: given_tokens, self.dropout_keep_prob: dropout }
    # Run the train op exactly once per call; a second run would apply a
    # second, unlogged update for the same batch.
    _, summary = sess.run([self.pretrain_operation, self.g_pretrain_summary], feed_dict=feed_dict)

    return summary

  def train(self, sess, given_tokens, rewards, dropout=0.75):
    """Run one reward-weighted training step and return the merged summary."""
    feed_dict = { self.X_input: given_tokens, self.rewards: rewards, self.dropout_keep_prob: dropout }
    _, summary = sess.run([self.train_operation, self.g_train_summary], feed_dict=feed_dict)

    return summary

  def translate_samples(self, sequence):
    """Convert softmax outputs into sentences and token ids.

    Args:
      sequence: array-like that can be reshaped to
        [batch_size, seq_length, vocab_size].

    Returns:
      (sentences, vectors): a list of batch_size strings (every word is
      followed by a space) and an integer array of shape
      [batch_size, seq_length] holding the arg-max token id per position.
    """
    batch_softmax = np.reshape(sequence, [self.batch_size, self.seq_length, self.vocab_size])

    # Most probable token id for every position of every sequence.
    vectors = np.argmax(batch_softmax, axis=-1)
    sentences = [
        ''.join(self.word_index[token_id] + ' ' for token_id in token_ids)
        for token_ids in vectors
    ]

    return sentences, vectors

## Build Generator model

In [0]:
# Build the generator in the same default graph as the discriminator.
generator = Generator(
    batch_size=BATCH_SIZE,
    seq_length=SEQ_LENGTH,
    vocab_size=VOCAB_SIZE,
    emb_size=G_EMB_SIZE,
    emb_dim=G_EMB_DIM,
    hidden_dim=G_HIDDEN_DIM,
    start_token=G_START_TOKEN,
    word_index=WORD_INDEX,
    learning_rate=0.0001,
)

generator.build_model()

## Start Adversarial Training

In [15]:
# Helper whose writer the training cell uses for its summaries.
tbc = TensorBoardColab()

# Additionally start a TensorBoard process in the background that serves
# LOG_DIR on port 6006.
LOG_DIR = '/tmp/log'
tensorboard_command = 'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'.format(LOG_DIR)
get_ipython().system_raw(tensorboard_command)

Wait for 8 seconds...
TensorBoard link:
http://a2c472c6.ngrok.io


In [0]:
def add_fake_samples(headlines):
  """Append generated headlines to the global `df_fake`, labelled as fake (1).

  All rows are collected first and concatenated in a single step. Appending
  one row at a time copies the whole frame for every headline, and
  `DataFrame.append` is no longer available in pandas 2.x.

  Args:
    headlines: iterable of generated headline strings; an empty iterable
      leaves `df_fake` unchanged.
  """
  global df_fake

  rows = [{'text': headline, 'fake': 1} for headline in headlines]
  if not rows:
    return

  new_samples = pd.DataFrame(rows, columns=['text', 'fake'])
  df_fake = pd.concat([df_fake, new_samples], ignore_index=True)

In [18]:
# Adversarial training: each epoch performs one generator update followed by
# several discriminator updates; summaries go to the TensorBoardColab writer.
with tf.Session() as sess:
  print('start GAN training at', datetime.datetime.now())
  writer = tbc.get_writer()
  sess.run(tf.global_variables_initializer())
    
  # The string literal below is disabled pretraining code kept for reference;
  # it is never executed. NOTE(review): it calls get_data(), which is not
  # defined in this notebook, and discriminator.train() with more arguments
  # than the method accepts. Its German note reads "seq_length - 1 and start
  # from 1".
  """# PRETRAINING GENERATOR
  for epoch in range(G_PRETRAIN_EPOCHS):
    # seq_length -1 und von 1 anfangen
    X_train, y_train, X_test, y_test = get_mixed_data()
    summary = generator.pretrain(sess, X_train[:BATCH_SIZE])
    writer.add_summary(summary, epoch)
    
  # PRETRAIN DISCRIMINATOR
  for eoch in range(D_PRETRAIN_EPOCHS):
    fake_sentences, fake_sequences = generator.generate(sess, X_train[:BATCH_SIZE])
    #translate_samples(tokenizer, fake_samples, df_fake)
    X_train, y_train, X_test, y_test = get_data()
    
    for _ in range(3):
      discriminator.train(sess, X_train, y_train, 5, BATCH_SIZE, .001)"""

  # ADVERSARIAL TRAINING
  for epoch in tnrange(TOTAL_EPOCH, desc='gan_epoch_loop'):
    X_train, y_train, X_test, y_test = get_real_data()
    # train generator for one step
    for it in range(1):
      # Generate a batch conditioned on real headlines, store the generated
      # text as fake samples, then score it with the discriminator.
      fake_sentences, fake_sequences = generator.generate(sess, X_train[:BATCH_SIZE])
      add_fake_samples(fake_sentences)
      # 16 = number of rollouts averaged per token position.
      rewards = generator.get_reward(sess, fake_sequences, 16, discriminator)
      summary = generator.train(sess, fake_sequences, rewards)
    
    # Log the generator summary of this epoch.
    writer.add_summary(summary, epoch)
    
    # Discriminator phase: 5 rounds, each adding one batch of new fake samples
    # and then running 3 discriminator updates on freshly mixed data.
    for _ in tnrange(5, desc='gen_train_loop'):
      X_train, y_train, X_test, y_test = get_real_data()
      samples, outputs = generator.generate(sess, X_train[:BATCH_SIZE])
      add_fake_samples(samples)
      
      for _ in tnrange(3, desc='dis_train_loop'):
        X_train, y_train, X_test, y_test = get_mixed_data()
        # 0.4 is fed to the discriminator as its dropout KEEP probability.
        summary = discriminator.train(sess, X_train[:BATCH_SIZE], y_train[:BATCH_SIZE], 0.4)
        
    print(df_fake.tail(2))
    # Only the summary of the last discriminator update of this epoch is logged.
    writer.add_summary(summary, epoch)
    
    # Disabled summary code kept as a string literal; it is never executed.
    """tokenizer, word_index, X_real_train, y_real_train, X_real_test, y_real_test = get_real_data()
    summary = sess.run(generator.image_summary, feed_dict={ generator.given_tokens: X_real_train[:BATCH_SIZE] })
    writer.add_summary(summary, epoch)"""
    
  print('finish GAN training at', datetime.datetime.now())

start GAN training at 2019-01-10 10:12:24.357224




                                                  text fake
190  floral diversions bowraville nation 20yrs gumb...    1
191  conscience performers ing 4yo oshane spagnuolo...    1


                                                  text fake
382  'three alimony mistrial fanned consoled deputy...    1
383  limps mcewan cling tackles platypus essential'...    1


KeyboardInterrupt: ignored

In [0]:
from pathlib import Path

print(df_fake.tail())

# Try generated_headlines_1.csv, generated_headlines_2.csv, ... and write the
# generated headlines to the first name that does not exist yet, so earlier
# results are never overwritten.
count = 1
saved = False
while not saved:
  filename = 'generated_headlines_%d.csv' % count
  filepath = Path('./results/%s' % filename)

  if not filepath.exists():
    df_fake.to_csv(filepath, sep='\t', encoding='utf-8')
    saved = True
  else:
    count += 1