In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
## Preprocess

In [None]:
def initial_clean(text):
    """
    Normalise a raw string and split it into lowercase word tokens.

    Steps, in order:
      1. replace URLs, ``www`` links and ``@`` mentions (plus whatever is glued
         to them) with a space;
      2. collapse every whitespace run (tabs, newlines, ...) into one space;
      3. delete every character that is not an ASCII letter or a space;
      4. lowercase;
      5. tokenise with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize``.
    """
    # Raw strings: "\S" and "\@" are invalid escape sequences in a normal
    # string literal and trigger warnings on recent Python versions.
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    # Without this step the character filter below would delete tabs/newlines
    # outright and fuse the words on either side ("foo\nbar" -> "foobar").
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case the text
    text = nltk.word_tokenize(text)

    return text

stop_words = stopwords.words('english')
# Immutable set copy used for lookups: membership tests on the list above scan
# it linearly for every token. The list itself is kept for code that reads it.
_stop_word_set = frozenset(stop_words)

def remove_stop_words(text):
    """
    Drop stop words from a token list.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not in ``stop_words``, in their original order.
    """
    return [word for word in text if word not in _stop_word_set]

stemmer = PorterStemmer()

def stem_words(text):
    """
    Stem every token and discard stems shorter than two characters.

    Parameters
    ----------
    text : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        Stemmed tokens of length > 1, in their original order.

    Notes
    -----
    The stemmer has been seen to raise ``IndexError`` on some words (the word
    "oed" did). Such a word is kept unstemmed instead of abandoning stemming
    and length filtering for the entire document.
    """
    stemmed = []
    for word in text:
        try:
            stemmed.append(stemmer.stem(word))
        except IndexError:
            # Only this word is affected; keep it as-is and carry on.
            stemmed.append(word)
    # make sure we have no 1 letter words
    return [word for word in stemmed if len(word) > 1]

def apply_all(text):
    """
    Run the whole preprocessing pipeline on one raw string.

    The text goes through ``initial_clean``, then ``remove_stop_words``, then
    ``stem_words``; the result of the last step is returned.
    """
    tokens = initial_clean(text)
    tokens = remove_stop_words(tokens)
    return stem_words(tokens)

In [None]:
## Read and process train data

In [None]:
# Load the tab-separated training and dev splits.
train_b = pd.read_csv('train_final.tsv', encoding="utf-8", delimiter = '\t')
dev_b = pd.read_csv('dev.tsv', encoding="utf-8", delimiter = '\t')

# The dev rows are appended to the training rows, so the model is fitted on
# both files. NOTE(review): any later evaluation on dev.tsv is therefore not
# a held-out evaluation.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the index labels of both inputs.
train_b = pd.concat([train_b, dev_b], ignore_index=True)
(x_train, y_train) = (train_b['sentence'], train_b['label'])

print(len(x_train))
print(len(y_train))

# Clean, tokenise, remove stop words and stem every sentence, and time it.
t1 = time.time()
x_train = x_train.apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(x_train), "articles:", (t2-t1)/60, "min")

2418
2418
Time to clean and tokenize 2418 articles: 0.5670200069745381 min


In [None]:
## LLDA

In [None]:
from optparse import OptionParser
import sys, re, numpy

def load_corpus(filename):
    """
    Read a labelled corpus from a text file, one document per line.

    A line may start with ``[label1,label2,...]``; the rest of the line is the
    document text. Lines without that prefix get ``None`` as their label.
    Text is lowercased and split into word tokens (an inner apostrophe is kept,
    e.g. ``it's``). Lines that yield no tokens are skipped.

    Parameters
    ----------
    filename : str
        Path of the corpus file.

    Returns
    -------
    labelset : list of str
        Every distinct label, in order of first appearance. Returned as a real
        list (not a dict view) because ``LLDA.set_corpus`` calls ``insert`` on it.
    corpus : list of list of str
        Tokenised documents.
    labels : list
        For each document, its list of labels or ``None``.
    """
    corpus = []
    labels = []
    labelmap = dict()
    # Context manager so the handle is closed even if parsing raises.
    with open(filename, 'r') as f:
        for line in f:
            mt = re.match(r'\[(.+?)\](.+)', line)
            if mt:
                label = mt.group(1).split(',')
                for x in label:
                    labelmap[x] = 1
                line = mt.group(2)
            else:
                label = None
            doc = re.findall(r'\w+(?:\'\w+)?', line.lower())
            if len(doc) > 0:
                corpus.append(doc)
                labels.append(label)
    return list(labelmap.keys()), corpus, labels

class LLDA:
    """
    Labeled LDA topic model driven by per-token topic resampling.

    Topic 0 is always a shared "common" topic; every further topic corresponds
    to one entry of the label set given to ``set_corpus``. A document may only
    use the common topic plus the topics of its own labels (all topics when it
    has no labels).
    """

    def __init__(self, K, alpha, beta):
        """
        Store the smoothing parameters.

        ``K`` is accepted but not stored: the number of topics is derived from
        the label set in ``set_corpus``. ``alpha`` smooths the document-topic
        counts and ``beta`` the topic-word counts.
        """
        #self.K = K
        self.alpha = alpha
        self.beta = beta

    def term_to_id(self, term):
        """Return the integer id of ``term``, registering it if it is new."""
        if term not in self.vocas_id:
            voca_id = len(self.vocas)
            self.vocas_id[term] = voca_id
            self.vocas.append(term)
        else:
            voca_id = self.vocas_id[term]
        return voca_id

    def complement_label(self, label):
        """
        Build the 0/1 vector of topics a document is allowed to use.

        A falsy ``label`` (``None`` or empty) allows every topic. Otherwise
        the common topic (index 0) and each listed label are allowed.
        """
        if not label:
            return numpy.ones(len(self.labelmap))
        vec = numpy.zeros(len(self.labelmap))
        vec[0] = 1.0
        for x in label:
            vec[self.labelmap[x]] = 1.0
        return vec

    def set_corpus(self, labelset, corpus, labels):
        """
        Load a corpus and draw an initial topic for every token.

        Parameters
        ----------
        labelset : list of str
            Label names. NOTE: "common" is inserted at position 0 of this list
            IN PLACE, so the caller's list is modified.
        corpus : iterable of list of str
            Tokenised documents.
        labels : iterable
            One entry per document: its labels, or a falsy value for
            "no restriction". Iterated exactly once.
        """
        labelset.insert(0, "common")
        self.labelmap = dict(zip(labelset, range(len(labelset))))
        self.K = len(self.labelmap)

        self.vocas = []
        self.vocas_id = dict()
        self.labels = numpy.array([self.complement_label(label) for label in labels])
        self.docs = [[self.term_to_id(term) for term in doc] for doc in corpus]

        M = len(corpus)
        V = len(self.vocas)

        self.z_m_n = []                                    # topic of every token, per document
        self.n_m_z = numpy.zeros((M, self.K), dtype=int)   # tokens of doc m assigned to topic z
        self.n_z_t = numpy.zeros((self.K, V), dtype=int)   # times term t is assigned to topic z
        self.n_z = numpy.zeros(self.K, dtype=int)          # tokens assigned to topic z overall

        for m, doc, label in zip(range(M), self.docs, self.labels):
            N_m = len(doc)
            # Initial topics are drawn uniformly from the document's allowed topics.
            z_n = [numpy.random.multinomial(1, label / label.sum()).argmax() for x in range(N_m)]
            self.z_m_n.append(z_n)
            for t, z in zip(doc, z_n):
                self.n_m_z[m, z] += 1
                self.n_z_t[z, t] += 1
                self.n_z[z] += 1

    def inference(self):
        """Run one sweep over the corpus, resampling the topic of every token."""
        V = len(self.vocas)
        for m, doc, label in zip(range(len(self.docs)), self.docs, self.labels):
            for n in range(len(doc)):
                t = doc[n]
                # Take the token out of all counts before resampling it.
                z = self.z_m_n[m][n]
                self.n_m_z[m, z] -= 1
                self.n_z_t[z, t] -= 1
                self.n_z[z] -= 1

                denom_a = self.n_m_z[m].sum() + self.K * self.alpha
                # n_z is updated in lock-step with n_z_t, so it already equals
                # n_z_t.sum(axis=1); using it avoids a K x V reduction per token.
                denom_b = self.n_z + V * self.beta
                # `label` zeroes out the topics this document may not use.
                p_z = label * (self.n_z_t[:, t] + self.beta) / denom_b * (self.n_m_z[m] + self.alpha) / denom_a
                new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                self.z_m_n[m][n] = new_z
                self.n_m_z[m, new_z] += 1
                self.n_z_t[new_z, t] += 1
                self.n_z[new_z] += 1

    def phi(self):
        """Topic-word distribution: a (K, V) array whose rows sum to 1."""
        V = len(self.vocas)
        return (self.n_z_t + self.beta) / (self.n_z[:, numpy.newaxis] + V * self.beta)

    def theta(self):
        """document-topic distribution"""
        # alpha is only added for the topics each document is allowed to use.
        n_alpha = self.n_m_z + self.labels * self.alpha
        return n_alpha / n_alpha.sum(axis=1)[:, numpy.newaxis]

    def perplexity(self, docs=None):
        """
        Perplexity of ``docs`` under the current ``phi`` and ``theta``.

        ``docs`` is a sequence of documents given as term ids and defaults to
        the training corpus. Documents are paired positionally with the rows
        of ``theta()``.
        """
        # `is None` rather than `== None`: the latter compares element-wise
        # (and then fails in the `if`) when docs is a numpy array.
        if docs is None:
            docs = self.docs
        phi = self.phi()
        thetas = self.theta()

        log_per = N = 0
        for doc, theta in zip(docs, thetas):
            for w in doc:
                log_per -= numpy.log(numpy.inner(phi[:,w], theta))
            N += len(doc)
        return numpy.exp(log_per / N)

def main():
    """
    Command-line entry point: train LLDA on a corpus file and print phi.

    Options: ``-f`` corpus file (required), ``--alpha``, ``--beta``, ``-k``
    and ``-i`` (number of sampling sweeps). One line is printed per vocabulary
    term: the term followed by its probability under every topic.
    """
    parser = OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    (options, args) = parser.parse_args()
    if not options.filename: parser.error("need corpus filename(-f)")

    labelset, corpus, labels = load_corpus(options.filename)
    # set_corpus calls labelset.insert(...), which needs a real list; a dict
    # key view (what dict.keys() yields on Python 3) has no insert().
    labelset = list(labelset)

    llda = LLDA(options.K, options.alpha, options.beta)
    llda.set_corpus(labelset, corpus, labels)

    for i in range(options.iteration):
        sys.stderr.write("-- %d " % (i + 1))
        llda.inference()

    phi = llda.phi()
    for v, voca in enumerate(llda.vocas):
        print (','.join([voca]+[str(x) for x in phi[:,v]]) )


In [None]:
## Read and process validation data

In [None]:
test_data = pd.read_csv('dev.tsv', encoding="utf-8", delimiter = '\t' )

# Keep the first 55 rows after ordering by label, then shuffle them.
test_data = test_data.sort_values(by='label')
test_data = test_data[:55]

# Fixed random_state so the shuffled order is the same on every run.
test_data = test_data.sample(frac = 1, random_state = 0)
labels = test_data['label'].tolist()

# Select the two needed columns in one step. The previous row-by-row
# DataFrame.append loop is quadratic and DataFrame.append no longer exists
# in pandas 2.x.
df2 = test_data[['sentence', 'label']].reset_index(drop=True)

(x_test, y_test) = (df2['sentence'], df2['label'])
x_test = x_test.apply(apply_all)
testSet = np.array(x_test)
testLabel = np.array(y_test)

In [None]:
## Read test data

In [None]:
# Read the test split. Running this cell replaces x_test / y_test / testSet /
# testLabel with the contents of test.tsv.
test_b = pd.read_csv('test.tsv', encoding="utf-8", delimiter = '\t' )

y_test = test_b['label']
# Same preprocessing pipeline as the training sentences.
x_test = test_b['sentence'].apply(apply_all)

testSet, testLabel = np.array(x_test), np.array(y_test)

In [None]:
# Train the LLDA (Labeled Latent Dirichlet Allocation) model defined in the cell above

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
import sys

trainSet = np.array(x_train)
trainLabel =  np.array(y_train)

corpus = list(trainSet)

# Thanks http://stackoverflow.com/questions/25346058/removing-list-of-words-from-a-string
# Extra words that do nothing to distinguish topics. Currently unused.
# Deliberately NOT named `stopwords`: that name is the nltk corpus module
# imported at the top of the notebook, and rebinding it breaks a re-run of
# the preprocessing cell (`stopwords.words('english')`).
headline_stopwords = ['the', 'a', 'at', 'of', '...', ':', ',', 'in', 'for', 'with', 'to', 'on', 'and', 'is', ';', '-', '–', '\'', '"', '|', 
             'are', '&', 'this', 'about', 'from', 'be', 'as', 'by', 'up', 'what', 'will', 'how', 'that', 'you', 'it', 'why', 'after']

# One single-element label list per document, e.g. 1.0 -> ['1'].
# Built as a real list (a `map` object can only be consumed once), and the
# whole string is wrapped instead of list(str(...)), which would split a
# multi-character label into separate characters.
labels = [[str(int(value))] for value in trainLabel]

# '0' for real and '1' for fake
labelset = ['0', '1']

# K is forwarded to LLDA for interface compatibility only: the model derives
# its topic count from the label set (the two labels plus a shared "common" topic).
K = 2

# Alpha and Beta: controlling similarity between topics and words within a topic
# explained : https://www.youtube.com/watch?v=3mHy4OSyRf0
# as in the llda_nltk example
alpha = 0.0001
beta = 0.0001
iterations = 5

# The sampler draws from numpy's global RNG; seed it so runs are repeatable.
np.random.seed(0)

# Instantiate an LLDA object
llda = LLDA(K, alpha, beta)
llda.set_corpus(labelset, corpus, labels)

# Topic names in topic-index order, read from the model itself rather than
# relying on set_corpus having modified `labelset` in place.
topic_names = sorted(llda.labelmap, key=llda.labelmap.get)


def print_top_words(model, names, top_n):
    """Print the `top_n` most probable words of every topic and return phi."""
    topic_word = model.phi()
    for k, label in enumerate(names):
        print ("\n-- label %d : %s" % (k, label))
        for w in np.argsort(-topic_word[k])[:top_n]:
            print ("%s: %.4f" % (model.vocas[w], topic_word[k,w]))
    return topic_word


# lower perplexity in each step is good
for i in range(iterations):
    sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity()))
    llda.inference()

    phi = print_top_words(llda, topic_names, 20)
print ("perplexity : %.4f" % llda.perplexity() )

# This prints out the 40 most likely words in each topic
phi = print_top_words(llda, topic_names, 40)


-- 0 : 3386.6958



-- label 0 : common
climat: 0.0083
chang: 0.0060
trump: 0.0058
said: 0.0053
year: 0.0051
would: 0.0051
one: 0.0044
like: 0.0041
state: 0.0039
peopl: 0.0039
new: 0.0038
say: 0.0037
time: 0.0034
also: 0.0033
presid: 0.0032
us: 0.0030
nation: 0.0028
use: 0.0028
even: 0.0028
world: 0.0027

-- label 1 : 0
trump: 0.0111
said: 0.0084
say: 0.0053
one: 0.0049
would: 0.0048
peopl: 0.0047
like: 0.0043
state: 0.0040
presid: 0.0038
year: 0.0036
new: 0.0033
time: 0.0031
clinton: 0.0031
also: 0.0030
polit: 0.0028
make: 0.0027
could: 0.0027
climat: 0.0027
obama: 0.0026
go: 0.0026

-- label 2 : 1
climat: 0.0264
chang: 0.0116
warm: 0.0084
global: 0.0074
temperatur: 0.0072
energi: 0.0070
carbon: 0.0059
emiss: 0.0053
year: 0.0047
increas: 0.0044
would: 0.0042
scienc: 0.0041
new: 0.0037
polici: 0.0036
govern: 0.0036
us: 0.0036
dioxid: 0.0036
report: 0.0036
per: 0.0034
human: 0.0034


-- 1 : 3290.8821



-- label 0 : common
climat: 0.0078
chang: 0.0058
would: 0.0054
said: 0.0053
year: 0.0050
trump: 0.0048
one: 0.0047
like: 0.0043
state: 0.0043
new: 0.0041
peopl: 0.0039
say: 0.0037
time: 0.0036
also: 0.0033
us: 0.0032
nation: 0.0031
presid: 0.0031
world: 0.0029
even: 0.0028
use: 0.0028

-- label 1 : 0
trump: 0.0127
said: 0.0088
say: 0.0055
peopl: 0.0048
one: 0.0047
would: 0.0045
like: 0.0043
presid: 0.0041
clinton: 0.0037
state: 0.0036
year: 0.0035
also: 0.0031
new: 0.0031
time: 0.0030
obama: 0.0029
polit: 0.0029
get: 0.0027
go: 0.0027
make: 0.0027
could: 0.0026

-- label 2 : 1
climat: 0.0283
chang: 0.0126
warm: 0.0090
temperatur: 0.0081
global: 0.0079
energi: 0.0073
carbon: 0.0063
emiss: 0.0057
year: 0.0053
increas: 0.0049
scienc: 0.0045
would: 0.0041
report: 0.0040
dioxid: 0.0040
use: 0.0038
polici: 0.0038
human: 0.0037
govern: 0.0037
new: 0.0036
model: 0.0036


-- 2 : 3246.4620



-- label 0 : common
climat: 0.0081
chang: 0.0062
would: 0.0052
year: 0.0051
said: 0.0051
one: 0.0046
state: 0.0044
like: 0.0043
new: 0.0043
peopl: 0.0040
trump: 0.0040
say: 0.0036
time: 0.0034
us: 0.0033
also: 0.0033
world: 0.0031
nation: 0.0030
presid: 0.0030
even: 0.0028
make: 0.0028

-- label 1 : 0
trump: 0.0138
said: 0.0091
say: 0.0058
one: 0.0048
peopl: 0.0048
would: 0.0046
presid: 0.0044
like: 0.0043
clinton: 0.0041
state: 0.0035
time: 0.0032
year: 0.0032
obama: 0.0032
also: 0.0031
polit: 0.0030
new: 0.0028
republican: 0.0028
go: 0.0028
get: 0.0027
hous: 0.0027

-- label 2 : 1
climat: 0.0285
chang: 0.0128
warm: 0.0093
temperatur: 0.0083
global: 0.0081
energi: 0.0073
carbon: 0.0066
emiss: 0.0058
year: 0.0054
increas: 0.0049
scienc: 0.0046
would: 0.0042
report: 0.0042
dioxid: 0.0041
use: 0.0039
polici: 0.0038
human: 0.0037
model: 0.0037
us: 0.0037
per: 0.0037


-- 3 : 3211.8618



-- label 0 : common
climat: 0.0087
chang: 0.0066
would: 0.0053
said: 0.0052
year: 0.0051
state: 0.0045
one: 0.0044
like: 0.0043
new: 0.0043
peopl: 0.0038
us: 0.0036
say: 0.0035
time: 0.0034
world: 0.0034
trump: 0.0033
also: 0.0031
nation: 0.0030
govern: 0.0029
presid: 0.0028
global: 0.0028

-- label 1 : 0
trump: 0.0147
said: 0.0090
say: 0.0059
one: 0.0050
peopl: 0.0050
presid: 0.0046
would: 0.0045
like: 0.0042
clinton: 0.0042
state: 0.0033
obama: 0.0032
also: 0.0031
polit: 0.0031
time: 0.0031
year: 0.0031
republican: 0.0029
go: 0.0029
hous: 0.0028
new: 0.0028
get: 0.0027

-- label 2 : 1
climat: 0.0285
chang: 0.0127
warm: 0.0094
temperatur: 0.0085
global: 0.0081
energi: 0.0073
carbon: 0.0067
emiss: 0.0059
year: 0.0057
increas: 0.0049
report: 0.0046
scienc: 0.0046
would: 0.0044
dioxid: 0.0041
use: 0.0038
polici: 0.0038
model: 0.0038
record: 0.0038
human: 0.0037
claim: 0.0037


-- 4 : 3185.3085



-- label 0 : common
climat: 0.0092
chang: 0.0068
would: 0.0055
year: 0.0052
said: 0.0051
state: 0.0047
new: 0.0044
one: 0.0043
like: 0.0040
peopl: 0.0038
world: 0.0037
us: 0.0037
time: 0.0034
say: 0.0032
govern: 0.0031
trump: 0.0030
global: 0.0030
also: 0.0030
nation: 0.0029
polici: 0.0028

-- label 1 : 0
trump: 0.0151
said: 0.0090
say: 0.0061
one: 0.0051
peopl: 0.0049
presid: 0.0048
like: 0.0046
clinton: 0.0043
would: 0.0042
obama: 0.0033
also: 0.0033
time: 0.0032
republican: 0.0031
polit: 0.0030
state: 0.0030
hous: 0.0029
campaign: 0.0029
year: 0.0029
work: 0.0028
go: 0.0028

-- label 2 : 1
climat: 0.0283
chang: 0.0127
warm: 0.0096
temperatur: 0.0087
global: 0.0080
energi: 0.0073
carbon: 0.0068
emiss: 0.0060
year: 0.0058
increas: 0.0048
report: 0.0047
scienc: 0.0045
dioxid: 0.0043
would: 0.0043
use: 0.0039
model: 0.0038
claim: 0.0038
record: 0.0038
per: 0.0037
polici: 0.0037
perplexity : 3166.9508

-- label 0 : common
climat: 0.0092
chang: 0.0068
would: 0.0055
year: 0.0052
said: 0.0

"\nimport shelve\nfilename = './SavedModels/llda_prepended_hostnames_5iter.dat'\nmy_shelf = shelve.open(filename, 'n')\n\nmy_shelf['llda'] = globals()['llda']\nmy_shelf['trainSet'] = globals()['trainSet']\nmy_shelf['testSet'] = globals()['testSet']\n\nmy_shelf.close()\n"

In [None]:
# Tokenised evaluation documents as a plain list; no further trimming is
# applied, so both names refer to the same list.
testCorpus = list(testSet)
trimmedTestCorpus = testCorpus

# Topic-word probabilities of the trained model.
phi = llda.phi()

# Running count of correct predictions.
totalPredictionsCorrect = 0

# One predicted and one true class per evaluation document.
pred_classes = np.zeros([testSet.shape[0]])
true_classes = np.float64(np.reshape(testLabel, [testSet.shape[0]]))

In [None]:
# for dev data (validation)
# Reset here so that re-running this cell does not keep adding to an old count.
totalPredictionsCorrect = 0

for i in range(len(testSet)):
    # Each column is a one-hot entry indicating the existence of a particular word from the vocab in the dataset
    oneHotVector = np.zeros([1, len(llda.vocas)])
    for token in trimmedTestCorpus[i]:
        # llda.vocas_id maps term -> position in llda.vocas, so this is an
        # O(1) lookup; `in llda.vocas` / `llda.vocas.index(...)` scan the whole
        # vocabulary list for every token.
        word_id = llda.vocas_id.get(token)
        if word_id is not None:
            oneHotVector[0, word_id] = 1

    # Dot-product that one-hot vector with the phi vector and see what the most likely topic is
    # (row 0 of phi is the shared "common" topic; rows 1 and 2 are labels '0' and '1').
    dotProductReal = np.dot(oneHotVector[0, :], phi[1, :])
    dotProductFake = np.dot(oneHotVector[0, :], phi[2, :])

    print("0: ", dotProductReal,"1: ", dotProductFake, 'label: ', testLabel[i])
    # Class 1 is predicted only when its score wins AND exceeds 0.1.
    if dotProductReal < dotProductFake and dotProductFake>0.1 :
        expected = 1.0
    else:
        expected = 0.0

    pred_classes[i] = expected

    if expected == testLabel[i]:
        totalPredictionsCorrect = totalPredictionsCorrect + 1

    if i % 10 == 0 and i > 0:
        print("i = " + str(i) + ", " + str(100.*i/len(testSet)) + "% done")

#print("Testing Accuracy: " + str((100.*totalPredictionsCorrect)/len(testSet)))
#print("Matthews Correlation Coeff: " + str(matthews_corrcoef(y_true=true_classes, y_pred=pred_classes)))
print('done')

0:  0.07166381530781253 1:  0.09726452735205175 label:  -1
0:  0.16982709122789905 1:  0.0724875048984593 label:  -1
0:  0.1609432065312889 1:  0.12158975400964198 label:  -1
0:  0.08852082877751154 1:  0.0645523178031501 label:  -1
0:  0.062013034060428254 1:  0.14268834044939072 label:  -1
0:  0.15480556163339906 1:  0.09384829282320577 label:  -1
0:  0.07938497915443109 1:  0.03734410665845324 label:  -1
0:  0.09502942328516847 1:  0.06183646425303698 label:  -1
0:  0.16004347591085114 1:  0.15143248511292642 label:  -1
0:  0.08190703142304509 1:  0.056047777943513385 label:  -1
0:  0.10411821998765398 1:  0.05154699038828512 label:  -1
i = 10, 0.7092198581560284% done
0:  0.08320542925135987 1:  0.03424868083305151 label:  -1
0:  0.13855758341444352 1:  0.08893623708610068 label:  -1
0:  0.1876420231718467 1:  0.1583102139604155 label:  -1
0:  0.04171484064303213 1:  0.02707267127617511 label:  -1
0:  0.0744156603623152 1:  0.22139379042155025 label:  -1
0:  0.22579387510060164 1: 

KeyboardInterrupt: ignored

In [None]:
# Predict labels for the test set (used for the submission file)

In [None]:
for i in range(len(testSet)):
    # One-hot vector over the model vocabulary: 1 where the document contains the word.
    oneHotVector = np.zeros([1, len(llda.vocas)])
    for token in trimmedTestCorpus[i]:
        # llda.vocas_id maps term -> position in llda.vocas, so this is an
        # O(1) lookup; `in llda.vocas` / `llda.vocas.index(...)` scan the whole
        # vocabulary list for every token.
        word_id = llda.vocas_id.get(token)
        if word_id is not None:
            oneHotVector[0, word_id] = 1

    # Row 0 of phi is the shared "common" topic; rows 1 and 2 are labels '0' and '1'.
    dotProductReal = np.dot(oneHotVector[0, :], phi[1, :])
    dotProductFake = np.dot(oneHotVector[0, :], phi[2, :])

    # Predict whichever label topic scores higher (ties go to class 0).
    if dotProductReal < dotProductFake:
        expected = 1.0
    else:
        expected = 0.0

    pred_classes[i] = expected

    if i % 10 == 0 and i > 0:
        print("i = " + str(i) + ", " + str(100.*i/len(testSet)) + "% done")

#print("Testing Accuracy: " + str((100.*totalPredictionsCorrect)/len(testSet)))
#print("Matthews Correlation Coeff: " + str(matthews_corrcoef(y_true=true_classes, y_pred=pred_classes)))
print('done')

i = 10, 0.7092198581560284% done
i = 20, 1.4184397163120568% done
i = 30, 2.127659574468085% done
i = 40, 2.8368794326241136% done
i = 50, 3.5460992907801416% done
i = 60, 4.25531914893617% done
i = 70, 4.964539007092198% done
i = 80, 5.673758865248227% done
i = 90, 6.382978723404255% done
i = 100, 7.092198581560283% done
i = 110, 7.801418439716312% done
i = 120, 8.51063829787234% done
i = 130, 9.21985815602837% done
i = 140, 9.929078014184396% done
i = 150, 10.638297872340425% done
i = 160, 11.347517730496454% done
i = 170, 12.056737588652481% done
i = 180, 12.76595744680851% done
i = 190, 13.47517730496454% done
i = 200, 14.184397163120567% done
i = 210, 14.893617021276595% done
i = 220, 15.602836879432624% done
i = 230, 16.31205673758865% done
i = 240, 17.02127659574468% done
i = 250, 17.73049645390071% done
i = 260, 18.43971631205674% done
i = 270, 19.148936170212767% done
i = 280, 19.858156028368793% done
i = 290, 20.56737588652482% done
i = 300, 21.27659574468085% done
i = 310, 2

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the true labels.
result = classification_report(y_true=true_classes, y_pred=pred_classes)
print ('\n clasification report:\n', result)

In [None]:
## submission

In [None]:
# Turn the float predictions into plain ints and put them in a one-column frame.
pred_classes = list(map(int, pred_classes))
result = np.array(pred_classes)
submission_df = pd.DataFrame({'prediction': result})
print(submission_df)

      prediction
0              1
1              0
2              0
3              0
4              1
...          ...
1405           1
1406           0
1407           1
1408           1
1409           0

[1410 rows x 1 columns]


In [None]:
import json
from google.colab import files
#{"test-0": {"label": 0}, "test-1": {"label": 0}, ... ,}
# Target layout: {"test-0": {"label": 0}, "test-1": {"label": 0}, ...}
df = submission_df['prediction']
data = {
    'test-' + str(idx): {"label": prediction}
    for idx, prediction in enumerate(df)
}

jsonfilename = 'test-output.json'
with open(jsonfilename, 'w') as jsonFile:
    json.dump(data, jsonFile)

# Offer the written file as a browser download (Colab helper).
files.download(jsonfilename)