In [1]:
import pandas as pd
import re
import os
import json
import pickle
from datetime import datetime
from collections import Counter, defaultdict
from tqdm import tqdm

# Importing libraries you need to install
import pandas as pd
import numpy as np

# Import yfinance and pandas_datareader
from pandas_datareader import data as pdr
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# import yfinance as yf 

# Override function to store data we get
# yf.pdr_override()

# Import nltk for first step extracting words
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the nltk resources used below (stop-word list, WordNet, punkt tokenizer)
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# English stop words plus boilerplate terms that occur in every filing
stop_words = set(stopwords.words('english'))
stop_words.update({'10-k', 'form', 'table', 'contents', 'united', 'states', 'securities', 'exchange', 'commission'})

lemmatizer = WordNetLemmatizer()

"""
this is where different from version 1
"""
#import libraries for n-gram counting
from sklearn.feature_extraction.text import CountVectorizer

from utils.crawler import remove_punct, is_words

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luckywang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/luckywang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/luckywang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## The raw filing texts are stored at:
'./data/10k/[cik]/rawtext/[cik]_[date]'

## We build a dictionary to hold all the data we need

In [2]:
# Library of all tickers
tickers_csv_path = os.path.join("data", "tickers.csv")
ticker_library = pd.read_csv(tickers_csv_path)

# S&P 500 components; the file has no header row, so the columns are named here:
# 'name' is the company name and 'ticker' is the company's ticker
sp500_csv_path = os.path.join("data", "SP500_component_stocks.csv")
ticker_selected = pd.read_csv(sp500_csv_path, header=None)
ticker_selected.columns = ['name', 'ticker']


  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Build ticker_cik_df, a table pairing each selected ticker with its CIK number.

# All tickers of the selected S&P 500 components
ticker_list = ticker_selected.ticker

# Index the ticker library once instead of scanning it for every ticker.
# Only the first library row of a ticker is used; the CIK is the last 10
# characters of its `secfilings` value. A non-string value (e.g. a missing
# cell) yields an empty CIK rather than an exception.
cik_by_ticker = {}
for library_ticker, secfilings in zip(ticker_library.ticker, ticker_library.secfilings):
    if library_ticker in cik_by_ticker:
        continue
    cik_by_ticker[library_ticker] = secfilings[-10:] if isinstance(secfilings, str) else ''

# One CIK per selected ticker; '' marks tickers that could not be resolved
cik_list = [
    cik_by_ticker.get(ticker, '') if isinstance(ticker, str) else ''
    for ticker in ticker_list
]

ticker_cik_df = pd.DataFrame({'cik': cik_list, 'ticker': list(ticker_list)})

# Drop the tickers with an empty CIK number
ticker_cik_df = ticker_cik_df[ticker_cik_df['cik'] != '']

# Display a sample of ticker_cik_df
ticker_cik_df.head()


Unnamed: 0,cik,ticker
0,1090872,A
1,6201,AAL
2,1158449,AAP
3,320193,AAPL
4,1551152,ABBV


In [4]:
# Reverse lookup: CIK number -> ticker
CIK2TICKER = dict(zip(ticker_cik_df["cik"], ticker_cik_df["ticker"]))

In [5]:
# Hand-picked sample of tickers to analyse
listtickers = ['AMZN','BBY','BKNG','MCD','EBAY','F','HD','TGT','WHR','JPM','SIVB','CFG','C','ALL','IVZ','ETFC','MET','PFG','CBOE',
              'CTL','IPG','VIAC','NFLX','CHTR','FB','TWTR','NWSA','FOXA','AMD','INTC','AAPL','LRCX','MSFT','NLOK','CTSH','ADS',
              'WU','PAYC','ABT','CVS','PFE','JNJ','BIIB','INCY','HSIC','WAT','ALGN','EW']

# Keep the rows of the sampled tickers, in the order of `listtickers`.
# DataFrame.append no longer exists in pandas 2.x (and re-copied the frame on
# every iteration), so the per-ticker slices are collected first and
# concatenated once. Tickers that have no row in ticker_cik_df contribute nothing.
ticker_matches = [ticker_cik_df[ticker_cik_df['ticker'] == ticker] for ticker in listtickers]
ticker_matches = [match for match in ticker_matches if not match.empty]

# With no match at all, fall back to an empty frame that still has the columns
ticker_cik_sample = pd.concat(ticker_matches) if ticker_matches else ticker_cik_df.iloc[0:0]
    

In [6]:
# Narrow the working lists to the sampled companies. This rebinds the
# cik_list / ticker_list names that an earlier cell created for the full
# set of selected tickers.
cik_list = ticker_cik_sample["cik"].values
ticker_list = ticker_cik_sample["ticker"].values

In [7]:
# Skeleton of the main data store, keyed by ticker. Every entry holds the
# company's CIK and two (initially empty) dicts for its 10-K and 10-Q filings.
all_data = {
    ticker: {'cik': cik, '10ks': {}, '10qs': {}}
    for cik, ticker in zip(cik_list, ticker_list)
}


In [8]:
all_data['AMZN']

{'cik': '0001018724', '10ks': {}, '10qs': {}}

In [9]:
# Root directories of the downloaded 10-K and 10-Q filings.
# NOTE(review): no visible cell reads these two names; aggregate_cik_texts
# builds its paths from "data" and the filetype directly — confirm they are still needed.
dir_10k = './data/10k/'
dir_10q = './data/10q/'

In [10]:
# Sanity check: both arrays are taken from ticker_cik_sample, so the two
# lengths should be identical
print(len(cik_list))
print(len(ticker_list))

45
45


In [11]:
def aggregate_cik_texts(cik, filetype):
    """
    Collect all the raw texts of `cik` for the given `filetype`, preprocess
    them and return them as a single space-separated string together with
    the token counts of that string.

    The result is cached as two pickles in data/<filetype>/<cik>/pickle.
    When both cache files exist they are loaded and returned without reading
    the raw texts again.

    Parameters
    ----------
    cik : str
        Name of the company directory under data/<filetype>/.
    filetype : str
        Name of the filing directory under data/ (the caller uses "10k" and "10q").

    Returns
    -------
    dict
        {"texts": str, "counter": collections.Counter}. When the rawtext
        directory does not exist, "No such dir" is printed and an empty
        result is returned without writing a cache.
    """
    cik_dir = os.path.join("data", filetype, cik)
    pkl_path = os.path.join(cik_dir, "pickle")
    texts_pkl = os.path.join(pkl_path, 'agg_texts.pkl')
    counter_pkl = os.path.join(pkl_path, 'token_counter.pkl')

    # Use the cache only when it is complete: testing the directory alone
    # breaks after a run that was interrupted before the pickles were written.
    if os.path.isfile(texts_pkl) and os.path.isfile(counter_pkl):
        # NOTE: pickle.load can execute arbitrary code; these files must only
        # ever be caches written by this function.
        with open(texts_pkl, 'rb') as f:
            texts = pickle.load(f)
        with open(counter_pkl, 'rb') as f:
            counter = pickle.load(f)
        return {"texts": texts, "counter": counter}

    rawtext_dir = os.path.join(cik_dir, "rawtext")
    try:
        # Sorted so that the concatenation order does not depend on the file system
        all_files = sorted(os.listdir(rawtext_dir))
    except (FileNotFoundError, NotADirectoryError):
        print("No such dir")
        return {"texts": "", "counter": Counter()}

    processed_docs = []
    for file in all_files:
        with open(os.path.join(rawtext_dir, file), encoding="utf8") as f:
            processed_docs.append(preprocess(f.read().lower()))

    # Join with a space: appending the documents back to back would fuse the
    # last token of one filing with the first token of the next.
    texts = remove_punct(' '.join(processed_docs))
    counter = texts2counter(texts)

    # Create the cache directory only now, so a failure above never leaves an
    # empty cache directory behind.
    os.makedirs(pkl_path, exist_ok=True)

    with open(texts_pkl, 'wb') as f:
        # Pickle the aggregated text using the highest protocol available.
        print(texts_pkl)
        pickle.dump(texts, f, pickle.HIGHEST_PROTOCOL)

    with open(counter_pkl, 'wb') as f:
        # Pickle the token counter using the highest protocol available.
        print(counter_pkl)
        pickle.dump(counter, f, pickle.HIGHEST_PROTOCOL)

    return {"texts": texts, "counter": counter}


In [12]:
def preprocess(texts):
    """
    Tokenize `texts` with nltk, discard stop words and every token rejected
    by `is_words`, reduce the remaining tokens to their Porter stem and
    return them joined by single spaces.
    """
    stemmer = PorterStemmer()

    stemmed_tokens = []
    for token in nltk.word_tokenize(texts):
        # Skip stop words first; is_words is only consulted for the rest
        if token in stop_words or not is_words(token):
            continue
        stemmed_tokens.append(stemmer.stem(token))

    return ' '.join(stemmed_tokens)
    

In [13]:
def texts2counter(texts):
    """
    Count how often every whitespace-separated token occurs in `texts`.

    Splitting on arbitrary whitespace (instead of a single ' ') keeps empty
    strings out of the result when `texts` is empty or contains repeated,
    leading or trailing spaces.

    Parameters
    ----------
    texts : str

    Returns
    -------
    collections.Counter
        Mapping token -> number of occurrences.
    """
    return Counter(texts.split())
    

In [14]:
# Timestamp of this run (month-day-hour_minute_second)
now = datetime.now()
date_time = f"{now:%m-%d-%H_%M_%S}"

print("date and time:", date_time)

date and time: 10-06-01_38_45


In [15]:
def get_texts(cik_list, ticker_list):
    """
    Build one aggregated document per company from its 10-K and 10-Q texts.

    For every (cik, ticker) pair the texts returned by `aggregate_cik_texts`
    for "10k" and "10q" are joined into a single string and its tokens are
    counted. All documents and counters are also pickled into
    data/<timestamp>/, where the timestamp is formatted from the
    module-level `now`, which must be defined before this function is called.

    Parameters
    ----------
    cik_list : iterable of str
    ticker_list : iterable of str
        Paired element-wise with `cik_list`.

    Returns
    -------
    dict
        {"docs": list of str, "tickers": list, "counters": {ticker: Counter}}
    """
    # Raw texts live in './data/10k/[cik]/rawtext/[cik]_[date]'
    docs = []
    tickers = []
    counters = dict()   # {ticker: counter}

    # Materialise the pairs so the progress bar knows its total
    cik_ticker_pairs = list(zip(cik_list, ticker_list))

    for cik, ticker in tqdm(cik_ticker_pairs):
        tickers.append(ticker)

        parts = [aggregate_cik_texts(cik, filetype)["texts"] for filetype in ["10k", "10q"]]
        # Join with a space: appending the 10-Q text directly to the 10-K text
        # would fuse the two tokens at the boundary. Empty parts are skipped.
        texts = ' '.join(part for part in parts if part)

        counters[ticker] = texts2counter(texts)
        docs.append(texts)

    date_time = now.strftime("%m-%d-%H_%M_%S")
    cache_path = os.path.join("data", date_time)
    os.makedirs(cache_path, exist_ok=True)

    with open(os.path.join(cache_path, 'agg_counters.pkl'), 'wb') as f:
        # Pickle the per-ticker counters using the highest protocol available.
        pickle.dump(counters, f, pickle.HIGHEST_PROTOCOL)

    with open(os.path.join(cache_path, 'agg_texts.pkl'), 'wb') as f:
        # Pickle the list of documents using the highest protocol available.
        pickle.dump(docs, f, pickle.HIGHEST_PROTOCOL)

    return {"docs": docs, "tickers": tickers, "counters": counters}


In [16]:
# Build (or load from the per-company caches) the aggregated documents.
# NOTE(review): only the first two companies are passed in, while later cells
# show 45 documents and write a "45_companies_..." CSV — confirm whether the
# [:2] slices are a leftover from a quick test run.
ret = get_texts(cik_list[:2], ticker_list[:2])

2it [00:26, 13.28s/it]

data/10k/0000764478/pickle/agg_texts.pkl
data/10k/0000764478/pickle/token_counter.pkl





In [17]:
# One aggregated, preprocessed string per company, in the order the tickers were processed
docs = ret["docs"]

In [66]:
# Bag-of-words counts over all company documents.
# max_df=0.8 ignores terms found in more than 80% of the documents and
# max_features keeps at most the 10,000 most frequent remaining terms.
# stop_words is passed as a list because recent scikit-learn releases reject a
# set for this parameter; sorting keeps the argument reproducible.
cv = CountVectorizer(max_df=0.8, stop_words=sorted(stop_words), max_features=10000)
word_count_vector = cv.fit_transform(docs)



In [50]:
def sort_coo(coo_matrix):
    """
    Return the (column index, value) pairs of `coo_matrix` ordered from the
    highest value to the lowest; equal values are ordered by descending
    column index.
    """
    col_value_pairs = list(zip(coo_matrix.col, coo_matrix.data))
    col_value_pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return col_value_pairs


In [51]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """
    Map the feature names of the first `topn` entries of `sorted_items` to
    their scores, rounded to 3 decimals.

    `sorted_items` is a sequence of (feature index, score) pairs and
    `feature_names` is indexed by those feature indices.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }


In [67]:
# Learn the IDF weights from the corpus-wide count matrix. Individual documents
# are transformed later, one at a time, in the keyword-extraction cell.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [68]:
len(docs)

45

In [69]:
# Top-30 TF-IDF keywords of every document.

# The index -> term mapping is the same for every document, so it is looked up
# once. get_feature_names() no longer exists in scikit-learn >= 1.2; the old
# name is only used as a fallback for installations that predate
# get_feature_names_out().
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

keywords = []

for doc in docs:
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

    # (term index, score) pairs, highest score first
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # keep only the 30 best-scoring terms of this document
    keywords.append(extract_topn_from_vector(feature_names, sorted_items, 30))
    

In [70]:
# Column store for the keyword table: one list per output column.
dict_top_k = defaultdict(list)

# The table is indexed by "tickers" further down, so that column has to exist.
# It comes from the same get_texts() result as the documents, which keeps the
# tickers aligned with the keyword rows.
dict_top_k["tickers"] = list(ret["tickers"])

In [71]:
# Spread each document's ranked keywords over the word_<rank> / tfidf_<rank> columns
for keyword in keywords:
    for rank, (word, score) in enumerate(keyword.items()):
        dict_top_k["word_{}".format(rank)].append(word)
        dict_top_k["tfidf_{}".format(rank)].append(score)

In [72]:
len(keywords)

45

In [73]:
# One row per document; the word_<rank> / tfidf_<rank> columns hold the ranked
# keywords and their scores
df_top_k_word = pd.DataFrame(dict_top_k)

In [74]:
# Index the keyword table by ticker. The guard keeps the cell re-runnable:
# once "tickers" is the index it is no longer a column, and a second
# set_index call would raise a KeyError.
if df_top_k_word.index.name != "tickers":
    df_top_k_word = df_top_k_word.set_index("tickers")

# Show only the keyword columns. '^word_' anchors the match to the column
# prefix; the previous pattern 'word*' meant "wor" followed by any number of
# "d" anywhere in the name.
df_top_k_word.filter(regex=r'^word_', axis=1)

Unnamed: 0_level_0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,...,word_20,word_21,word_22,word_23,word_24,word_25,word_26,word_27,word_28,word_29
tickers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMZN,amazoncom,peac,amazon,ship,seller,bezo,equitymethod,merchandis,ecommerc,aw,...,shipment,wwwamazond,wwwamazoncouk,unearn,wwwamazoncojp,pledg,jeffrey,longzon,wrongdo,kindl
BBY,sga,musicland,merchandis,applianc,magnolia,squad,geek,shop,ar,ebitdar,...,tradenam,phone,remodel,notebook,speakeasi,entertain,televis,canadian,auctionr,auction
BKNG,hotel,pricelinecom,bookingcom,airlin,ticket,hotelscom,kayak,car,merchant,pricedisclos,...,expedia,rentalcarscom,braddock,schulman,pricelin,nca,ctrip,agodacom,googl,walker
MCD,restaur,companyoper,mcdonald,franchise,franchis,apmea,systemwid,development,menu,refranchis,...,nm,japan,chicken,convent,nutrit,breakfast,omnibu,commod,occup,eat
EBAY,paypal,ebay,seller,skype,gsi,merchant,ticket,card,stubhub,rolex,...,counterfeit,processor,client,copyright,meritori,shop,butterfield,billpoint,launder,licensur
F,ford,automot,motor,incomeloss,securit,volvo,mazda,dealer,truck,fce,...,veba,jaguar,pag,fuel,statementsnot,nonconsum,warranti,commod,assetback,ghg
HD,depot,merchandis,hd,sga,expo,card,lumber,assort,carol,omnibu,...,rdc,remodel,ferri,blake,menear,atlanta,mexico,kpmg,contentsth,floor
TGT,card,guest,comparablestor,redcard,merchandis,sga,ebit,scovann,remodel,jpmc,...,ep,supertarget,visa,trc,marshal,minnesota,periodend,dougla,gift,linkbas
WHR,whirlpool,befiex,brazilian,indesit,maytag,embraco,applianc,sundri,compressor,pricemix,...,warranti,refriger,forwardsopt,oilrel,monet,alno,hotpoint,amana,kitchenaid,raw
JPM,jpmorgan,lendingrel,securit,card,chargeoff,var,msr,noninterest,pci,client,...,nm,lend,af,creditimpair,multisel,tier,spe,cb,heldforsal,conduit


In [75]:
# Persist the keyword table (ticker index included) for later analysis
df_top_k_word.to_csv("data/45_companies_top_30_stemming.csv")

## Analyze 10-K words with TF-IDF and bag-of-words

In [19]:
"""
This part is to compute the tf values for the words in 10ks and collect the overall word list for computing idf in the next step
"""
# Number of 10-K documents seen
document_num_10k = 0

# For every word: the number of 10-K documents that contain it
word_list_10k = defaultdict(int)

for ticker_entry in all_data.values():
    for filing in ticker_entry['10ks'].values():

        # every stored date is one document
        document_num_10k += 1

        # term frequencies of this filing
        term_counts = Counter(filing['words'].split())
        filing['tf'] = term_counts

        # each distinct word of the filing adds one to its document frequency
        for word in term_counts:
            word_list_10k[word] += 1
        

In [20]:
# IDF of every word over the 10-K documents: log(N / (1 + document frequency))
idf_10k = {
    word: np.log(document_num_10k / (1 + doc_freq))
    for word, doc_freq in word_list_10k.items()
}

In [21]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

# Word-pair (bigram) counts for the 10-K documents.
# document_num_10k was already computed while counting the single words.

# For every word pair: the number of 10-K documents that contain it
pair_list_10k = defaultdict(int)

for ticker in all_data:
    for date in all_data[ticker]['10ks']:
        filing = all_data[ticker]['10ks'][date]

        # count the bigrams of this single filing
        vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
        fitted = vectorizer.fit_transform([filing['words']])
        counts = fitted.toarray().ravel()

        # vocabulary_ maps every n-gram to its column in `fitted`. Reading the
        # counts through it avoids get_feature_names(), which no longer exists
        # in scikit-learn >= 1.2, and also works when a filing has a single
        # n-gram (np.squeeze would collapse that case to a 0-d array).
        # int() keeps the values JSON-serialisable.
        filing['tf_pair'] = {
            pair: int(counts[col])
            for pair, col in sorted(vectorizer.vocabulary_.items())
        }

        # each distinct pair of the filing adds one to its document frequency
        for pair in filing['tf_pair']:
            pair_list_10k[pair] += 1
        

In [22]:
pair_list_10k

defaultdict(int, {})

In [23]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

# IDF of every word pair over the 10-K documents: log(N / (1 + document frequency))
idf_10k_pair = {
    pair: np.log(document_num_10k / (1 + doc_freq))
    for pair, doc_freq in pair_list_10k.items()
}

In [24]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

# Word-triple (trigram) counts for the 10-K documents.
# document_num_10k was already computed while counting the single words.

# For every word triple: the number of 10-K documents that contain it
triple_list_10k = defaultdict(int)

for ticker in all_data:
    for date in all_data[ticker]['10ks']:
        filing = all_data[ticker]['10ks'][date]

        # count the trigrams of this single filing
        vectorizer = CountVectorizer(analyzer='word', ngram_range=(3, 3))
        fitted = vectorizer.fit_transform([filing['words']])
        counts = fitted.toarray().ravel()

        # vocabulary_ maps every n-gram to its column in `fitted`. Reading the
        # counts through it avoids get_feature_names(), which no longer exists
        # in scikit-learn >= 1.2, and also works when a filing has a single
        # n-gram (np.squeeze would collapse that case to a 0-d array).
        # int() keeps the values JSON-serialisable.
        filing['tf_triple'] = {
            triple: int(counts[col])
            for triple, col in sorted(vectorizer.vocabulary_.items())
        }

        # each distinct triple of the filing adds one to its document frequency
        for triple in filing['tf_triple']:
            triple_list_10k[triple] += 1

# IDF of every word triple over the 10-K documents: log(N / (1 + document frequency))
idf_10k_triple = {
    triple: np.log(document_num_10k / (1 + doc_freq))
    for triple, doc_freq in triple_list_10k.items()
}

### Doing the same for the 10-Qs

In [25]:
#also do the same to 10q files

"""
This part is to compute the tf values for the words in 10qs and collect the overall word list for computing idf in the next step
"""
# Number of 10-Q documents seen
document_num_10q = 0

# For every word: the number of 10-Q documents that contain it
word_list_10q = defaultdict(int)

for ticker_entry in all_data.values():
    for filing in ticker_entry['10qs'].values():

        # every stored date is one document
        document_num_10q += 1

        # term frequencies of this filing
        term_counts = Counter(filing['words'].split())
        filing['tf'] = term_counts

        # each distinct word of the filing adds one to its document frequency
        for word in term_counts:
            word_list_10q[word] += 1
               

In [26]:
# IDF of every word over the 10-Q documents: log(N / (1 + document frequency))
idf_10q = {
    word: np.log(document_num_10q / (1 + doc_freq))
    for word, doc_freq in word_list_10q.items()
}
    

In [27]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

# Word-pair (bigram) counts for the 10-Q documents.
# document_num_10q was already computed while counting the single words.

# For every word pair: the number of 10-Q documents that contain it
pair_list_10q = defaultdict(int)

for ticker in all_data:
    for date in all_data[ticker]['10qs']:
        filing = all_data[ticker]['10qs'][date]

        # count the bigrams of this single filing
        vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
        fitted = vectorizer.fit_transform([filing['words']])
        counts = fitted.toarray().ravel()

        # vocabulary_ maps every n-gram to its column in `fitted`. Reading the
        # counts through it avoids get_feature_names(), which no longer exists
        # in scikit-learn >= 1.2, and also works when a filing has a single
        # n-gram (np.squeeze would collapse that case to a 0-d array).
        # int() keeps the values JSON-serialisable.
        filing['tf_pair'] = {
            pair: int(counts[col])
            for pair, col in sorted(vectorizer.vocabulary_.items())
        }

        # each distinct pair of the filing adds one to its document frequency
        for pair in filing['tf_pair']:
            pair_list_10q[pair] += 1

In [28]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

# IDF of every word pair over the 10-Q documents: log(N / (1 + document frequency))
idf_10q_pair = {
    pair: np.log(document_num_10q / (1 + doc_freq))
    for pair, doc_freq in pair_list_10q.items()
}

In [29]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

# Word-triple (trigram) counts for the 10-Q documents.
# document_num_10q was already computed while counting the single words.
# The only difference from the word-pair cell is ngram_range.

# For every word triple: the number of 10-Q documents that contain it
triple_list_10q = defaultdict(int)

for ticker in all_data:
    for date in all_data[ticker]['10qs']:
        filing = all_data[ticker]['10qs'][date]

        # count the trigrams of this single filing
        vectorizer = CountVectorizer(analyzer='word', ngram_range=(3, 3))
        fitted = vectorizer.fit_transform([filing['words']])
        counts = fitted.toarray().ravel()

        # vocabulary_ maps every n-gram to its column in `fitted`. Reading the
        # counts through it avoids get_feature_names(), which no longer exists
        # in scikit-learn >= 1.2, and also works when a filing has a single
        # n-gram (np.squeeze would collapse that case to a 0-d array).
        # int() keeps the values JSON-serialisable.
        filing['tf_triple'] = {
            triple: int(counts[col])
            for triple, col in sorted(vectorizer.vocabulary_.items())
        }

        # each distinct triple of the filing adds one to its document frequency
        for triple in filing['tf_triple']:
            triple_list_10q[triple] += 1

# IDF of every word triple over the 10-Q documents: log(N / (1 + document frequency))
idf_10q_triple = {
    triple: np.log(document_num_10q / (1 + doc_freq))
    for triple, doc_freq in triple_list_10q.items()
}

### have a look at the data structure

In [30]:
idf_10k

{}

In [31]:
"""
This is new in version 2, containing idfs for word pairs
"""

idf_10q_pair

{}

In [32]:
"""
This is new in version 2, containing idfs for word triples
"""

idf_10q_triple

{}

## Store the data for future use 

In [33]:
"""
this part is for storing the data for future use
"""
# Remove the raw text of every filing from all_data before it is written to
# disk. pop() with a default (instead of del) keeps the cell re-runnable: a
# second run finds no 'words' entry and simply does nothing.
for ticker in all_data:
    for form_key in ('10qs', '10ks'):
        for date in all_data[ticker][form_key]:
            all_data[ticker][form_key][date].pop('words', None)


In [34]:
# Write all_data and the single-word IDF tables to one JSON file each
for json_name, payload in (
    ('all_data.json', all_data),
    ('idf_10k.json', idf_10k),
    ('idf_10q.json', idf_10q),
):
    with open(json_name, 'w') as json_file:
        json_file.write(json.dumps(payload))
    

In [35]:
"""
This is new in version 2, storing files for word pairs and word triples
"""
# Write the word-pair and word-triple IDF tables to one JSON file each
for json_name, payload in (
    ('idf_10k_pair.json', idf_10k_pair),
    ('idf_10q_pair.json', idf_10q_pair),
    ('idf_10k_triple.json', idf_10k_triple),
    ('idf_10q_triple.json', idf_10q_triple),
):
    with open(json_name, 'w') as json_file:
        json_file.write(json.dumps(payload))
