In [1]:
import pandas as pd
import numpy as np
from multiprocessing import Pool, Lock
import shutil
import os
import json
import csv
from ast import literal_eval
from file_operations import get_all_files, read_file, copy_file, process_token

In [2]:
# ## Download the fastText model (uncomment to define `model`, which process_file reads)
# import gensim.downloader as api
# model = api.load('fasttext-wiki-news-subwords-300')

In [2]:
# Root folder of the CNN / DailyMail corpus and the story sub-folder of each outlet.
root = "../data/cnn_dailymail/data"
daily_path = "/dailymail/stories/"
cnn_path = "/cnn/stories/"

# Story listings for each outlet, as returned by get_all_files.
daily_docs = get_all_files(f"{root}{daily_path}")
cnn_docs = get_all_files(f"{root}{cnn_path}")
# Disabled alternative that listed a small subset folder instead:
# file_paths = [f for f in os.listdir(root+"/subset") if f.endswith("story")]
# file_paths

In [3]:
# Data folders used by the cells below. Each value keeps its trailing slash
# because later cells append file names to it by plain string concatenation.
tfidf = "../data/tf_idf/"
entity = "../data/entity/"
full_text = tfidf + "full_text/"

## Vectorize Documents

In [44]:
def process_file(file_paths, docs2vec_path=None):
    """Write one averaged word vector per document to a CSV file.

    Every path in ``file_paths`` is read by ``read_file`` in a pool of 8
    worker processes. For each ``(ID, document)`` pair that comes back, the
    vectors of the tokens found in the module-level ``model`` are averaged and
    written as one row: the part of ``ID`` before the first ".", followed by
    the vector components.

    Parameters
    ----------
    file_paths : iterable
        Items handed one by one to ``read_file``.
    docs2vec_path : str, optional
        Destination CSV file. Defaults to ``root + "/cnn_docs.csv"`` (the
        previously hard-coded location), so pass a distinct path when
        vectorizing more than one corpus to avoid overwriting the file.

    Notes
    -----
    ``model`` must already exist in the notebook namespace and support
    ``token in model`` and ``model[token]``. Documents that are empty, or in
    which no token is known to ``model``, produce no row.
    """
    if docs2vec_path is None:
        docs2vec_path = root + "/cnn_docs.csv"

    # Both resources are context-managed so the CSV is closed even if a worker
    # or the vectorization raises. Rows are written only from this (parent)
    # process, so no lock is required around the writer.
    with open(docs2vec_path, 'w', newline='') as file, Pool(processes=8) as pool:
        csv_writer = csv.writer(file)

        # Consume the results as they become available, in input order.
        for ID, document in pool.imap(read_file, file_paths):
            if not document:
                continue

            # Keep only the tokens the embedding model knows.
            vectors = [model[token] for token in document if token in model]
            if not vectors:
                # np.mean of an empty list is a scalar nan, which cannot be
                # expanded into a CSV row; skip such documents instead.
                continue

            # Average all word vectors into a single document vector.
            vector = np.mean(vectors, axis=0)
            ids = ID.split(".")[0]

            csv_writer.writerow([ids] + list(vector))
            
    
    
            
    

In [23]:
# process_file(daily_docs)

In [45]:
# process_file(cnn_docs)

## Copy Label File

In [7]:
# `root` has no trailing slash, so the file name must be joined with a
# separator (plain concatenation produced ".../datalabel.csv").
labels = pd.read_csv(os.path.join(root, "label.csv"))
# Only the "id" column is used below to locate the labelled story files.
label_ids = labels.loc[:,"id"]

In [27]:
def join_path_from_orgin(ids):
    """Return the source path of every story id.

    An id that does not already end in "story" gets ".story" appended. The
    resulting file name is placed under the DailyMail stories folder when it
    appears in the module-level ``daily_docs``, and under the CNN stories
    folder otherwise.

    Parameters
    ----------
    ids : iterable of str
        Story ids or story file names.

    Returns
    -------
    list of str
        One path per id, in input order.
    """
    # Build the lookup once; testing membership against the full listing for
    # every id would rescan it each time if it is a list.
    daily_names = set(daily_docs)

    full_path = []
    for file_name in ids:
        if not file_name.endswith("story"):
            file_name += ".story"
        folder = daily_path if file_name in daily_names else cnn_path
        full_path.append(os.path.join(root + folder, file_name))

    return full_path

def join_path_to_dest(ids, dest):
    """Return the path each story id should have inside ``dest``.

    Ids that do not already end in "story" get ".story" appended before being
    joined onto ``dest``. The result keeps the order of ``ids``.
    """
    return [
        os.path.join(dest, name if name.endswith("story") else name + ".story")
        for name in ids
    ]

In [29]:
def copy_file_docs(label_ids, destination):
    """Copy the story file of every id in ``label_ids`` into ``destination``.

    Source and destination paths are built with ``join_path_from_orgin`` and
    ``join_path_to_dest``; each (source, destination) pair is then handed to
    ``copy_file`` in a pool of 8 worker processes.
    """
    origins = join_path_from_orgin(label_ids)
    targets = join_path_to_dest(label_ids, destination)

    with Pool(processes=8) as workers:
        workers.map(copy_file, zip(origins, targets))

In [30]:
# Copy the story file of every labelled id into the manual-label folder.
copy_file_docs(label_ids, "../data/package_performance/manual_label/label_story/")

## Compute vocabulary

#### 1. full text

In [14]:
def compute_vocal(result, file_paths):
    """Add every token of the given files to ``result`` and return it.

    The files are read with ``read_file`` in a pool of 8 worker processes;
    each call yields an ``(ID, document)`` pair. Empty documents are skipped.
    ``result`` is updated in place and is also the return value.
    """
    with Pool(processes=8) as workers:
        # Results are consumed in the parent process as they become available.
        for _, tokens in workers.imap(read_file, file_paths):
            if tokens:
                result.update(tokens)

    return result
            
            

In [10]:
def store_vocabulary(vocal_set, destination):
    """Save ``vocal_set`` as a JSON list at ``destination``.

    The empty string is removed from ``vocal_set`` itself (not from a copy)
    before the remaining tokens are written as a UTF-8 encoded JSON array.
    """
    vocal_set.discard("")

    # JSON has no set type, so the tokens are serialized as a list.
    with open(destination, mode='w', encoding='utf-8') as out:
        json.dump([*vocal_set], out)

In [53]:
# Start from an empty vocabulary; the next cells add the tokens of each corpus.
vocal_set = set()

In [54]:
# Add the tokens of the CNN stories.
vocal_set = compute_vocal(vocal_set, cnn_docs)

In [58]:
# Add the tokens of the DailyMail stories to the same set.
vocal_set = compute_vocal(vocal_set, daily_docs)

In [None]:
# Persist the combined vocabulary. `full_text` already ends with "/", so the
# resulting path contains a doubled slash.
store_vocabulary(vocal_set, full_text+'/vocabulary.json')

#### 2. entity file

In [13]:
def compute_entity_vocal(result, file_path, col=1):
    """Add the tokens of one column of an Excel workbook to ``result``.

    Parameters
    ----------
    result : set
        Vocabulary to extend; updated in place.
    file_path : str
        Workbook read with ``pandas.read_excel``.
    col : int, default 1
        Positional index of the column holding the entity strings.

    Returns
    -------
    set
        The same ``result`` object, extended with the tokens that
        ``process_token`` returns for each cell.

    Notes
    -----
    Only non-empty string cells are tokenized. Empty Excel cells are loaded by
    pandas as float NaN, which is truthy and has no ``len()``, so the previous
    ``s and len(s) > 0`` guard raised TypeError on them.
    """
    entity_df = pd.read_excel(file_path)

    for cell in entity_df.iloc[:, col]:
        if isinstance(cell, str) and cell:
            result.update(process_token(cell))

    return result

In [35]:
# Location vocabulary: tokens from column index 1 of both location workbooks.
vocal_location = set()
vocal_location = compute_entity_vocal(vocal_location, f"{entity}cnn_location.xlsx", col=1)
vocal_location = compute_entity_vocal(vocal_location, f"{entity}daily_loc.xlsx", col=1)
store_vocabulary(vocal_location, f"{tfidf}location/vocabulary.json")

In [55]:
# Date vocabulary: tokens from column index 1 of both date workbooks.
vocal_date = set()
vocal_date = compute_entity_vocal(vocal_date, f"{entity}cnn_date.xlsx", col=1)
vocal_date = compute_entity_vocal(vocal_date, f"{entity}daily_date.xlsx", col=1)

store_vocabulary(vocal_date, f"{tfidf}date/vocabulary.json")

In [None]:
# People vocabulary: column index 1 of the CNN people workbook and of the
# combined DailyMail people/organization workbook.
vocal_people = set()
vocal_people = compute_entity_vocal(vocal_people, f"{entity}cnn_people.xlsx", col=1)
vocal_people = compute_entity_vocal(vocal_people, f"{entity}daily_people_org.xlsx", col=1)

store_vocabulary(vocal_people, f"{tfidf}people/vocabulary.json")

In [14]:
# Organization vocabulary: column index 1 of the CNN workbook, but column
# index 2 of the combined DailyMail people/organization workbook.
vocal_org = set()
vocal_org = compute_entity_vocal(vocal_org, f"{entity}cnn_organization.xlsx", col=1)
vocal_org = compute_entity_vocal(vocal_org, f"{entity}daily_people_org.xlsx", col=2)

store_vocabulary(vocal_org, f"{tfidf}organization/vocabulary.json")

## Compute IDF

#### 1. full_text

In [4]:
# Reload the full-text vocabulary from its JSON file.
with open(f"{full_text}/vocabulary.json", mode='r') as file:
    vocabulary = json.load(file)

In [6]:
def compute_df(df_dict, file_paths):
    """Add the document frequencies of the given files to ``df_dict``.

    The files are read with ``read_file`` in a pool of 8 worker processes.
    For every non-empty document, each distinct token increments its counter
    in ``df_dict`` by one.

    Parameters
    ----------
    df_dict : dict
        Token -> count mapping, pre-populated with every vocabulary token;
        updated in place.
    file_paths : iterable
        Items handed one by one to ``read_file``.

    Returns
    -------
    dict
        The same ``df_dict`` object.

    Raises
    ------
    KeyError
        If a document contains a non-empty token missing from ``df_dict``.

    Notes
    -----
    The empty token is ignored: ``store_vocabulary`` removes "" from the
    vocabulary it writes, so a dict built from that vocabulary has no "" key
    and counting it would raise KeyError.
    """
    with Pool(processes=8) as pool:
        # Consume the results as they become available.
        for ID, document in pool.imap(read_file, file_paths):
            if not document:
                continue

            # A token counts once per document, however often it occurs.
            token_set = set(document)
            token_set.discard("")

            for token in token_set:
                df_dict[token] += 1

    return df_dict
                
            


In [19]:
def compute_idf(idf, n_docs=None):
    """Convert document frequencies to inverse document frequencies in place.

    Each value ``v`` in ``idf`` is replaced by ``np.log(N / v)``.

    Parameters
    ----------
    idf : dict
        Token -> document frequency mapping; overwritten with IDF values.
    n_docs : int, optional
        Corpus size ``N``. Defaults to ``len(daily_docs) + len(cnn_docs)``
        taken from the notebook namespace, which was the previously
        hard-coded value.

    Returns
    -------
    dict
        The same ``idf`` object, now holding IDF values.

    Raises
    ------
    ZeroDivisionError
        If a frequency is the plain integer 0.
    """
    if n_docs is None:
        N = len(daily_docs) + len(cnn_docs)
    else:
        N = n_docs

    # Only existing keys are reassigned, so the dict keeps its size while
    # being iterated.
    for term, doc_freq in idf.items():
        idf[term] = np.log(N / doc_freq)

    return idf
    

In [7]:
# One zeroed document-frequency counter per vocabulary token.
idf = dict.fromkeys(vocabulary, 0)

In [10]:
# Count document frequencies over the DailyMail stories.
idf = compute_df(idf, daily_docs)

In [11]:
# Add the document frequencies of the CNN stories to the same counters.
idf = compute_df(idf, cnn_docs)

In [12]:
# Turn the accumulated document frequencies into IDF values. The counts live
# in `idf`; the name `df` is never defined in this notebook.
idf = compute_idf(idf)

In [14]:
# Persist the full-text IDF table as JSON.
with open(f"{full_text}/idf.json", mode='w') as file:
    json.dump(idf, file)

#### 2. entity file

In [20]:
def compute_entity_df(df_dict, file_path, col=1):
    """Add the document frequencies of one Excel entity column to ``df_dict``.

    Each row of the column is treated as one document: every distinct token
    that ``process_token`` returns for the cell increments its counter once.

    Parameters
    ----------
    df_dict : dict
        Token -> count mapping, pre-populated with every vocabulary token;
        updated in place.
    file_path : str
        Workbook read with ``pandas.read_excel``.
    col : int, default 1
        Positional index of the column holding the entity strings.

    Returns
    -------
    dict
        The same ``df_dict`` object.

    Raises
    ------
    KeyError
        If a cell yields a non-empty token missing from ``df_dict``.

    Notes
    -----
    Only non-empty string cells are counted. Counting happens inside the
    guard: previously the counting loop ran for skipped rows too, re-counting
    the tokens of the preceding row (or raising NameError when the first row
    was skipped). Empty Excel cells are loaded by pandas as float NaN, which
    has no ``len()``, so they are filtered by type. The empty token is
    ignored because ``store_vocabulary`` removes "" from the vocabulary.
    """
    entity_df = pd.read_excel(file_path)

    for cell in entity_df.iloc[:, col]:
        if not isinstance(cell, str) or not cell:
            continue

        # A token counts once per row, however often it occurs in it.
        token_set = set(process_token(cell))
        token_set.discard("")

        for token in token_set:
            df_dict[token] += 1

    return df_dict

In [7]:
# Date entities: load the vocabulary, count per-row document frequencies in
# both date workbooks, convert them to IDF and save the result.
with open(f"{tfidf}date/vocabulary.json", mode='r') as file:
    vocabulary = json.load(file)

date_idf = dict.fromkeys(vocabulary, 0)
date_idf = compute_entity_df(date_idf, f"{entity}cnn_date.xlsx", col=1)
date_idf = compute_entity_df(date_idf, f"{entity}daily_date.xlsx", col=1)

date_idf = compute_idf(date_idf)
with open(f"{tfidf}date/idf.json", mode='w') as file:
    json.dump(date_idf, file)

In [10]:
# Location entities: load the vocabulary, count per-row document frequencies
# in both location workbooks, convert them to IDF and save the result.
with open(f"{tfidf}location/vocabulary.json", mode='r') as file:
    vocabulary = json.load(file)

loc_idf = dict.fromkeys(vocabulary, 0)
loc_idf = compute_entity_df(loc_idf, f"{entity}cnn_location.xlsx", col=1)
loc_idf = compute_entity_df(loc_idf, f"{entity}daily_loc.xlsx", col=1)

loc_idf = compute_idf(loc_idf)
with open(f"{tfidf}location/idf.json", mode='w') as file:
    json.dump(loc_idf, file)

In [21]:
# People entities: load the vocabulary, count per-row document frequencies in
# column index 1 of both workbooks, convert them to IDF and save the result.
with open(f"{tfidf}people/vocabulary.json", mode='r') as file:
    vocabulary = json.load(file)

people_idf = dict.fromkeys(vocabulary, 0)
people_idf = compute_entity_df(people_idf, f"{entity}cnn_people.xlsx", col=1)
people_idf = compute_entity_df(people_idf, f"{entity}daily_people_org.xlsx", col=1)

people_idf = compute_idf(people_idf)
with open(f"{tfidf}people/idf.json", mode='w') as file:
    json.dump(people_idf, file)

In [22]:
# Organization entities: load the vocabulary, count per-row document
# frequencies (column index 1 of the CNN workbook, column index 2 of the
# combined DailyMail workbook), convert them to IDF and save the result.
with open(f"{tfidf}organization/vocabulary.json", mode='r') as file:
    vocabulary = json.load(file)

org_idf = dict.fromkeys(vocabulary, 0)
org_idf = compute_entity_df(org_idf, f"{entity}cnn_organization.xlsx", col=1)
org_idf = compute_entity_df(org_idf, f"{entity}daily_people_org.xlsx", col=2)

org_idf = compute_idf(org_idf)
with open(f"{tfidf}organization/idf.json", mode='w') as file:
    json.dump(org_idf, file)