## Topic Modeling
- This notebook walks through a topic-modeling process using `data/interim/subset_first_15000.gzip` 
- At the end of the notebook, the labeled data are saved to the `data/interim` folder

#### Import Libraries

In [1]:
# Change to parent directory (the project root) so that the `src` package is
# importable and the relative `data/...` paths used below resolve.
import os
# Guard the move: an unconditional chdir climbs one more level every time this
# cell is re-run. The `src` package directory marks the project root, so only
# move up while it is not visible from the current directory.
if not os.path.isdir('src'):
    os.chdir(os.pardir)

In [2]:
import re
import pickle 
from pprint import pprint
import pandas as pd
# Let pandas show up to 200 characters per cell when displaying frames.
pd.set_option('display.max_colwidth', 200)

import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)
# NOTE(review): this call silences every warning category, not only
# deprecations, so pandas/library warnings raised by later cells are hidden.
warnings.filterwarnings('ignore')


#### Helper functions to process raw data in chunks
(Functions from the `src.data_prep` package)  


In [3]:
from src.data_prep.topic_modeling_helpers import (preprocess_text, make_corpus,extract_labels,
                                                  find_best_crime_topic,build_lda_model, 
                                                  extract_labels)
from src.data_prep.preprocessing_helpers import impute_nans, remove_empty_articles

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jhonsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


For each chunk:
- preprocess the text (remove empty articles, impute NaNs)
- fit a topic model to it
- find the topic that best matches "crime"
    - keywords: ['black', 'police', 'violence', 'kill', 'arrest']
- save the articles labeled with that topic into the `data/interim` folder

In [4]:
# CONSTANTs (Hyperparameters) from previous notebook (TopicModelingFirstBatch.ipynb)
# They are forwarded unchanged to the topic-modeling helpers by `create_topic_model`.
ALPHA = 'asymmetric'  # passed to `build_lda_model` as `alpha`
ETA = 1  # passed to `build_lda_model` as `eta`
NTOPICS = 14  # passed as `n_topics` to `build_lda_model`, `find_best_crime_topic` and `extract_labels`

# Helper functions
def create_topic_model(dataset, start_row, end_row):
    """Topic-model one chunk of articles and save the rows labeled as crime.

    The chunk's `article` column is preprocessed, turned into a corpus and fed
    to `build_lda_model` with the notebook-level hyperparameters (`NTOPICS`,
    `ALPHA`, `ETA`). Every row is labeled with a topic, and the rows whose
    topic equals the one picked by `find_best_crime_topic` are written to
    `data/interim/labeled_crime_row{start_row}_to_row{end_row}.gzip`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Chunk of articles; must contain an `article` column. It is not
        modified: labeling happens on a copy.
    start_row, end_row : int
        Row range of this chunk. Only used to build the output file name.

    Returns
    -------
    tuple
        `(crime_subset, best_topic_no, filename)`: the labeled rows of the
        selected topic, the topic number chosen by `find_best_crime_topic`,
        and the base name of the parquet file that was written.
    """
    # Label a copy: assigning the `topic` column directly on the argument would
    # mutate the caller's frame (and may write into a filtered slice of it).
    labeled = dataset.copy()

    # Preprocess
    papers = labeled['article'].apply(preprocess_text)
    # Prepare topic modeling input (the bigram output is not needed here)
    corpus, id2word, _bigrams, data_lemmatized = make_corpus(papers)
    # Build model & find the topic number with best matching keywords
    lda_model = build_lda_model(corpus, id2word, n_topics=NTOPICS, alpha=ALPHA, eta=ETA)
    best_topic_no = find_best_crime_topic(lda_model, n_topics=NTOPICS)
    # Label every document, then keep only the documents of the selected topic
    labeled['topic'] = extract_labels(lda_model, data_lemmatized, corpus, n_topics=NTOPICS)
    crime_subset = labeled[labeled.topic == best_topic_no]

    # Save subset of crime (labeled) file
    filename = f'labeled_crime_row{start_row}_to_row{end_row}.gzip'
    output_dir = os.path.join('data', 'interim')
    # Make sure the target folder exists so the write cannot fail after the
    # (expensive) model fit just because the directory is missing.
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)
    crime_subset.to_parquet(filepath, compression='gzip')
    print(f'{filename} has ', crime_subset.shape[0], ' rows')
    return crime_subset, best_topic_no, filename
        
def process_chunk(dataset, start_row, end_row):
    """Clean one raw chunk and hand it to `create_topic_model`.

    The chunk first goes through `remove_empty_articles`, then `impute_nans`;
    the cleaned frame is topic-modeled and saved by `create_topic_model`,
    whose result is returned unchanged. `start_row` and `end_row` are passed
    straight through.
    """
    without_empty = remove_empty_articles(dataset)
    cleaned = impute_nans(without_empty)
    return create_topic_model(cleaned, start_row, end_row)


---

#### Processing each chunk
**BEWARE** This takes **13 hours** to run locally! 
  
Output:  
- `data/interim/labeled_crime_row{start_row}_to_row{end_row}.gzip`
- `data/interim/crime_topic_index.gzip` 

In [7]:

def start(chunksize, start_row, end_row):
    """Topic-model the raw news CSV chunk by chunk and write an index of the results.

    After an interactive confirmation, `data/raw/all-the-news-2-1.csv` is read
    in chunks of `chunksize` rows. Each chunk is handed to `process_chunk`,
    and one index record (output file name, selected topic number, row range)
    is kept per chunk. The index is written to
    `data/interim/crime_topic_index.gzip` once all chunks are done.

    Parameters
    ----------
    chunksize : int
        Number of CSV rows read per chunk; also the step by which the row
        range advances after each chunk.
    start_row, end_row : int
        Row range attributed to the first chunk.

    Returns
    -------
    None
    """
    raw_filepath = os.path.join(os.path.relpath('.'), 'data', 'raw', 'all-the-news-2-1.csv')

    confirm = input("do you want to start? [y/n]")
    # Accept 'y'/'Y' with surrounding whitespace; anything else cancels the run.
    if confirm.strip().lower() != 'y':
        # Do not report 'complete' when nothing was processed.
        print('aborted')
        return

    # One plain dict per chunk; the index frame is built once after the loop
    # (DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # inside a loop re-copies it on every iteration).
    index_records = []
    chunk_reader = pd.read_csv(raw_filepath, header=0,
                               chunksize=chunksize,
                               encoding='utf-8',
                               usecols=["date", "author", "title", "publication", "section", "url", "article"],
                               parse_dates=['date'])
    for chunk in chunk_reader:
        # The labeled rows are already saved to disk by the chunk processing,
        # so they are not accumulated in memory here.
        _crime_news, best_topic_no, fname = process_chunk(chunk, start_row, end_row)
        index_records.append({'filename': fname, 'topic': best_topic_no,
                              'start_row': start_row, 'end_row': end_row})

        # The range always advances by a full chunk, so the end row recorded
        # for a shorter final chunk is nominal.
        start_row += chunksize
        end_row += chunksize

    crime_topic_index = pd.DataFrame(index_records,
                                     columns=['filename', 'topic', 'start_row', 'end_row'])
    filepath = os.path.join('data', 'interim', 'crime_topic_index.gzip')
    crime_topic_index.to_parquet(filepath, compression='gzip')

    print('complete')
    


In [8]:
###### File chunk settings
# Row range of the first chunk; `start` advances both values by `chunksize`
# after every chunk and they end up in the output file names.
start_row = 1
chunksize = 20000  # This is equivalent to <25 mb parquet file
end_row = chunksize 

######  Runs the whole pipeline; `start` asks for confirmation before processing ########
start(chunksize, start_row, end_row)

do you want to start? [y/n] y


labeled_crime_row1_to_row20000.gzip has  1720  rows
labeled_crime_row20001_to_row40000.gzip has  2047  rows
labeled_crime_row40001_to_row60000.gzip has  2288  rows
labeled_crime_row60001_to_row80000.gzip has  1523  rows
labeled_crime_row80001_to_row100000.gzip has  1733  rows
labeled_crime_row100001_to_row120000.gzip has  1411  rows
labeled_crime_row120001_to_row140000.gzip has  1913  rows
labeled_crime_row140001_to_row160000.gzip has  1459  rows
labeled_crime_row160001_to_row180000.gzip has  1245  rows
labeled_crime_row180001_to_row200000.gzip has  899  rows
labeled_crime_row200001_to_row220000.gzip has  2288  rows
labeled_crime_row220001_to_row240000.gzip has  0  rows
labeled_crime_row240001_to_row260000.gzip has  1056  rows
labeled_crime_row260001_to_row280000.gzip has  0  rows
labeled_crime_row280001_to_row300000.gzip has  1084  rows
labeled_crime_row300001_to_row320000.gzip has  0  rows
labeled_crime_row320001_to_row340000.gzip has  0  rows
labeled_crime_row340001_to_row360000.gzi

---