## Inspect each chunk for accuracy of "crime" label

#### Import library

In [1]:
import os
import pandas as pd
pd.set_option('display.max_colwidth', 150)

#### Manual Inspection of Topic Model Accuracy
- Call each file saved during Chunk-processing (03_TopicModelingAllBatches.ipynb), saved in `data/interim`  
- Take a glance at "title" and "article" to see if they are really CRIME-related news
    - Read "title" and "article" of labeled news in `data/interim` folder
    - Mark non-relevant filenames for next step

In [2]:
os.getcwd()

'/Users/jhonsen/Documents/DS/dsProjects/racial-bias-detection/notebooks'

In [3]:
filepath = os.path.join('..','data', 'interim', 'crime_topic_index.gzip')
topic_index = pd.read_parquet(filepath)
topic_index.iloc[[*range(3)],:]

Unnamed: 0,filename,topic,start_row,end_row
0,labeled_crime_row1_to_row20000.gzip,12,1,20000
1,labeled_crime_row20001_to_row40000.gzip,5,20001,40000
2,labeled_crime_row40001_to_row60000.gzip,13,40001,60000


In [4]:
# Total number of files to inspect
topic_index.shape[0]

135

In [298]:
total = topic_index.shape[0]

def view(start, end):

    # display setting
    display_row = 3  # look at only the first 3 rows
    colnames = ['topic','title','article']
    selected_chunk = topic_index.iloc[[*range(start,end)],:]

    for index, row in selected_chunk.iterrows():

        if row['topic'] != 99:  # 99 is a dummy topic

            # get title and articles
            start,end,topic = row['start_row'], row['end_row'], row['topic']
            filename = f'labeled_crime_row{start}_to_row{end}.gzip'

            article = pd.read_parquet(os.path.join('..','data', 'interim', filename), 
                                      engine="pyarrow").query(f'topic=={topic}').head(display_row)[colnames]        
            select_topic_index = topic_index[(topic_index['start_row']==start) & (topic_index['end_row']==end)][['filename','topic']]

            display(pd.merge(select_topic_index, article, how='inner', on='topic'))

def data_batch(total):
    for start in range(0, total, 4): 
        end = start + 4
        try:
            print(start, end)
            yield view(start, end)
        except IndexError:
            end = total
            view(start, end)

# Initialize
go = data_batch(total)

In [297]:
# Iterate over 135 total files
next(go)

132 136


StopIteration: 

##### Record the filenames of non-relevant crime news below

In [75]:
# Insert the filenames for which title/article dont appear to be crime related

non_crime_filenames = ['labeled_crime_row200001_to_row220000.gzip',
                      'labeled_crime_row420001_to_row440000.gzip',
                      'labeled_crime_row720001_to_row740000.gzip',
                      'labeled_crime_row760001_to_row780000.gzip',
                       'labeled_crime_row1160001_to_row1180000.gzip',
                       'labeled_crime_row1260001_to_row1280000.gzip',
                       'labeled_crime_row1340001_to_row1360000.gzip',
                       'labeled_crime_row1380001_to_row1400000.gzip',
                       'labeled_crime_row1420001_to_row1440000.gzip',
                       'labeled_crime_row1500001_to_row1520000.gzip',
                       'labeled_crime_row1520001_to_row1540000.gzip',
                       'labeled_crime_row1740001_to_row1760000.gzip',
                       'labeled_crime_row2140001_to_row2160000.gzip',
                       'labeled_crime_row2140001_to_row2160000.gzip',
                       'labeled_crime_row2460001_to_row2480000.gzip',
                      ]

---