## Inspect each chunk for accuracy of "crime" label

#### Import library

In [None]:
import os

import pandas as pd

#### Manual Inspection of Topic Model Accuracy
- Load each file saved to `data/interim` during chunk processing (`03_TopicModelingAllBatches.ipynb`)  
- Skim the "title" and "article" columns to check whether the labeled articles are really CRIME-related news
    - Read the "title" and "article" of the labeled news in the `data/interim` folder
    - Note the filenames of non-relevant files for the next step

In [58]:
# Load the index of saved chunk files: one row per file, holding its filename,
# the topic id recorded for it, and the first/last source row it covers.
topic_index = pd.read_parquet(os.path.join('data', 'interim', 'crime_topic_index.gzip'), engine="pyarrow")

# Preview the first entries. head() is the idiomatic form and, unlike a list of
# explicit positions, does not raise if the index holds fewer than 3 rows.
topic_index.head(3)

Unnamed: 0,filename,topic,start_row,end_row
0,labeled_crime_row1_to_row10000.gzip,4,1,10000
1,labeled_crime_row10001_to_row20000.gzip,4,10001,20000
2,labeled_crime_row20001_to_row30000.gzip,5,20001,30000


In [71]:
# How many chunk files are listed in the index (i.e. how many need inspecting)
len(topic_index)

269

In [74]:
################# MODIFY start and end ############## 

# Increments of 5 is easy for the eyes
start = 0
end = 5 

#####################################################

# display setting
display_row = 3                          # number of sample articles shown per file
colnames = ['topic','title','article']   # columns read from each chunk file

# Positional slice of the index for this batch. A slice (instead of an explicit
# list of positions) is clipped to the available rows, so the final batch does
# not raise IndexError when `end` runs past the last file.
selected_chunk = topic_index.iloc[start:end]

for _, row in selected_chunk.iterrows():

    topic = row['topic']
    if topic == 99:  # 99 is a dummy topic
        continue

    # Use the filename stored in the index rather than rebuilding it from the
    # row bounds; this also keeps the user-set `start`/`end` above untouched.
    filename = row['filename']

    # First `display_row` articles of this file that carry the indexed topic
    article = (
        pd.read_parquet(os.path.join('data', 'interim', filename), engine="pyarrow")
        .query(f'topic=={topic}')
        .head(display_row)[colnames]
        .reset_index(drop=True)
    )

    # Show filename next to the samples so non-relevant files can be noted down
    display(article.assign(filename=filename)[['filename', *colnames]])



Unnamed: 0,filename,topic,title,article
0,labeled_crime_row1_to_row10000.gzip,4,"Venezuela detains six military, police officials: family members, activists","CARACAS (Reuters) - Venezuelan authorities have arrested six members of the country’s military and police forces over the weekend, according to relatives of the detainees and human rights activist..."
1,labeled_crime_row1_to_row10000.gzip,4,"Paradise, California, wildfire: why the fire threat to California is only growing","PARADISE, CALIFORNIA — Brook Jenkins moved to the town of Paradise to escape a rough neighborhood in nearby Chico and raise her three children in an idyllic small town, filled with trees. Paradise..."
2,labeled_crime_row1_to_row10000.gzip,4,Teen prisoners rioted and lit British Columbia's ‘super jail’ on fire this week,"A six-hour riot that started with a fire and devolved into a rampage through a youth ""super jail"" in British Columbia this week has confirmed long-held fears over what would happen when the provin..."


Unnamed: 0,filename,topic,title,article
0,labeled_crime_row10001_to_row20000.gzip,4,We Asked a Law Professor Whether the Government Could Really Ban Rough Sex,"John Doe was a freshman at George Mason University when he started seeing Jane Roe, a student at a different university (both subjects have been anonymous in media accounts and court documents). T..."
1,labeled_crime_row10001_to_row20000.gzip,4,Justin Bieber Fan Arrested for Trespassing At Singer's Beverly Hills Home,A Justin Bieber fan was arrested at the singer's Beverly Hills home Monday after cops say she wandered onto the property looking for the singer ... for the third time this week. Law enforcement so...
2,labeled_crime_row10001_to_row20000.gzip,4,Baylor University paid ex-football coach $15 million after sex scandal,(Reuters) - Baylor University in Texas paid more than $15.1 million to its former head football coach Art Briles after firing him in 2016 for failing to address students’ complaints of rape and se...


Unnamed: 0,filename,topic,title,article
0,labeled_crime_row20001_to_row30000.gzip,5,The Tour That Celebrates the Lives—Not Deaths—of Jack the Ripper's Victims,"Ever since Jack the Ripper claimed his first victim 130 years ago, investigating his legacy has become both a mainstream activity and a legitimized hobby. There have been TV programs, films, video..."
1,labeled_crime_row20001_to_row30000.gzip,5,Unexploded device spotted on one of attacked oil tankers -U.S. source,"WASHINGTON, June 13 (Reuters) - An unexploded device, believed to be a limpet mine, was spotted on the side of one of two oil tankers attacked on Thursday in the Gulf of Oman, a U.S. official told..."
2,labeled_crime_row20001_to_row30000.gzip,5,Hong Kong court favors gay couple in landmark victory for LGBT+ rights,BANGKOK (Thomson Reuters Foundation) - Hong Kong’s top court on Thursday ruled in favor of a gay civil servant fighting for spousal and tax benefits for his husband in the latest legal victory ove...


Unnamed: 0,filename,topic,title,article
0,labeled_crime_row30001_to_row40000.gzip,3,Forever 21 Pulls 'Rapey' Tee After Customer Complaints,"Forever 21 has pulled a controversial t-shirt after getting a huge backlash from consumers who called the gear ""shameful"" and ""rapey."" The graphic tee featured a slogan that seemingly referred to ..."
1,labeled_crime_row30001_to_row40000.gzip,3,Giegling Co-Founder Responds to Article Alleging He Made Sexist Remarks,"Update [June 22 2017, 11.20 AM]: London's Sunfall festival announced that it has removed Giegling from its bill following the allegations of Konstantin's sexist remarks about female DJs. Konstant..."
2,labeled_crime_row30001_to_row40000.gzip,3,Study: black people simply saying they’re multiracial makes others think they’re better-looking,"Newly published research indicates that black people are perceived as more attractive if they claim to be multiracial, regardless of the way they look. Let that sink in: All the study subjects had..."


Unnamed: 0,filename,topic,title,article
0,labeled_crime_row40001_to_row50000.gzip,10,"After scandals, Pope orders his diplomats to toe the line","VATICAN CITY (Reuters) - Pope Francis on Thursday told his ambassadors around the world, some of whom have been involved in sexual and financial scandals, to live humble, exemplary lives and be cl..."
1,labeled_crime_row40001_to_row50000.gzip,10,Chyna Autopsy Shows Toxic Cocktail of Rx Pills and Booze,WWE legend Chyna had traces of multiple prescription drugs in her system -- including oxycodone and Valium -- along with alcohol when she died ... this according to the autopsy report obtained by ...
2,labeled_crime_row40001_to_row50000.gzip,10,New York City terror attack: what we know so far,"A 29-year-old man drove a rental truck into a pedestrian and bike path along the Hudson River in Lower Manhattan in New York City Tuesday, killing eight people and injuring 11 in the deadliest ter..."


##### Record the filenames whose news is not crime-related below

In [75]:
# Fill in by hand: filenames whose sampled titles/articles don't look crime-related
non_crime_filenames = list()

---