In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from ctfidf import CTFIDFVectorizer
import pyperclip
from tqdm import tqdm
tqdm.pandas()

In [2]:
stopwords_en = nltk.corpus.stopwords.words('english')
#stopwords_en
stopwords_custom = ["mr","ms","mrs","said","one","new"]
stopwords_en = stopwords_en + stopwords_custom
stopwords_en

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [3]:
def get_argmax(row):
    max_count = 0
    argmax = None
    #print(row)
    for row_key, row_val in row.iteritems():
        if row_val > max_count:
            max_count = row_val
            argmax = row_key
    return argmax

### Cleaning

In [4]:
punct_list = '“.’”,—()'
def remove_punct(text):
    # Special case for "-", we replace it with a space
    clean_chars = [c.replace("-"," ") for c in text if c not in punct_list]
    return "".join(clean_chars)

In [5]:
plural_map = {
    "presidents": "president",
    "wars": "war",
    "games": "game", "leagues": "league", "teams": "team",
    "books": "book",
    "films": "film",
    "restaurants": "restaurant",
    "homes": "home",
    "house": "home",
    "apartments": "apartment",
    "countries": "country", "countrys": "country",
    "companies": "company", "companys": "company",
    "fashionable": "fashion",
    "states": "state",
    "republicans": "republican",
    "readers": "read", "reading": "read",
    "senators": "senator",
    "bathroom": "bathrooms",
    "bedroom": "bedrooms",
    "museums": "museum",
}
def map_plurals(text):
    text_tokens = text.split()
    clean_tokens = [(plural_map[t] if t in plural_map else t) for t in text_tokens]
    return " ".join(clean_tokens)

In [6]:
def remove_stopwords(text):
    text_tokens = text.split()
    clean_tokens = [t for t in text_tokens if t not in stopwords_en]
    return " ".join(clean_tokens)

In [7]:
def preprocess_corpus(data_df):
    data_df['text'] = data_df['text_raw'].str.lower()
    data_df['text'] = data_df['text'].apply(remove_punct)
    data_df['text'] = data_df['text'].apply(remove_stopwords)
    data_df['text'] = data_df['text'].apply(map_plurals)

In [8]:
def compute_ctfidf(data_df, classnum_to_sec):
    doc_class_df = data_df.groupby(['sec_num'], as_index=False).agg({'text': ' '.join})
    # Create bag of words
    ngrams = 1
    #min_df = 0.4
    #min_df = 0.24
    min_df = 0.1
    count_vectorizer = CountVectorizer(ngram_range=(1,ngrams), min_df=min_df)
    #count_vectorizer = CountVectorizer(ngram_range=(1,ngrams))
    count_vectorizer = count_vectorizer.fit(doc_class_df['text'])
    count = count_vectorizer.transform(doc_class_df['text'])
    words = count_vectorizer.get_feature_names_out()
    # Extract top 10 words
    ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(data_df)).toarray()
    num_top_words = 50
    words_per_class = {classnum_to_sec[label]: list(reversed([words[index] for index in ctfidf[label].argsort()[-num_top_words:]]))
                       for label in doc_class_df['sec_num']}
    return words_per_class

### cTFIDF Results

In [9]:
def print_ctfidf(words_per_class):
    for cur_class in words_per_class.keys():
        print(f"=====[ {cur_class} ]=====")
        cur_class_words = words_per_class[cur_class]
        #print(cur_class_words)
        #print([w for w in cur_class_words if w.endswith("s")])
        print([w for w in cur_class_words])
        #joblib.dump(cur_class_words, f"./indic_words/{cur_class}.pkl")

In [10]:
def get_token_counts(token_list, section):
    section_keywords = keywords[section]
    total_count = 0
    for cur_sec_kw in section_keywords:
        total_count += token_list.count(cur_sec_kw)
    return total_count

## Now the full articles (from the LDC corpus)

In [11]:
import os
import tarfile

import xmltodict

In [12]:
ldc_path = "../corpus_nyt/2006/"

In [13]:
os.listdir(ldc_path)

['01.tgz',
 '02.tgz',
 '03.tgz',
 '04.tgz',
 '05.tgz',
 '06.tgz',
 '07.tgz',
 '08.tgz',
 '09.tgz',
 '10.tgz',
 '11.tgz',
 '12.tgz']

In [14]:
ldc_fpath = os.path.join(ldc_path, "01.tgz")

In [15]:
tar = tarfile.open(ldc_fpath, "r:gz")

In [16]:
def parse_content(content):
    content_str = content.decode("utf-8")
    content_dict = xmltodict.parse(content_str)
    return content_dict['nitf']

In [17]:
def get_dict_item(item_list, key, val, other_key):
    # Helper fn
    results = [item for item in item_list if item[key] == val]
    if len(results) == 0:
        return ""
    return results[0][other_key]

In [18]:
content_rows = []
for member in tar.getmembers():
    #print(tar.getnames())
    f = tar.extractfile(member)
    if f is not None:
        #print(dir(member))
        print(member.name)
        content = f.read()
        content_dict = parse_content(content)
        sections = get_dict_item(content_dict['head']['meta'], "@name", "online_sections", "@content")
        if sections == "Paid Death Notices":
            continue
        if "Corrections" in sections:
            continue
        body_content = content_dict['body']['body.content']
        if body_content is None:
            continue
        body_block = body_content['block']
        if type(body_block) == list:
            paras = get_dict_item(body_block, "@class", "full_text", "p")
            if type(paras) == list:
                full_text = " ".join(paras)
            else:
                full_text = paras
        else:
            # It's just one element, a dict
            paras = body_block['p']
        content_rows.append({'id':member.name, 'sections':sections, 'text_raw':full_text})
        #break

01/17/1732746.xml
01/17/1732662.xml
01/17/1732665.xml
01/17/1732619.xml
01/17/1732741.xml
01/17/1732733.xml
01/17/1732617.xml
01/17/1732610.xml
01/17/1732748.xml
01/17/1732734.xml
01/17/1732625.xml
01/17/1732659.xml
01/17/1732688.xml
01/17/1732701.xml
01/17/1732706.xml
01/17/1732622.xml
01/17/1732650.xml
01/17/1732681.xml
01/17/1732708.xml
01/17/1732774.xml
01/17/1732773.xml
01/17/1732657.xml
01/17/1732686.xml
01/17/1732762.xml
01/17/1732697.xml
01/17/1732646.xml
01/17/1732719.xml
01/17/1732690.xml
01/17/1732641.xml
01/17/1732765.xml
01/17/1732717.xml
01/17/1732633.xml
01/17/1732634.xml
01/17/1732710.xml
01/17/1732699.xml
01/17/1732648.xml
01/17/1732759.xml
01/17/1732725.xml
01/17/1732722.xml
01/17/1732674.xml
01/17/1732750.xml
01/17/1732608.xml
01/17/1732757.xml
01/17/1732673.xml
01/17/1732772.xml
01/17/1732656.xml
01/17/1732687.xml
01/17/1732651.xml
01/17/1732680.xml
01/17/1732709.xml
01/17/1732775.xml
01/17/1732707.xml
01/17/1732623.xml
01/17/1732624.xml
01/17/1732658.xml
01/17/1732

In [19]:
art_df = pd.DataFrame(content_rows)

In [20]:
sections_to_keep = [
    "Sports",
    #"Business",
    "Arts", "U.S.", "World", "Real Estate"
]
def clean_art_df(orig_art_df):
    secnum_to_sec = {i: sections_to_keep[i] for i in range(len(sections_to_keep))}
    orig_art_df['sections'] = orig_art_df['sections'].str.replace("Front Page; ","")
    orig_art_df['num_sections'] = orig_art_df['sections'].apply(lambda x: len(x.split(";")))
    # Drop articles with multiple sections, for now
    new_art_df = orig_art_df[orig_art_df['num_sections'] == 1].copy()
    # And now just keep the desired sections
    new_art_df = new_art_df[new_art_df['sections'].isin(sections_to_keep)].copy()
    new_art_df['sec_num'] = new_art_df['sections'].apply(lambda x: sections_to_keep.index(x))
    return new_art_df, secnum_to_sec

In [21]:
art_df, secnum_to_sec = clean_art_df(art_df)

In [22]:
art_df['sections'].value_counts()

Sports         565
Arts           459
World          370
U.S.           357
Real Estate     70
Name: sections, dtype: int64

In [23]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0
14,01/17/1732774.xml,World,Pope Benedict XVI visited the main synagogue i...,1,3
15,01/17/1732773.xml,World,The United Nations placed eight managers on pa...,1,3
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2
...,...,...,...,...,...
6122,01/27/1735148.xml,World,The speaker of Russia's Parliament snubbed a c...,1,3
6137,01/27/1735001.xml,Arts,A mysterious collector went on a Dutch buying ...,1,1
6138,01/27/1735157.xml,U.S.,An audit found that millions of dollars in Dep...,1,2
6139,01/27/1735150.xml,World,A federal judge in New York denied bail to Ton...,1,3


### Define Keywords

In [24]:
keywords = {
    "U.S.": ["state","court","federal","republican"],
    "World": ["government","country","officials","minister"],
    #"Arts": ["music", "art", "artist", "show"],
    "Arts": ["music","show","art","dance",], #"museum"],
    "Sports": ["game", "league", "team", "coach"],
    #"Real Estate": ["home", "apartment", "available", "building"],
    "Real Estate": ["home","bedrooms","bathrooms","building"],
}

In [25]:
# Get keywords in LaTeX form
for cur_section, cur_kw_list in keywords.items():
    tex_list = ["\\texttt{" + kw + "}" for kw in cur_kw_list]
    kw_str = ", ".join(tex_list)
    print(f"{cur_section} & {kw_str} \\\\")

U.S. & \texttt{state}, \texttt{court}, \texttt{federal}, \texttt{republican} \\
World & \texttt{government}, \texttt{country}, \texttt{officials}, \texttt{minister} \\
Arts & \texttt{music}, \texttt{show}, \texttt{art}, \texttt{dance} \\
Sports & \texttt{game}, \texttt{league}, \texttt{team}, \texttt{coach} \\
Real Estate & \texttt{home}, \texttt{bedrooms}, \texttt{bathrooms}, \texttt{building} \\


In [26]:
preprocess_corpus(art_df)

### cTFIDF

In [27]:
words_per_class = compute_ctfidf(art_df, secnum_to_sec)
#words_per_class

In [28]:
print_ctfidf(words_per_class)

=====[ Sports ]=====
['game', 'team', 'season', 'coach', 'first', 'last', 'year', 'play', 'league', 'two', 'would', 'football', 'players', 'time', 'points', 'back', 'like', 'second', 'we', 'three', 'he', 'it', 'going', 'knicks', 'years', 'could', 'bowl', 'get', 'think', 'state', 'left', 'jets', 'player', 'victory', 'that', 'giants', 'world', '10', 'nfl', 'night', 'made', 'brown', 'played', 'way', 'four', 'yards', 'five', 'third', 'yesterday', 'week']
=====[ Arts ]=====
['music', 'like', 'show', 'art', 'pm', 'york', 'street', 'work', 'two', 'museum', 'also', 'time', 'the', 'first', 'would', 'year', 'american', 'made', 'even', 'city', 'million', 'dance', 'center', 'years', 'band', 'series', 'last', 'theater', 'much', 'album', 'three', 'songs', 'well', 'world', 'way', 'works', '212', 'opera', 'night', 'home', 'could', 'rock', 'still', 'orchestra', 'film', 'song', 'west', 'many', 'gallery', 'old']
=====[ U.S. ]=====
['judge', 'alito', 'court', 'would', 'republican', 'home', 'state', 'peopl

In [29]:
keywords_sub = {k: v for k, v in keywords.items() if k in sections_to_keep}

In [30]:
keywords_sub

{'U.S.': ['state', 'court', 'federal', 'republican'],
 'World': ['government', 'country', 'officials', 'minister'],
 'Arts': ['music', 'show', 'art', 'dance'],
 'Sports': ['game', 'league', 'team', 'coach'],
 'Real Estate': ['home', 'bedrooms', 'bathrooms', 'building']}

In [31]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3,russia china affirmed monday iran must resume ...
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0,lauryn williams reigning world champion women'...
14,01/17/1732774.xml,World,Pope Benedict XVI visited the main synagogue i...,1,3,pope benedict xvi visited main synagogue rome ...
15,01/17/1732773.xml,World,The United Nations placed eight managers on pa...,1,3,united nations placed eight managers paid susp...
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2,protesters wore yellow black armbands chanted ...
...,...,...,...,...,...,...
6122,01/27/1735148.xml,World,The speaker of Russia's Parliament snubbed a c...,1,3,speaker russia's parliament snubbed call parli...
6137,01/27/1735001.xml,Arts,A mysterious collector went on a Dutch buying ...,1,1,mysterious collector went dutch buying spree y...
6138,01/27/1735157.xml,U.S.,An audit found that millions of dollars in Dep...,1,2,audit found millions dollars department homela...
6139,01/27/1735150.xml,World,A federal judge in New York denied bail to Ton...,1,3,federal judge york denied bail tongsun park ko...


In [32]:
art_df['text_tokens'] = art_df['text'].str.split()

In [33]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3,russia china affirmed monday iran must resume ...,"[russia, china, affirmed, monday, iran, must, ..."
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0,lauryn williams reigning world champion women'...,"[lauryn, williams, reigning, world, champion, ..."
14,01/17/1732774.xml,World,Pope Benedict XVI visited the main synagogue i...,1,3,pope benedict xvi visited main synagogue rome ...,"[pope, benedict, xvi, visited, main, synagogue..."
15,01/17/1732773.xml,World,The United Nations placed eight managers on pa...,1,3,united nations placed eight managers paid susp...,"[united, nations, placed, eight, managers, pai..."
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2,protesters wore yellow black armbands chanted ...,"[protesters, wore, yellow, black, armbands, ch..."
...,...,...,...,...,...,...,...
6122,01/27/1735148.xml,World,The speaker of Russia's Parliament snubbed a c...,1,3,speaker russia's parliament snubbed call parli...,"[speaker, russia's, parliament, snubbed, call,..."
6137,01/27/1735001.xml,Arts,A mysterious collector went on a Dutch buying ...,1,1,mysterious collector went dutch buying spree y...,"[mysterious, collector, went, dutch, buying, s..."
6138,01/27/1735157.xml,U.S.,An audit found that millions of dollars in Dep...,1,2,audit found millions dollars department homela...,"[audit, found, millions, dollars, department, ..."
6139,01/27/1735150.xml,World,A federal judge in New York denied bail to Ton...,1,3,federal judge york denied bail tongsun park ko...,"[federal, judge, york, denied, bail, tongsun, ..."


In [34]:
for section, sec_keywords in keywords.items():
    get_counts_partial = lambda token_list: get_token_counts(token_list, section)
    art_df[section] = art_df['text_tokens'].apply(get_counts_partial)

In [35]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Sports,Real Estate
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3,russia china affirmed monday iran must resume ...,"[russia, china, affirmed, monday, iran, must, ...",8,13,0,0,0
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0,lauryn williams reigning world champion women'...,"[lauryn, williams, reigning, world, champion, ...",3,0,0,1,0
14,01/17/1732774.xml,World,Pope Benedict XVI visited the main synagogue i...,1,3,pope benedict xvi visited main synagogue rome ...,"[pope, benedict, xvi, visited, main, synagogue...",0,0,0,0,0
15,01/17/1732773.xml,World,The United Nations placed eight managers on pa...,1,3,united nations placed eight managers paid susp...,"[united, nations, placed, eight, managers, pai...",0,0,0,0,0
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2,protesters wore yellow black armbands chanted ...,"[protesters, wore, yellow, black, armbands, ch...",0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6122,01/27/1735148.xml,World,The speaker of Russia's Parliament snubbed a c...,1,3,speaker russia's parliament snubbed call parli...,"[speaker, russia's, parliament, snubbed, call,...",0,2,0,0,0
6137,01/27/1735001.xml,Arts,A mysterious collector went on a Dutch buying ...,1,1,mysterious collector went dutch buying spree y...,"[mysterious, collector, went, dutch, buying, s...",1,2,5,0,3
6138,01/27/1735157.xml,U.S.,An audit found that millions of dollars in Dep...,1,2,audit found millions dollars department homela...,"[audit, found, millions, dollars, department, ...",0,0,0,0,0
6139,01/27/1735150.xml,World,A federal judge in New York denied bail to Ton...,1,3,federal judge york denied bail tongsun park ko...,"[federal, judge, york, denied, bail, tongsun, ...",2,0,0,0,0


In [36]:
art_df['predicted'] = art_df[sections_to_keep].apply(get_argmax, axis=1)

In [37]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Sports,Real Estate,predicted
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3,russia china affirmed monday iran must resume ...,"[russia, china, affirmed, monday, iran, must, ...",8,13,0,0,0,World
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0,lauryn williams reigning world champion women'...,"[lauryn, williams, reigning, world, champion, ...",3,0,0,1,0,U.S.
14,01/17/1732774.xml,World,Pope Benedict XVI visited the main synagogue i...,1,3,pope benedict xvi visited main synagogue rome ...,"[pope, benedict, xvi, visited, main, synagogue...",0,0,0,0,0,
15,01/17/1732773.xml,World,The United Nations placed eight managers on pa...,1,3,united nations placed eight managers paid susp...,"[united, nations, placed, eight, managers, pai...",0,0,0,0,0,
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2,protesters wore yellow black armbands chanted ...,"[protesters, wore, yellow, black, armbands, ch...",0,0,1,0,1,Arts
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6122,01/27/1735148.xml,World,The speaker of Russia's Parliament snubbed a c...,1,3,speaker russia's parliament snubbed call parli...,"[speaker, russia's, parliament, snubbed, call,...",0,2,0,0,0,World
6137,01/27/1735001.xml,Arts,A mysterious collector went on a Dutch buying ...,1,1,mysterious collector went dutch buying spree y...,"[mysterious, collector, went, dutch, buying, s...",1,2,5,0,3,Arts
6138,01/27/1735157.xml,U.S.,An audit found that millions of dollars in Dep...,1,2,audit found millions dollars department homela...,"[audit, found, millions, dollars, department, ...",0,0,0,0,0,
6139,01/27/1735150.xml,World,A federal judge in New York denied bail to Ton...,1,3,federal judge york denied bail tongsun park ko...,"[federal, judge, york, denied, bail, tongsun, ...",2,0,0,0,0,U.S.


In [38]:
predict_df = art_df[~pd.isna(art_df['predicted'])].copy()

In [39]:
predict_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Sports,Real Estate,predicted
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3,russia china affirmed monday iran must resume ...,"[russia, china, affirmed, monday, iran, must, ...",8,13,0,0,0,World
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0,lauryn williams reigning world champion women'...,"[lauryn, williams, reigning, world, champion, ...",3,0,0,1,0,U.S.
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2,protesters wore yellow black armbands chanted ...,"[protesters, wore, yellow, black, armbands, ch...",0,0,1,0,1,Arts
18,01/17/1732762.xml,Sports,The Denver Broncos' path from afterthought to ...,1,0,denver broncos' path afterthought super bowl f...,"[denver, broncos', path, afterthought, super, ...",0,1,0,18,1,Sports
19,01/17/1732697.xml,World,A motorcycle-riding suicide bomber rode into a...,1,3,motorcycle riding suicide bomber rode crowd wr...,"[motorcycle, riding, suicide, bomber, rode, cr...",0,7,0,0,0,World
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6120,01/27/1734998.xml,Arts,There is no consistent name for ''Chinatown'' ...,1,1,consistent name ''chinatown'' chinese newspape...,"[consistent, name, ''chinatown'', chinese, new...",1,4,6,1,5,Arts
6122,01/27/1735148.xml,World,The speaker of Russia's Parliament snubbed a c...,1,3,speaker russia's parliament snubbed call parli...,"[speaker, russia's, parliament, snubbed, call,...",0,2,0,0,0,World
6137,01/27/1735001.xml,Arts,A mysterious collector went on a Dutch buying ...,1,1,mysterious collector went dutch buying spree y...,"[mysterious, collector, went, dutch, buying, s...",1,2,5,0,3,Arts
6139,01/27/1735150.xml,World,A federal judge in New York denied bail to Ton...,1,3,federal judge york denied bail tongsun park ko...,"[federal, judge, york, denied, bail, tongsun, ...",2,0,0,0,0,U.S.


In [40]:
predict_df['correct'] = predict_df['sections'] == predict_df['predicted']

In [41]:
correct_counts = predict_df['correct'].value_counts()

In [42]:
correct_counts

True     1358
False     360
Name: correct, dtype: int64

In [43]:
correct_counts[True] / (correct_counts[True]+correct_counts[False])

0.7904540162980209

In [44]:
predict_df.rename(columns={'U.S.':'U.S. News', 'World':'World News'}, inplace=True)
predict_df['sections'] = predict_df['sections'].apply(lambda x: "U.S. News" if x == "U.S." else x)
predict_df['sections'] = predict_df['sections'].apply(lambda x: "World News" if x == "World" else x)
predict_df['correct'] = predict_df['correct'].apply(lambda x: "Correct" if x else "Incorrect")

In [45]:
predict_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S. News,World News,Arts,Sports,Real Estate,predicted,correct
8,01/17/1732688.xml,World News,Russia and China affirmed Monday that Iran mus...,1,3,russia china affirmed monday iran must resume ...,"[russia, china, affirmed, monday, iran, must, ...",8,13,0,0,0,World,Correct
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0,lauryn williams reigning world champion women'...,"[lauryn, williams, reigning, world, champion, ...",3,0,0,1,0,U.S.,Incorrect
17,01/17/1732686.xml,U.S. News,Protesters wore yellow and black armbands and ...,1,2,protesters wore yellow black armbands chanted ...,"[protesters, wore, yellow, black, armbands, ch...",0,0,1,0,1,Arts,Incorrect
18,01/17/1732762.xml,Sports,The Denver Broncos' path from afterthought to ...,1,0,denver broncos' path afterthought super bowl f...,"[denver, broncos', path, afterthought, super, ...",0,1,0,18,1,Sports,Correct
19,01/17/1732697.xml,World News,A motorcycle-riding suicide bomber rode into a...,1,3,motorcycle riding suicide bomber rode crowd wr...,"[motorcycle, riding, suicide, bomber, rode, cr...",0,7,0,0,0,World,Correct
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6120,01/27/1734998.xml,Arts,There is no consistent name for ''Chinatown'' ...,1,1,consistent name ''chinatown'' chinese newspape...,"[consistent, name, ''chinatown'', chinese, new...",1,4,6,1,5,Arts,Correct
6122,01/27/1735148.xml,World News,The speaker of Russia's Parliament snubbed a c...,1,3,speaker russia's parliament snubbed call parli...,"[speaker, russia's, parliament, snubbed, call,...",0,2,0,0,0,World,Correct
6137,01/27/1735001.xml,Arts,A mysterious collector went on a Dutch buying ...,1,1,mysterious collector went dutch buying spree y...,"[mysterious, collector, went, dutch, buying, s...",1,2,5,0,3,Arts,Correct
6139,01/27/1735150.xml,World News,A federal judge in New York denied bail to Ton...,1,3,federal judge york denied bail tongsun park ko...,"[federal, judge, york, denied, bail, tongsun, ...",2,0,0,0,0,U.S.,Incorrect


In [46]:
ct_counts = pd.crosstab(predict_df['correct'], predict_df['sections'])
ct_counts.index.name = None
ct_counts.columns.name = None
ct_counts.columns = [str(c) for c in ct_counts.columns]
ct_counts.index = [str(i) for i in ct_counts.index]

In [47]:
ct_counts['Total'] = ct_counts.sum(axis=1)

In [48]:
ct_counts

Unnamed: 0,Arts,Real Estate,Sports,U.S. News,World News,Total
Correct,327,65,520,228,218,1358
Incorrect,101,4,37,109,109,360


In [49]:
for cur_col in ct_counts.columns:
    cur_correct = ct_counts.loc["Correct",cur_col]
    cur_incorrect = ct_counts.loc["Incorrect",cur_col]
    ct_counts.loc["Accuracy",cur_col] = cur_correct / (cur_correct + cur_incorrect)

In [50]:
ct_counts.index

Index(['Correct', 'Incorrect', 'Accuracy'], dtype='object')

In [51]:
ct_counts

Unnamed: 0,Arts,Real Estate,Sports,U.S. News,World News,Total
Correct,327.0,65.0,520.0,228.0,218.0,1358.0
Incorrect,101.0,4.0,37.0,109.0,109.0,360.0
Accuracy,0.764019,0.942029,0.933573,0.676558,0.666667,0.790454


In [52]:
with pd.option_context('display.float_format', '{:.3f}'.format):
    display(ct_counts)

Unnamed: 0,Arts,Real Estate,Sports,U.S. News,World News,Total
Correct,327.0,65.0,520.0,228.0,218.0,1358.0
Incorrect,101.0,4.0,37.0,109.0,109.0,360.0
Accuracy,0.764,0.942,0.934,0.677,0.667,0.79


In [53]:
ct_counts_str = ct_counts.copy()

In [54]:
ct_counts_str = ct_counts_str.applymap(lambda x: "{:.3f}".format(x))

In [55]:
ct_counts_str = ct_counts_str.applymap(lambda x: x.replace(".00",""))

In [56]:
ct_counts_str

Unnamed: 0,Arts,Real Estate,Sports,U.S. News,World News,Total
Correct,3270.0,650.0,5200.0,2280.0,2180.0,13580.0
Incorrect,1010.0,40.0,370.0,1090.0,1090.0,3600.0
Accuracy,0.764,0.942,0.934,0.677,0.667,0.79


In [57]:
tex_buffer = ""
def printb(text):
    global tex_buffer
    tex_buffer = tex_buffer + text + "\n"
printb("\\toprule")
header_str = " & ".join(["\\textbf{" + c + "}" for c in ct_counts_str.columns.values])
header_str = " & " + header_str + " \\\\"
printb(header_str)
printb("\\midrule")
for row_index, row in ct_counts_str.iterrows():
    index_str = "\\textbf{" + str(row_index) + "}"
    row_str = " & ".join(row)
    if row_index == "Accuracy":
        printb("\\midrule")
    printb(index_str + " & " + row_str + " \\\\")
printb("\\bottomrule")
print(tex_buffer)

\toprule
 & \textbf{Arts} & \textbf{Real Estate} & \textbf{Sports} & \textbf{U.S. News} & \textbf{World News} & \textbf{Total} \\
\midrule
\textbf{Correct} & 3270 & 650 & 5200 & 2280 & 2180 & 13580 \\
\textbf{Incorrect} & 1010 & 40 & 370 & 1090 & 1090 & 3600 \\
\midrule
\textbf{Accuracy} & 0.764 & 0.942 & 0.934 & 0.677 & 0.667 & 0.790 \\
\bottomrule



In [58]:
pyperclip.copy(tex_buffer)

## Extract excerpts

In [59]:
wash_df = predict_df[predict_df['text'].str.contains("washington") & predict_df['text'].str.contains("officials")].copy()

In [60]:
len(wash_df)

112

In [61]:
wash_df.iloc[6]['text_raw']

'Jack Abramoff pleaded guilty to three felony counts on Tuesday as part of a settlement with federal prosecutors, instantly becoming the star witness in a sweeping federal investigation into public corruption in Washington. The inquiry could involve as many as a dozen lawmakers, people involved in the case said. Mr. Abramoff, 46, once a prominent Republican lobbyist, accepted a recommended reduced prison sentence of about 10 years in exchange for testifying against former associates in the influence-peddling case. The agreement also requires Mr. Abramoff to pay more than $26 million in tax penalties and restitution to former clients, although he has told associates he is broke. The corruption inquiry involving Mr. Abramoff, potentially one of the most explosive in Congressional history, has expanded in recent months to encompass dozens of political operatives, including former Congressional aides and lobbyists suspected of arranging bribes in exchange for legislative work, participants

In [69]:
def extract_text(df):
    text_buff = ""
    for row_index in range(len(df)):
        cur_text = df.iloc[row_index]['text_raw']
        text_buff += cur_text + "\n\n\n"
    return text_buff

In [81]:
clint_df = predict_df[predict_df['text'].str.contains("the president")].copy()
len(clint_df)

22

In [82]:
clint_text = extract_text(clint_df)
pyperclip.copy(clint_text)

In [63]:
#pyperclip.copy(predict_df.loc[275,]['text_raw'])

In [64]:
wash_text = extract_text(wash_df)

In [65]:
pyperclip.copy(wash_text)

### Now more text

(The full year, rather than just one month)

In [27]:
import glob

In [28]:
ldc_fpaths = glob.glob("../corpus_nyt/2006/*.tgz")
ldc_fpaths

['../corpus_nyt/2006\\01.tgz',
 '../corpus_nyt/2006\\02.tgz',
 '../corpus_nyt/2006\\03.tgz',
 '../corpus_nyt/2006\\04.tgz',
 '../corpus_nyt/2006\\05.tgz',
 '../corpus_nyt/2006\\06.tgz',
 '../corpus_nyt/2006\\07.tgz',
 '../corpus_nyt/2006\\08.tgz',
 '../corpus_nyt/2006\\09.tgz',
 '../corpus_nyt/2006\\10.tgz',
 '../corpus_nyt/2006\\11.tgz',
 '../corpus_nyt/2006\\12.tgz']

In [29]:
def parse_content(content):
    content_str = content.decode("utf-8")
    content_dict = xmltodict.parse(content_str)
    return content_dict['nitf']

def get_dict_item(item_list, key, val, other_key):
    # Helper fn
    results = [item for item in item_list if item[key] == val]
    if len(results) == 0:
        return ""
    return results[0][other_key]

In [30]:
def tar_to_df(tar_obj):
    content_rows = []
    for member in tar.getmembers():
        #print(tar.getnames())
        f = tar.extractfile(member)
        if f is not None:
            #print(dir(member))
            print(member.name)
            content = f.read()
            content_dict = parse_content(content)
            sections = get_dict_item(content_dict['head']['meta'], "@name", "online_sections", "@content")
            if sections == "Paid Death Notices":
                continue
            if "Corrections" in sections:
                continue
            body_content = content_dict['body']['body.content']
            if body_content is None:
                continue
            body_block = body_content['block']
            if type(body_block) == list:
                paras = get_dict_item(body_block, "@class", "full_text", "p")
                if type(paras) == list:
                    full_text = " ".join(paras)
                else:
                    full_text = paras
            else:
                # It's just one element, a dict
                paras = body_block['p']
            content_rows.append({'id':member.name, 'sections':sections, 'text_raw':full_text})
            #break
    art_df = pd.DataFrame(content_rows)
    return art_df

In [31]:
all_dfs = []
for cur_fpath in ldc_fpaths:
    tar = tarfile.open(cur_fpath, "r:gz")
    cur_df = tar_to_df(tar)
    all_dfs.append(cur_df)

01/17/1732746.xml
01/17/1732662.xml
01/17/1732665.xml
01/17/1732619.xml
01/17/1732741.xml
01/17/1732733.xml
01/17/1732617.xml
01/17/1732610.xml
01/17/1732748.xml
01/17/1732734.xml
01/17/1732625.xml
01/17/1732659.xml
01/17/1732688.xml
01/17/1732701.xml
01/17/1732706.xml
01/17/1732622.xml
01/17/1732650.xml
01/17/1732681.xml
01/17/1732708.xml
01/17/1732774.xml
01/17/1732773.xml
01/17/1732657.xml
01/17/1732686.xml
01/17/1732762.xml
01/17/1732697.xml
01/17/1732646.xml
01/17/1732719.xml
01/17/1732690.xml
01/17/1732641.xml
01/17/1732765.xml
01/17/1732717.xml
01/17/1732633.xml
01/17/1732634.xml
01/17/1732710.xml
01/17/1732699.xml
01/17/1732648.xml
01/17/1732759.xml
01/17/1732725.xml
01/17/1732722.xml
01/17/1732674.xml
01/17/1732750.xml
01/17/1732608.xml
01/17/1732757.xml
01/17/1732673.xml
01/17/1732772.xml
01/17/1732656.xml
01/17/1732687.xml
01/17/1732651.xml
01/17/1732680.xml
01/17/1732709.xml
01/17/1732775.xml
01/17/1732707.xml
01/17/1732623.xml
01/17/1732624.xml
01/17/1732658.xml
01/17/1732

In [32]:
full_df = pd.concat(all_dfs, ignore_index=True)

In [35]:
full_df

Unnamed: 0,id,sections,text_raw
0,01/17/1732746.xml,Opinion,To the Editor: Thomas L. Friedman is correct i...
1,01/17/1732662.xml,Business,"Germany, after four years of stagnation, is sh..."
2,01/17/1732665.xml,Technology; Business,"MAXJET COMES TO WASHINGTON -- MAXjet, the all-..."
3,01/17/1732741.xml,Movies; Arts,Three new movies displaced last week's top dra...
4,01/17/1732733.xml,New York and Region,"Senator Hillary Rodham Clinton, speaking yeste..."
...,...,...,...
71337,12/04/1809495.xml,New York and Region,"Somewhere on Staten Island after dark, a Nissa..."
71338,12/04/1809438.xml,U.S.,"At the Tyson poultry plant here, Fred L. Mason..."
71339,12/04/1809398.xml,Technology; Business,When David A. Gross heard last month that the ...
71340,12/04/1809443.xml,U.S.; Obituaries,"Tom Fexas, the innovative motor yacht designer..."


In [36]:
full_df = clean_art_df(full_df)

In [37]:
full_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0
14,01/17/1732774.xml,World,Pope Benedict XVI visited the main synagogue i...,1,3
15,01/17/1732773.xml,World,The United Nations placed eight managers on pa...,1,3
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2
...,...,...,...,...,...
71333,12/04/1809515.xml,Sports,"Joe Paterno, whose 80th birthday is Dec. 20, s...",1,0
71334,12/04/1809512.xml,Sports,It had grown as tedious as a fellow commuter's...,1,0
71335,12/04/1809436.xml,U.S.,Republicans intend to conclude the 109th Congr...,1,2
71338,12/04/1809438.xml,U.S.,"At the Tyson poultry plant here, Fred L. Mason...",1,2


In [38]:
preprocess_corpus(full_df)

In [39]:
full_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text
8,01/17/1732688.xml,World,Russia and China affirmed Monday that Iran mus...,1,3,russia china affirmed monday iran must resume ...
10,01/17/1732706.xml,Sports,"Lauryn Williams, the reigning world champion i...",1,0,lauryn williams reigning world champion women'...
14,01/17/1732774.xml,World,Pope Benedict XVI visited the main synagogue i...,1,3,pope benedict xvi visited main synagogue rome ...
15,01/17/1732773.xml,World,The United Nations placed eight managers on pa...,1,3,united nations placed eight managers paid susp...
17,01/17/1732686.xml,U.S.,Protesters wore yellow and black armbands and ...,1,2,protesters wore yellow black armbands chanted ...
...,...,...,...,...,...,...
71333,12/04/1809515.xml,Sports,"Joe Paterno, whose 80th birthday is Dec. 20, s...",1,0,joe paterno whose 80th birthday dec 20 yesterd...
71334,12/04/1809512.xml,Sports,It had grown as tedious as a fellow commuter's...,1,0,grown tedious fellow commuter's cellphone conv...
71335,12/04/1809436.xml,U.S.,Republicans intend to conclude the 109th Congr...,1,2,republican intend conclude 109th congress week...
71338,12/04/1809438.xml,U.S.,"At the Tyson poultry plant here, Fred L. Mason...",1,2,tyson poultry plant fred l mason jr hangs live...


## cTFIDF

In [None]:
words_per_class = compute_ctfidf(art_df, secnum_to_sec)
#words_per_class

In [157]:
wash_df = full_df[full_df['text'].str.contains("washington") & full_df['text'].str.contains("officials")].copy()

In [158]:
wash_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text
133,01/17/1732730.xml,U.S.,Two leading civil rights groups plan to file l...,1,2,two leading civil rights groups plan file laws...
175,01/25/1734699.xml,U.S.,The Army has issued new regulations for carryi...,1,2,army issued regulations carrying military exec...
185,01/25/1734563.xml,U.S.,Ramping up the administration's defense of its...,1,2,ramping administration's defense domestic eave...
287,01/25/1734552.xml,World,An inquiry by the Council of Europe into alleg...,1,3,inquiry council europe allegations cia operate...
339,01/04/1729474.xml,U.S.,"As a high-flying Republican lobbyist, Jack Abr...",1,2,high flying republican lobbyist jack abramoff ...
...,...,...,...,...,...,...
70598,12/10/1811026.xml,World,Nuclear regulators in Washington and abroad ar...,1,3,nuclear regulators washington abroad studying ...
70686,12/10/1811044.xml,U.S.,Life was different in Unit E at the state pris...,1,2,life different unit e state prison outside new...
70808,12/17/1812654.xml,U.S.,"For the last decade, residents of northern Mon...",1,2,last decade residents northern montana praying...
71127,12/25/1814345.xml,U.S.,"Democracy is hard work, the adage goes, but it...",1,2,democracy hard work adage goes harder places o...


In [160]:
wash_text = extract_text(wash_df)

In [161]:
pyperclip.copy(wash_text)