In [1]:
from collections import Counter
import json

import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from ctfidf import CTFIDFVectorizer

In [2]:
stopwords_en = nltk.corpus.stopwords.words('english')
#stopwords_en
stopwords_custom = ["mr","ms","mrs","said"]
stopwords_en = stopwords_en + stopwords_custom

In [3]:
api_url = "https://api.nytimes.com/svc/archive/v1/2022/5.json?api-key=[YOUR_API_KEY]"

In [4]:
def get_argmax(row):
    max_count = 0
    argmax = None
    #print(row)
    for row_key, row_val in row.iteritems():
        if row_val > max_count:
            max_count = row_val
            argmax = row_key
    return argmax

### Load data

In [5]:
json_fpath = "./nyt_2022-05.json"

In [6]:
with open(json_fpath, 'r', encoding='utf-8') as infile:
    nyt_data = json.load(infile)

In [7]:
nyt_data['response'].keys()

dict_keys(['docs', 'meta'])

In [8]:
docs = nyt_data['response']['docs']

In [9]:
docs[0]['snippet']

'A police chief said that victims’ bodies were discovered in a forest, and showed signs of mutilation.'

In [10]:
docs[0]['lead_paragraph']

'The bodies of three men were found in a pit near Bucha, a few miles west of Kyiv, Ukraine’s capital, with signs that the civilians had been “tortured to death” by Russian troops, the regional police chief said on Saturday.'

In [11]:
docs[0]['section_name']

'World'

In [12]:
doc_data = [(doc['section_name'], doc['snippet'] + " " + doc['lead_paragraph']) for doc in docs]

In [13]:
doc_data[:5]

[('World',
  'A police chief said that victims’ bodies were discovered in a forest, and showed signs of mutilation. The bodies of three men were found in a pit near Bucha, a few miles west of Kyiv, Ukraine’s capital, with signs that the civilians had been “tortured to death” by Russian troops, the regional police chief said on Saturday.'),
 ('Homepage',
  'The war’s worst atrocities have been discovered in this little Ukrainian town not far from the capital of Kyiv. The war’s worst atrocities have been discovered in this little Ukrainian town not far from the capital of Kyiv.'),
 ('Sports',
  'Paul Salata made a point of celebrating the last player selected in the draft. His family carried on the tradition by honoring the 262nd pick on Saturday. LAS VEGAS — Only the hardiest of N.F.L. fans stick around to watch the end of the third and final day of the draft. To Paul Salata, it was pure joy.'),
 ('Sports',
  'The search for elite players is so competitive that IMG, the agency that once

In [14]:
# assign a number to each section
section_counts = Counter([d[0] for d in doc_data])

In [15]:
sorted(section_counts.items(), key=lambda x: x[1], reverse=True)

[('U.S.', 849),
 ('World', 466),
 ('Opinion', 316),
 ('Arts', 300),
 ('Business Day', 291),
 ('Sports', 223),
 ('New York', 193),
 ('Books', 164),
 ('Briefing', 136),
 ('Style', 128),
 ('Movies', 119),
 ('The Learning Network', 114),
 ('Food', 112),
 ('Real Estate', 100),
 ('Crosswords & Games', 74),
 ('Theater', 72),
 ('Technology', 63),
 ('Science', 55),
 ('Well', 48),
 ('Podcasts', 46),
 ('Magazine', 43),
 ('Health', 42),
 ('Climate', 37),
 ('Today’s Paper', 31),
 ('Corrections', 30),
 ('Travel', 30),
 ('T Magazine', 27),
 ('Times Insider', 13),
 ('The Upshot', 9),
 ('Your Money', 8),
 ('Fashion & Style', 8),
 ('Reader Center', 6),
 ('Headway', 5),
 ('Smarter Living', 4),
 ('Obituaries', 4),
 ('Special Series', 3),
 ('en Español', 3),
 ('Homepage', 2),
 ('Burst', 2),
 ('T Brand', 1),
 ('The New York Times Presents', 1),
 ('International Home', 1)]

### Choose relevant sections

In [16]:
# Keep only the most relevant sections
key_sections = [
    "U.S.","World","Arts","Business Day","Sports",
    "Books","Movies","Food","Real Estate","Technology"
]
classnum_to_sec = {i: key_sections[i] for i in range(len(key_sections))}

In [17]:
key_data = [d for d in doc_data if d[0] in key_sections]

In [18]:
# Also filter too-short articles
key_data = [d for d in key_data if len(d[1]) > 20]
#len(key_data)

In [19]:
# And assign numbers
data_rows = [{'section': d[0], 'text_raw': d[1], 'sec_num': key_sections.index(d[0])} for d in key_data]

In [20]:
data_df = pd.DataFrame(data_rows)
#data_df

### Cleaning

In [21]:
punct_list = '“.’”,—()'
def remove_punct(text):
    # Special case for "-", we replace it with a space
    clean_chars = [c.replace("-"," ") for c in text if c not in punct_list]
    return "".join(clean_chars)

In [22]:
plural_map = {
    "presidents": "president",
    "wars": "war",
    "games": "game", "leagues": "league", "teams": "team",
    "books": "book",
    "films": "film",
    "restaurants": "restaurant",
    "homes": "home",
    "house": "home",
    "apartments": "apartment",
    "countries": "country", "countrys": "country",
    "companies": "company", "companys": "company",
    "fashionable": "fashion",
    "states": "state",
    "republicans": "republican",
    "readers": "read", "reading": "read",
    "senators": "senator",
}
def map_plurals(text):
    text_tokens = text.split()
    clean_tokens = [(plural_map[t] if t in plural_map else t) for t in text_tokens]
    return " ".join(clean_tokens)

In [23]:
def remove_stopwords(text):
    text_tokens = text.split()
    clean_tokens = [t for t in text_tokens if t not in stopwords_en]
    return " ".join(clean_tokens)

In [24]:
def preprocess_corpus(data_df):
    data_df['text'] = data_df['text_raw'].str.lower()
    data_df['text'] = data_df['text'].apply(remove_punct)
    data_df['text'] = data_df['text'].apply(remove_stopwords)
    data_df['text'] = data_df['text'].apply(map_plurals)

### Now cTFIDF

In [25]:
def compute_ctfidf(data_df, classnum_to_sec):
    doc_class_df = data_df.groupby(['sec_num'], as_index=False).agg({'text': ' '.join})
    # Create bag of words
    ngrams = 1
    #min_df = 0.4
    #min_df = 0.24
    min_df = 0.1
    count_vectorizer = CountVectorizer(ngram_range=(1,ngrams), min_df=min_df)
    #count_vectorizer = CountVectorizer(ngram_range=(1,ngrams))
    count_vectorizer = count_vectorizer.fit(doc_class_df['text'])
    count = count_vectorizer.transform(doc_class_df['text'])
    words = count_vectorizer.get_feature_names_out()
    # Extract top 10 words
    ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(data_df)).toarray()
    num_top_words = 50
    words_per_class = {classnum_to_sec[label]: list(reversed([words[index] for index in ctfidf[label].argsort()[-num_top_words:]]))
                       for label in doc_class_df['sec_num']}
    return words_per_class

In [27]:
data_df

Unnamed: 0,section,text_raw,sec_num
0,World,A police chief said that victims’ bodies were ...,1
1,Sports,Paul Salata made a point of celebrating the la...,4
2,Sports,The search for elite players is so competitive...,4
3,Arts,"The soprano Liudmyla Monastyrska, replacing th...",2
4,U.S.,"Amid concerns about Covid, President Biden off...",0
...,...,...,...
2682,Arts,The Ukrainian city’s distinctive architecture ...,2
2683,Sports,Zverev will face either Rafael Nadal or Novak ...,4
2684,Books,Tom Perrotta’s sequel to “Election” finds Flic...,5
2685,Arts,"The company’s spring season, a double whammy o...",2


In [26]:
words_per_class = compute_ctfidf(data_df, classnum_to_sec)
words_per_class

KeyError: "Column(s) ['text'] do not exist"

### cTFIDF Results

In [1543]:
def print_ctfidf(words_per_class):
    for cur_class in words_per_class.keys():
        print(f"=====[ {cur_class} ]=====")
        cur_class_words = words_per_class[cur_class]
        #print(cur_class_words)
        #print([w for w in cur_class_words if w.endswith("s")])
        print([w for w in cur_class_words])
        #joblib.dump(cur_class_words, f"./indic_words/{cur_class}.pkl")
print_ctfidf(words_per_class)

=====[ Sports ]=====
['game', 'said', 'team', 'season', 'first', 'coach', 'last', 'year', 'one', 'would', 'play', 'two', 'players', 'league', 'points', 'time', 'three', 'football', 'like', 'back', 'we', 'giants', 'he', 'new', 'could', 'it', 'bowl', 'get', 'going', 'second', 'years', 'state', 'left', 'player', 'field', 'made', 'that', 'patriots', 'manning', 'good', 'knicks', 'colts', '10', 'think', 'night', 'quarterback', 'point', 'four', 'way', 'victory']
=====[ Arts ]=====
['new', 'music', 'show', 'like', 'one', 'art', 'pm', 'said', 'work', 'two', 'the', 'first', 'york', 'also', 'museum', 'series', 'american', 'time', 'made', 'would', 'dance', 'years', 'songs', 'well', 'million', 'theater', 'film', 'works', 'life', 'street', 'ballet', 'last', 'way', 'company', 'album', 'three', 'year', 'band', 'night', 'much', 'west', '10', 'it', 'orchestra', 'played', 'even', 'review', 'performance', 'another', 'world']
=====[ U.S. ]=====
['said', 'new', 'state', 'would', 'people', 'one', 'court', 'c

### Define Keywords

In [1692]:
keywords = {
    #"U.S." : ["washington", "state", "republican"],
    #"U.S.": ["court","federal","state"],
    "U.S.": ["president","court","republican"],
    #"World": ["war", "country", "military"],
    "World": ["government","country","officials"],
    "Arts": ["music", "art", "artist"],
    "Business Day": ["economy", "investors", "inflation"],
    "Sports": ["game", "league", "team"],
    "Books": ["book", "novel", "read"],
    #"Style": ["fashion", "brand"],
    "Movies": ["film", "movie", "director"],
    "Food": ["restaurant", "food", "kitchen"],
    "Real Estate": ["home", "apartment", "available"],
    "Technology": ["digital", "online", "tech"],
}

In [1693]:
# Get keywords in LaTeX form
for cur_section, cur_kw_list in keywords.items():
    tex_list = ["\\texttt{" + kw + "}" for kw in cur_kw_list]
    kw_str = ", ".join(tex_list)
    print(f"{cur_section} & {kw_str} \\\\")

U.S. & \texttt{president}, \texttt{court}, \texttt{republican} \\
World & \texttt{government}, \texttt{country}, \texttt{officials} \\
Arts & \texttt{music}, \texttt{art}, \texttt{artist} \\
Business Day & \texttt{economy}, \texttt{investors}, \texttt{inflation} \\
Sports & \texttt{game}, \texttt{league}, \texttt{team} \\
Books & \texttt{book}, \texttt{novel}, \texttt{read} \\
Movies & \texttt{film}, \texttt{movie}, \texttt{director} \\
Food & \texttt{restaurant}, \texttt{food}, \texttt{kitchen} \\
Real Estate & \texttt{home}, \texttt{apartment}, \texttt{available} \\
Technology & \texttt{digital}, \texttt{online}, \texttt{tech} \\


In [1694]:
data_df['text_tokens'] = data_df['text'].str.split()

KeyError: 'text'

In [1695]:
data_df

Unnamed: 0,section,text_raw,sec_num
0,World,A police chief said that victims’ bodies were ...,1
1,Sports,Paul Salata made a point of celebrating the la...,4
2,Sports,The search for elite players is so competitive...,4
3,Arts,"The soprano Liudmyla Monastyrska, replacing th...",2
4,U.S.,"Amid concerns about Covid, President Biden off...",0
...,...,...,...
2682,Arts,The Ukrainian city’s distinctive architecture ...,2
2683,Sports,Zverev will face either Rafael Nadal or Novak ...,4
2684,Books,Tom Perrotta’s sequel to “Election” finds Flic...,5
2685,Arts,"The company’s spring season, a double whammy o...",2


In [1696]:
def get_token_counts(token_list, section):
    section_keywords = keywords[section]
    total_count = 0
    for cur_sec_kw in section_keywords:
        total_count += token_list.count(cur_sec_kw)
    return total_count

In [1697]:
for section, sec_keywords in keywords.items():
    get_counts_partial = lambda token_list: get_token_counts(token_list, section)
    data_df[section] = data_df['text_tokens'].apply(get_counts_partial)

KeyError: 'text_tokens'

In [1698]:
data_df

Unnamed: 0,section,text_raw,sec_num
0,World,A police chief said that victims’ bodies were ...,1
1,Sports,Paul Salata made a point of celebrating the la...,4
2,Sports,The search for elite players is so competitive...,4
3,Arts,"The soprano Liudmyla Monastyrska, replacing th...",2
4,U.S.,"Amid concerns about Covid, President Biden off...",0
...,...,...,...
2682,Arts,The Ukrainian city’s distinctive architecture ...,2
2683,Sports,Zverev will face either Rafael Nadal or Novak ...,4
2684,Books,Tom Perrotta’s sequel to “Election” finds Flic...,5
2685,Arts,"The company’s spring season, a double whammy o...",2


In [1699]:
cols_to_keep = ["section","text"] + key_sections
data_df[:5][cols_to_keep]

KeyError: "['text', 'U.S.', 'World', 'Arts', 'Business Day', 'Sports', 'Books', 'Movies', 'Food', 'Real Estate', 'Technology'] not in index"

In [1700]:
data_df['predicted'] = data_df[key_sections].apply(get_argmax, axis=1)

KeyError: "None of [Index(['U.S.', 'World', 'Arts', 'Business Day', 'Sports', 'Books', 'Movies',\n       'Food', 'Real Estate', 'Technology'],\n      dtype='object')] are in the [columns]"

In [1701]:
data_df

Unnamed: 0,section,text_raw,sec_num
0,World,A police chief said that victims’ bodies were ...,1
1,Sports,Paul Salata made a point of celebrating the la...,4
2,Sports,The search for elite players is so competitive...,4
3,Arts,"The soprano Liudmyla Monastyrska, replacing th...",2
4,U.S.,"Amid concerns about Covid, President Biden off...",0
...,...,...,...
2682,Arts,The Ukrainian city’s distinctive architecture ...,2
2683,Sports,Zverev will face either Rafael Nadal or Novak ...,4
2684,Books,Tom Perrotta’s sequel to “Election” finds Flic...,5
2685,Arts,"The company’s spring season, a double whammy o...",2


In [1702]:
data_df['section'].value_counts()

U.S.            849
World           466
Arts            300
Business Day    291
Sports          223
Books           164
Movies          119
Food            112
Real Estate     100
Technology       63
Name: section, dtype: int64

In [1703]:
predict_df = data_df[~pd.isna(data_df['predicted'])].copy()

KeyError: 'predicted'

In [1704]:
predict_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Business Day,Sports,Books,Movies,Food,Real Estate,Technology,predicted,correct
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0,marquette point guard dominic james wore dark ...,"[marquette, point, guard, dominic, james, wore...",0,1,0,0,14,0,0,0,3,0,Sports,True
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1,even absence grateful dead pack hall saturday ...,"[even, absence, grateful, dead, pack, hall, sa...",0,1,2,0,0,0,0,0,0,0,Arts,True
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3,damaged listing cargo ship spilling fuel cargo...,"[damaged, listing, cargo, ship, spilling, fuel...",0,5,0,0,1,0,0,0,0,0,World,True
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0,chicago bears defensive end adewale ogunleye w...,"[chicago, bears, defensive, end, adewale, ogun...",0,0,0,0,3,0,0,0,0,0,Sports,True
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2,nine years began serving life sentence unabomb...,"[nine, years, began, serving, life, sentence, ...",14,11,0,0,0,2,0,0,2,0,U.S.,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600,01/18/1819742.xml,Sports,"It is halfway through the season, and just the...",1,0,halfway season right time struggling team rein...,"[halfway, season, right, time, struggling, tea...",4,0,0,0,7,0,0,0,0,0,Sports,True
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0,telephone calls keep coming mets general manag...,"[telephone, calls, keep, coming, mets, general...",0,1,0,0,7,0,0,0,3,0,Sports,True
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1,stravinsky arguably important composer dance 2...,"[stravinsky, arguably, important, composer, da...",3,0,3,0,0,0,0,0,0,0,Arts,True
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0,casey powell made big mistake bringing wife ne...,"[casey, powell, made, big, mistake, bringing, ...",0,0,0,0,17,0,0,0,1,0,Sports,True


In [1705]:
predict_df['correct'] = predict_df['section'] == predict_df['predicted']

KeyError: 'section'

In [1706]:
predict_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Business Day,Sports,Books,Movies,Food,Real Estate,Technology,predicted,correct
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0,marquette point guard dominic james wore dark ...,"[marquette, point, guard, dominic, james, wore...",0,1,0,0,14,0,0,0,3,0,Sports,True
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1,even absence grateful dead pack hall saturday ...,"[even, absence, grateful, dead, pack, hall, sa...",0,1,2,0,0,0,0,0,0,0,Arts,True
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3,damaged listing cargo ship spilling fuel cargo...,"[damaged, listing, cargo, ship, spilling, fuel...",0,5,0,0,1,0,0,0,0,0,World,True
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0,chicago bears defensive end adewale ogunleye w...,"[chicago, bears, defensive, end, adewale, ogun...",0,0,0,0,3,0,0,0,0,0,Sports,True
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2,nine years began serving life sentence unabomb...,"[nine, years, began, serving, life, sentence, ...",14,11,0,0,0,2,0,0,2,0,U.S.,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600,01/18/1819742.xml,Sports,"It is halfway through the season, and just the...",1,0,halfway season right time struggling team rein...,"[halfway, season, right, time, struggling, tea...",4,0,0,0,7,0,0,0,0,0,Sports,True
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0,telephone calls keep coming mets general manag...,"[telephone, calls, keep, coming, mets, general...",0,1,0,0,7,0,0,0,3,0,Sports,True
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1,stravinsky arguably important composer dance 2...,"[stravinsky, arguably, important, composer, da...",3,0,3,0,0,0,0,0,0,0,Arts,True
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0,casey powell made big mistake bringing wife ne...,"[casey, powell, made, big, mistake, bringing, ...",0,0,0,0,17,0,0,0,1,0,Sports,True


In [1707]:
correct_counts = predict_df['correct'].value_counts()

In [1708]:
correct_counts

True     1050
False     361
Name: correct, dtype: int64

In [1709]:
correct_counts[True] / (correct_counts[True]+correct_counts[False])

0.744153082919915

In [1710]:
pd.crosstab(predict_df['correct'], predict_df['section'], normalize="columns")

KeyError: 'section'

## Now the full articles (from the LDC corpus)

In [1711]:
import glob
import os
import tarfile

import xmltodict

In [1712]:
ldc_path = "./nyt_corpus/data/2007/"

In [1713]:
os.listdir(ldc_path)

['01', '01.tgz', '02.tgz', '03.tgz', '04.tgz', '05.tgz', '06.tgz']

In [1714]:
ldc_fpath = os.path.join(ldc_path, "01.tgz")

In [1715]:
tar = tarfile.open(ldc_fpath, "r:gz")

In [1716]:
def parse_content(content):
    content_str = content.decode("utf-8")
    content_dict = xmltodict.parse(content_str)
    return content_dict['nitf']

In [1717]:
def get_dict_item(item_list, key, val, other_key):
    # Helper fn
    results = [item for item in item_list if item[key] == val]
    if len(results) == 0:
        return ""
    return results[0][other_key]

In [1718]:
content_rows = []
for member in tar.getmembers():
    #print(tar.getnames())
    f = tar.extractfile(member)
    if f is not None:
        #print(dir(member))
        print(member.name)
        content = f.read()
        content_dict = parse_content(content)
        sections = get_dict_item(content_dict['head']['meta'], "@name", "online_sections", "@content")
        if sections == "Paid Death Notices":
            continue
        if "Corrections" in sections:
            continue
        body_content = content_dict['body']['body.content']
        if body_content is None:
            continue
        body_block = body_content['block']
        if type(body_block) == list:
            paras = get_dict_item(body_block, "@class", "full_text", "p")
            if type(paras) == list:
                full_text = " ".join(paras)
            else:
                full_text = paras
        else:
            # It's just one element, a dict
            paras = body_block['p']
        content_rows.append({'id':member.name, 'sections':sections, 'text_raw':full_text})
        #break

01/22/1820620.xml
01/22/1820704.xml
01/22/1820703.xml
01/22/1820627.xml
01/22/1820684.xml
01/22/1820655.xml
01/22/1820629.xml
01/22/1820683.xml
01/22/1820652.xml
01/22/1820743.xml
01/22/1820667.xml
01/22/1820738.xml
01/22/1820660.xml
01/22/1820744.xml
01/22/1820736.xml
01/22/1820612.xml
01/22/1820615.xml
01/22/1820731.xml
01/22/1820669.xml
01/22/1820604.xml
01/22/1820678.xml
01/22/1820720.xml
01/22/1820727.xml
01/22/1820603.xml
01/22/1820671.xml
01/22/1820729.xml
01/22/1820755.xml
01/22/1820752.xml
01/22/1820676.xml
01/22/1820643.xml
01/22/1820692.xml
01/22/1820644.xml
01/22/1820695.xml
01/22/1820638.xml
01/22/1820712.xml
01/22/1820636.xml
01/22/1820631.xml
01/22/1820715.xml
01/22/1820614.xml
01/22/1820730.xml
01/22/1820668.xml
01/22/1820737.xml
01/22/1820613.xml
01/22/1820739.xml
01/22/1820661.xml
01/22/1820745.xml
01/22/1820742.xml
01/22/1820666.xml
01/22/1820682.xml
01/22/1820653.xml
01/22/1820595.xml
01/22/1820685.xml
01/22/1820654.xml
01/22/1820628.xml
01/22/1820702.xml
01/22/1820

In [1719]:
art_df = pd.DataFrame(content_rows)

In [1720]:
art_df['sections'] = art_df['sections'].str.replace("Front Page; ","")

In [1721]:
art_df['num_sections'] = art_df['sections'].apply(lambda x: len(x.split(";")))

In [1722]:
# Drop articles with multiple sections, for now
art_df = art_df[art_df['num_sections'] == 1].copy()

In [1723]:
art_df['sections'].value_counts()

New York and Region    935
Sports                 536
Business               415
Opinion                411
Arts                   410
U.S.                   259
World                  248
Style                  111
Travel                  81
Real Estate             76
Health                  50
Automobiles             46
Magazine                34
Week in Review          27
                        22
Job Market               2
Theater                  1
Front Page               1
Technology               1
Books                    1
Name: sections, dtype: int64

In [1724]:
sections_to_keep = [
    "Sports",
    #"Business",
    "Arts", "U.S.", "World", "Real Estate"
]
secnum_to_sec = {i: sections_to_keep[i] for i in range(len(sections_to_keep))}

In [1725]:
art_df = art_df[art_df['sections'].isin(sections_to_keep)].copy()

In [1726]:
art_df['sec_num'] = art_df['sections'].apply(lambda x: sections_to_keep.index(x))

In [1727]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2
...,...,...,...,...,...
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1
5615,01/18/1819592.xml,Arts,When is bridge like poker? When bluffing is in...,1,1
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0


In [1728]:
preprocess_corpus(art_df)

### cTFIDF

In [1729]:
words_per_class = compute_ctfidf(art_df, secnum_to_sec)
#words_per_class

In [1730]:
print_ctfidf(words_per_class)

=====[ Sports ]=====
['game', 'said', 'team', 'season', 'first', 'coach', 'last', 'year', 'one', 'would', 'play', 'two', 'players', 'league', 'points', 'time', 'three', 'football', 'like', 'back', 'we', 'giants', 'he', 'new', 'could', 'it', 'bowl', 'get', 'going', 'second', 'years', 'state', 'left', 'player', 'field', 'made', 'that', 'patriots', 'manning', 'good', 'knicks', 'colts', '10', 'think', 'night', 'quarterback', 'point', 'four', 'way', 'victory']
=====[ Arts ]=====
['new', 'music', 'show', 'like', 'one', 'art', 'pm', 'said', 'work', 'two', 'the', 'first', 'york', 'also', 'museum', 'series', 'american', 'time', 'made', 'would', 'dance', 'years', 'songs', 'well', 'million', 'theater', 'film', 'works', 'life', 'street', 'ballet', 'last', 'way', 'company', 'album', 'three', 'year', 'band', 'night', 'much', 'west', '10', 'it', 'orchestra', 'played', 'even', 'review', 'performance', 'another', 'world']
=====[ U.S. ]=====
['said', 'new', 'state', 'would', 'people', 'one', 'court', 'c

In [1731]:
keywords_sub = {k: v for k, v in keywords.items() if k in sections_to_keep}

In [1732]:
keywords_sub

{'U.S.': ['president', 'court', 'republican'],
 'World': ['government', 'country', 'officials'],
 'Arts': ['music', 'art', 'artist'],
 'Sports': ['game', 'league', 'team'],
 'Real Estate': ['home', 'apartment', 'available']}

In [1733]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0,marquette point guard dominic james wore dark ...
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1,even absence grateful dead pack hall saturday ...
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3,damaged listing cargo ship spilling fuel cargo...
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0,chicago bears defensive end adewale ogunleye w...
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2,nine years began serving life sentence unabomb...
...,...,...,...,...,...,...
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0,telephone calls keep coming mets general manag...
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1,stravinsky arguably important composer dance 2...
5615,01/18/1819592.xml,Arts,When is bridge like poker? When bluffing is in...,1,1,bridge like poker? bluffing involved poker per...
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0,casey powell made big mistake bringing wife ne...


In [1734]:
art_df['text_tokens'] = art_df['text'].str.split()

In [1735]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0,marquette point guard dominic james wore dark ...,"[marquette, point, guard, dominic, james, wore..."
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1,even absence grateful dead pack hall saturday ...,"[even, absence, grateful, dead, pack, hall, sa..."
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3,damaged listing cargo ship spilling fuel cargo...,"[damaged, listing, cargo, ship, spilling, fuel..."
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0,chicago bears defensive end adewale ogunleye w...,"[chicago, bears, defensive, end, adewale, ogun..."
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2,nine years began serving life sentence unabomb...,"[nine, years, began, serving, life, sentence, ..."
...,...,...,...,...,...,...,...
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0,telephone calls keep coming mets general manag...,"[telephone, calls, keep, coming, mets, general..."
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1,stravinsky arguably important composer dance 2...,"[stravinsky, arguably, important, composer, da..."
5615,01/18/1819592.xml,Arts,When is bridge like poker? When bluffing is in...,1,1,bridge like poker? bluffing involved poker per...,"[bridge, like, poker?, bluffing, involved, pok..."
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0,casey powell made big mistake bringing wife ne...,"[casey, powell, made, big, mistake, bringing, ..."


In [1736]:
for section, sec_keywords in keywords.items():
    get_counts_partial = lambda token_list: get_token_counts(token_list, section)
    art_df[section] = art_df['text_tokens'].apply(get_counts_partial)

In [1737]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Business Day,Sports,Books,Movies,Food,Real Estate,Technology
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0,marquette point guard dominic james wore dark ...,"[marquette, point, guard, dominic, james, wore...",0,1,0,0,14,0,0,0,3,0
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1,even absence grateful dead pack hall saturday ...,"[even, absence, grateful, dead, pack, hall, sa...",0,1,2,0,0,0,0,0,0,0
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3,damaged listing cargo ship spilling fuel cargo...,"[damaged, listing, cargo, ship, spilling, fuel...",0,5,0,0,1,0,0,0,0,0
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0,chicago bears defensive end adewale ogunleye w...,"[chicago, bears, defensive, end, adewale, ogun...",0,0,0,0,3,0,0,0,0,0
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2,nine years began serving life sentence unabomb...,"[nine, years, began, serving, life, sentence, ...",11,11,0,0,0,2,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0,telephone calls keep coming mets general manag...,"[telephone, calls, keep, coming, mets, general...",0,1,0,0,7,0,0,0,3,0
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1,stravinsky arguably important composer dance 2...,"[stravinsky, arguably, important, composer, da...",1,0,3,0,0,0,0,0,0,0
5615,01/18/1819592.xml,Arts,When is bridge like poker? When bluffing is in...,1,1,bridge like poker? bluffing involved poker per...,"[bridge, like, poker?, bluffing, involved, pok...",0,0,0,0,0,0,0,0,0,0
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0,casey powell made big mistake bringing wife ne...,"[casey, powell, made, big, mistake, bringing, ...",1,0,0,0,17,0,0,0,1,0


In [1738]:
art_df['predicted'] = art_df[sections_to_keep].apply(get_argmax, axis=1)

In [1739]:
art_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Business Day,Sports,Books,Movies,Food,Real Estate,Technology,predicted
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0,marquette point guard dominic james wore dark ...,"[marquette, point, guard, dominic, james, wore...",0,1,0,0,14,0,0,0,3,0,Sports
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1,even absence grateful dead pack hall saturday ...,"[even, absence, grateful, dead, pack, hall, sa...",0,1,2,0,0,0,0,0,0,0,Arts
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3,damaged listing cargo ship spilling fuel cargo...,"[damaged, listing, cargo, ship, spilling, fuel...",0,5,0,0,1,0,0,0,0,0,World
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0,chicago bears defensive end adewale ogunleye w...,"[chicago, bears, defensive, end, adewale, ogun...",0,0,0,0,3,0,0,0,0,0,Sports
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2,nine years began serving life sentence unabomb...,"[nine, years, began, serving, life, sentence, ...",11,11,0,0,0,2,0,0,2,0,U.S.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0,telephone calls keep coming mets general manag...,"[telephone, calls, keep, coming, mets, general...",0,1,0,0,7,0,0,0,3,0,Sports
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1,stravinsky arguably important composer dance 2...,"[stravinsky, arguably, important, composer, da...",1,0,3,0,0,0,0,0,0,0,Arts
5615,01/18/1819592.xml,Arts,When is bridge like poker? When bluffing is in...,1,1,bridge like poker? bluffing involved poker per...,"[bridge, like, poker?, bluffing, involved, pok...",0,0,0,0,0,0,0,0,0,0,
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0,casey powell made big mistake bringing wife ne...,"[casey, powell, made, big, mistake, bringing, ...",1,0,0,0,17,0,0,0,1,0,Sports


In [1740]:
predict_df = art_df[~pd.isna(art_df['predicted'])].copy()

In [1741]:
predict_df

Unnamed: 0,id,sections,text_raw,num_sections,sec_num,text,text_tokens,U.S.,World,Arts,Business Day,Sports,Books,Movies,Food,Real Estate,Technology,predicted
4,01/22/1820684.xml,Sports,Marquette point guard Dominic James wore dark ...,1,0,marquette point guard dominic james wore dark ...,"[marquette, point, guard, dominic, james, wore...",0,1,0,0,14,0,0,0,3,0,Sports
5,01/22/1820655.xml,Arts,"Even in their absence, the Grateful Dead can p...",1,1,even absence grateful dead pack hall saturday ...,"[even, absence, grateful, dead, pack, hall, sa...",0,1,2,0,0,0,0,0,0,0,Arts
7,01/22/1820683.xml,World,A damaged and listing cargo ship was spilling ...,1,3,damaged listing cargo ship spilling fuel cargo...,"[damaged, listing, cargo, ship, spilling, fuel...",0,5,0,0,1,0,0,0,0,0,World
9,01/22/1820743.xml,Sports,As Chicago Bears defensive end Adewale Ogunley...,1,0,chicago bears defensive end adewale ogunleye w...,"[chicago, bears, defensive, end, adewale, ogun...",0,0,0,0,3,0,0,0,0,0,Sports
12,01/22/1820744.xml,U.S.,Nine years after he began serving a life sente...,1,2,nine years began serving life sentence unabomb...,"[nine, years, began, serving, life, sentence, ...",11,11,0,0,0,2,0,0,2,0,U.S.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600,01/18/1819742.xml,Sports,"It is halfway through the season, and just the...",1,0,halfway season right time struggling team rein...,"[halfway, season, right, time, struggling, tea...",2,0,0,0,7,0,0,0,0,0,Sports
5601,01/18/1819730.xml,Sports,"The telephone calls keep coming, so Mets Gener...",1,0,telephone calls keep coming mets general manag...,"[telephone, calls, keep, coming, mets, general...",0,1,0,0,7,0,0,0,3,0,Sports
5610,01/18/1819595.xml,Arts,Stravinsky was arguably the most important com...,1,1,stravinsky arguably important composer dance 2...,"[stravinsky, arguably, important, composer, da...",1,0,3,0,0,0,0,0,0,0,Arts
5616,01/18/1819745.xml,Sports,Casey Powell made a big mistake bringing his w...,1,0,casey powell made big mistake bringing wife ne...,"[casey, powell, made, big, mistake, bringing, ...",1,0,0,0,17,0,0,0,1,0,Sports


In [1742]:
predict_df['correct'] = predict_df['sections'] == predict_df['predicted']

In [1743]:
correct_counts = predict_df['correct'].value_counts()

In [1744]:
correct_counts

True     1060
False     337
Name: correct, dtype: int64

In [1745]:
correct_counts[True] / (correct_counts[True]+correct_counts[False])

0.7587687902648532

In [1746]:
pd.crosstab(predict_df['correct'], predict_df['sections'], normalize="columns")

sections,Arts,Real Estate,Sports,U.S.,World
correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.321937,0.133333,0.038986,0.561702,0.278027
True,0.678063,0.866667,0.961014,0.438298,0.721973
