## Labeling

This notebook contains code that exports data for manual labeling.

In [1]:
import pandas as pd

In [2]:
# Load the preprocessed article metadata.
metadata = pd.read_json("../data/raw/210119_en_deter_preprocessed.json")

# Drop rows whose `fulltext` starts with the "Not available" placeholder and renumber
# the index from 0. `na=False` treats a missing `fulltext` as "no match"; without it a
# single null leaves NaN in the mask and the `~` negation / boolean indexing fails.
# The pattern is intentionally left untouched (its trailing "." is still a regex
# wildcard), so the set of dropped rows does not change for non-null text.
metadata = metadata.loc[
    ~metadata["fulltext"].str.contains("^Not available.", na=False)
].reset_index(drop=True)

# Table merged onto `metadata` by row index further down.
# NOTE(review): presumably one row per remaining document, in the same order — confirm.
embeddings = pd.read_csv("../data/processed/deciding-on-topic-reduction.csv")

In [3]:
# Guard for the index-based join below: both tables must have the same number of rows.
assert len(metadata.index) == len(embeddings.index)

# Metadata columns that are left out of the merged frame.
redundant_cols = ["doc_id", "link", "onestring", "threshold", "pub_title", "lang"]

# Left join on the row index: every metadata row is kept and matched to the
# `embeddings` row carrying the same index label.
df = metadata.drop(columns=redundant_cols).merge(
    embeddings,
    how="left",
    left_index=True,
    right_index=True,
)

In [4]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,title,author,date,fulltext,x,y,num_label_2,word_label_2,score_2,num_label_3,...,score_17,num_label_18,word_label_18,score_18,num_label_19,word_label_19,score_19,num_label_20,word_label_20,score_20
0,WORLD IN BRIEF Copters rescue ice fishermen,- From news services,1997-01-01,Helicopters rescued scores of people ice-fis...,8.063369,-0.291642,0,"pyongyang, hardly, surely, scarcely, inevitably",0.216458,0,...,0.321827,6,"republicans, republican, gop, democrat, democrats",0.319669,6,"republicans, republican, gop, democrat, democrats",0.322677,7,"republicans, republican, gop, democrat, democrats",0.320961
1,WORLD IN BRIEF Swiss official pressured to quit,- From news services,1997-01-01,Switzerland's largest political party on Sat...,8.119796,-0.541421,0,"pyongyang, hardly, surely, scarcely, inevitably",0.251642,0,...,0.304642,6,"republicans, republican, gop, democrat, democrats",0.303649,6,"republicans, republican, gop, democrat, democrats",0.305321,7,"republicans, republican, gop, democrat, democrats",0.305281
2,How to boost the Western response to Russian h...,"Юркова, Ольга",2018-01-01,"By Jakub Janda, for European View Abstract The...",10.569654,5.105814,0,"pyongyang, hardly, surely, scarcely, inevitably",0.150307,0,...,0.355095,8,"nato, russia, enlargement, russian, moscow",0.355415,7,"nato, russia, enlargement, russian, moscow",0.355408,6,"nato, russia, enlargement, russian, moscow",0.356287
3,Kremlin Watch Briefing: Does Putin underestima...,"Юркова, Ольга",2018-01-01,Topics of the Week Head of MI6: Putin should ...,11.484039,5.563009,0,"pyongyang, hardly, surely, scarcely, inevitably",0.189021,0,...,0.265259,8,"nato, russia, enlargement, russian, moscow",0.265297,7,"nato, russia, enlargement, russian, moscow",0.264855,6,"nato, russia, enlargement, russian, moscow",0.265003
4,Understanding and Combating Russian and Chines...,"Юркова, Ольга",2019-01-01,A motorcade car adorned with Chinese and Russi...,11.507264,5.589423,1,"airmen, warfighter, dod, cyber, sustainment",0.199532,1,...,0.321802,13,"cyber, malicious, cyberattacks, cybersecurity,...",0.321942,13,"cyber, malicious, cyberattacks, cybersecurity,...",0.320756,13,"cyber, malicious, cyberattacks, cybersecurity,...",0.31959


In [5]:
# Persist the merged frame; the row index is not written to the file.
df.to_csv(
    path_or_buf="../data/processed/metadata_embeddings.csv",
    index=False,
)

In [6]:
# Narrow the merged frame to the x/y columns, the article fields and the three
# columns suffixed `_20`, in this column order.
top20 = df.loc[
    :,
    [
        "x",
        "y",
        "title",
        "author",
        "date",
        "fulltext",
        "num_label_20",
        "word_label_20",
        "score_20",
    ],
]

In [7]:
# Write the 20-topic subset to disk without the row index.
top20.to_csv(
    path_or_buf="../data/processed/metadata_embeddings_20-topics.csv",
    index=False,
)

In [8]:
# For every `num_label_20` group keep the 25 largest `score_20` values.
# The resulting Series is indexed by (group label, original row label).
top_docs = (
    top20
    .groupby("num_label_20")["score_20"]
    .nlargest(n=25)
)

In [9]:
# Level 1 of the `top_docs` index carries the row labels of `top20`; use them to
# pull the complete rows of the selected documents and export them for labeling.
top20.loc[top_docs.index.get_level_values(1)].to_csv(
    path_or_buf="../data/processed/labeling-20-topics-25-docs.csv",
    index=False,
)

---

In [10]:
# Human-readable name for each `num_label_20` topic id. The position in the list is
# the topic id (repeated in the trailing comment for readability).
# NOTE(review): the meaning of the `#!` markers is not documented in this notebook —
# presumably they flag topics to revisit; confirm with the author.
LABELS = dict(enumerate([
    "Unidentifiable",                                               # 0
    "North Korea nuclear issues",                                   # 1
    "India-Pakistani nuclear tensions I",                           # 2  #!
    "Middle East nuclear issues",                                   # 3
    "Russia's nuclear program",                                     # 4
    "Deterrence in the context of the size and reduction of WMD",   # 5
    "US/NATO rivalry with Russia",                                  # 6
    "American internal discourse on deterrence",                    # 7
    "West Europe nuclear issues in the late Cold War context",      # 8
    "North Korea nuclear issues II",                                # 9  #!
    "Nuclear disarmament and non-proliferation",                    # 10
    "Role of the US Air Force in deterrence I",                     # 11 #!
    "India-Pakistani nuclear tensions II",                          # 12
    "Cyberdeterrence",                                              # 13
    "UK nuclear issues",                                            # 14
    "U.S. MPs involvement in defense policy",                       # 15
    "Antiballistic/missile system and deterrence",                  # 16
    "Private companies in nuclear policy",                          # 17
    "Deterrence of China",                                          # 18
    "Role of the US Air Force in deterrence II",                    # 19 #!
]))

In [11]:
# Translate each numeric topic id into its name from `LABELS`;
# ids without an entry in the mapping become NaN.
top20 = top20.assign(labels=top20["num_label_20"].map(LABELS))

In [14]:
# Merge the "I" / "II" variants of a topic into one label by removing the trailing
# Roman numeral. The numeral has to be preceded by whitespace, so a label that merely
# ends in a capital "I" is left intact (the earlier pattern "I$|II$" would clip it).
# For the labels currently in `LABELS` the result is the same as before.
top20["tweaked_labels"] = (
    top20["labels"]
    .str.replace(r"\s+(?:I|II)$", "", regex=True)
    .str.strip()
)

In [15]:
# Number of rows per merged topic label, most frequent first.
top20.loc[:, "tweaked_labels"].value_counts()

North Korea nuclear issues                                    3148
India-Pakistani nuclear tensions                              2745
Unidentifiable                                                2740
Role of the US Air Force in deterrence                        2126
Middle East nuclear issues                                    1490
Russia's nuclear program                                      1426
Deterrence in the context of the size and reduction of WMD    1376
US/NATO rivalry with Russia                                   1322
American internal discourse on deterrence                     1303
West Europe nuclear issues in the late Cold War context       1275
Nuclear disarmament and non-proliferation                     1184
Cyberdeterrence                                               1159
UK nuclear issues                                             1098
U.S. MPs involvement in defense policy                        1069
Antiballistic/missile system and deterrence                   

In [18]:
# Export the labeled table without the `fulltext` column.
# `columns=` replaces the positional `axis` argument (`drop("fulltext", 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
top20.drop(columns="fulltext").to_csv(
    "../data/processed/labeled-dataset.csv",
    index=False,
)