# Assigning Labels to Sentence Clusters

## Imports

In [1]:
import collections
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacy import displacy

pd.set_option("display.max_rows", 600)
pd.set_option("display.max_columns", 500)
pd.set_option("max_colwidth", 400)

In [2]:
# Ask spaCy to run on the GPU when one is available
spacy.prefer_gpu()
# Pipeline shared by the parsing cells and helper functions below
nlp = spacy.load("en_core_web_md")

In [29]:
# Load the clustered sample, keeping only the headline text and its cluster number
clustered_csv = 'D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\sample_clustered99.csv'
data_clustered = pd.read_csv(clustered_csv).loc[:, ['Processed_Title', 'label_st1']]
data_clustered.sample(10)

Unnamed: 0,Processed_Title,label_st1
3398,Four new coronavirus patients in Mumbai Maha count 26,84
5203,"iPhone Now Answers When You Ask, 'Do I Have Coronavirus ? '",-1
6238,When did schools close in the UK due to coronavirus ?,68
1756,BBC Changes to The Archers during Coronavirus pandemic Media Centre,79
4500,Pompeo Claims 'Imperfect Coronavirus Data' From China Has Put US 'Behind the Curve',41
7832,Movie theaters stay open nationwide amid the coronavirus pandemic for now,-1
8090,Three coronavirus patients recover in Pakistan,83
5165,This is how coronavirus antibody testing works The Sun,45
7003,Coronavirus Israel PM Netanyahu thanks India for delivering hydroxychloroquine,-1
5213,Infant from Illinois Becomes Youngest Person in the U.S. to Die From Coronavirus,0


In [30]:
# Pull out cluster 31 as a worked example, renumbering its rows from zero
in_cluster_31 = data_clustered['label_st1'] == 31
example_category = data_clustered.loc[in_cluster_31].reset_index(drop=True)
example_category

Unnamed: 0,Processed_Title,label_st1
0,Coronavirus Hits Another U.S. Navy Ship,31
1,Royal Caribbean cancels U.S. cruises for a month due to coronavirus,31
2,Coronavirus Update: Healthy Coral Princess Passengers To Begin Disembarking,31
3,US tests stranded cruise ship passengers for coronavirus,31
4,Thousands of cruise passengers stuck on ship in Italy as Chinese couple tested for coronavirus,31
5,"Coronavirus: cruise ship passenger in Akaroa, Canterbury, being tested",31
6,Twenty one people test positive for coronavirus aboard cruise ship.,31
7,Coronavirus to slash Atlantic Canada's cruise season,31
8,Fired Navy Captain Reportedly Tests Positive For Coronavirus,31
9,Ten holidaymakers in Majorca test positive for coronavirus on island,31


In [31]:
# Parse the second headline of the example cluster and list each token's annotations:
# text, lemma, coarse POS, fine-grained tag, dependency relation, stop-word flag
example_doc = nlp(example_category['Processed_Title'].iloc[1])

print(f'{example_doc}\n')

for tok in example_doc:
    print(tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_, tok.is_stop)

Royal Caribbean cancels U.S. cruises for a month due to coronavirus

Royal royal ADJ JJ amod False
Caribbean Caribbean PROPN NNP nsubj False
cancels cancel VERB VBZ ROOT False
U.S. U.S. PROPN NNP compound False
cruises cruise NOUN NNS dobj False
for for ADP IN prep True
a a DET DT det True
month month NOUN NN pobj False
due due ADP IN amod True
to to ADP IN pcomp True
coronavirus coronavirus NOUN NN pobj False


In [32]:
# Draw the dependency parse of the example headline inline in the notebook
displacy.render(example_doc, style="dep")

In [38]:
# jupyter=False makes displacy return the SVG markup instead of displaying it
fig = displacy.render(example_doc, style="dep", jupyter=False)
# A bare file name such as "dependency_plot.svg" would save next to the notebook instead
output_path = Path("D:\\NLP\\Frame_NLP\\archive\\Dependency_Figure\\dependency_plot.svg")
# write_text opens, writes and closes the file in one call (the previous
# open(...).write(...) left the handle unclosed); it returns the number of
# characters written, which the notebook shows as the cell output
output_path.write_text(fig, encoding="utf-8")

7458

## Helper Functions

In [33]:
def get_group(df, category_col, category):
    """
    Select the rows of a dataframe that belong to one category.

    Arguments:
        df: pandas dataframe of documents
        category_col: str, name of the column holding the category or cluster ids
        category: value in `category_col` identifying the rows to keep

    Returns:
        pandas dataframe containing only the matching rows, re-indexed from 0
    """
    in_category = df[category_col] == category
    return df.loc[in_category].reset_index(drop=True)

In [34]:
def most_common(lst, n_words):
    """
    Rank the words in a list by how often they occur.

    Arguments:
        lst: list, each element is a word
        n_words: int, how many of the most frequent words to return

    Returns:
        list of (word, count) tuples for the `n_words` most frequent words,
        most frequent first
    """
    return collections.Counter(lst).most_common(n_words)

In [35]:
def extract_labels(category_docs, print_word_counts=False):
    """
    Build a label for a cluster of documents by joining its most common
    root word, direct object, and nouns.

    Every document is parsed with the notebook-level spaCy pipeline `nlp`.
    Stop words are skipped. Each remaining token is counted in the first
    bucket it matches: dependency ROOT (lower-cased text), direct object
    (lower-cased lemma), noun (lower-cased lemma), or adjective
    (lower-cased lemma). Adjectives are only reported when
    `print_word_counts` is True; they never enter the label.

    Arguments:
        category_docs: list of str, documents all from the same category or
                       cluster
        print_word_counts: bool, True prints the word counts of each bucket
                           (roots, direct objects, nouns, adjectives)

    Returns:
        label: str, the most common root word, the most common direct object,
               and up to two most common nouns joined by underscores. Parts
               that do not exist are left out, and a noun equal to the root
               word or the direct object is not repeated. Empty string when
               no part was found.
    """

    verbs = []
    dobjs = []
    nouns = []
    adjs = []

    # nlp.pipe parses the documents as a stream rather than one call per document
    for doc in nlp.pipe(category_docs):
        for token in doc:
            if token.is_stop:
                continue

            if token.dep_ == 'ROOT':
                verbs.append(token.text.lower())
            elif token.dep_ == 'dobj':
                dobjs.append(token.lemma_.lower())
            elif token.pos_ == 'NOUN':
                nouns.append(token.lemma_.lower())
            elif token.pos_ == 'ADJ':
                adjs.append(token.lemma_.lower())

    # for printing out for inspection purposes
    if print_word_counts:
        for word_lst in [verbs, dobjs, nouns, adjs]:
            print(collections.Counter(word_lst))

    # take the most common word of each form; '' marks a form that never occurred
    verb = most_common(verbs, 1)[0][0] if verbs else ''
    dobj = most_common(dobjs, 1)[0][0] if dobjs else ''
    top_nouns = [word for word, _ in most_common(nouns, 2)]

    # concatenate verb-dobj-noun1-noun2, skipping nouns already used
    label_words = [verb, dobj]
    for word in top_nouns:
        if word not in label_words:
            label_words.append(word)

    # Drop *every* missing part. list.remove('') only removed the first one,
    # so a cluster lacking both a root word and a direct object used to get a
    # label with a leading underscore.
    label = '_'.join(word for word in label_words if word)

    return label

In [36]:
def apply_and_summarize_labels(df, category_col, text_col='Processed_Title'):
    """
    Derive a label for every cluster and summarize cluster sizes

    Arguments:
        df: pandas dataframe of original documents of interest to
            cluster
        category_col: str, column name corresponding to categories or clusters
        text_col: str, column name holding the document text; defaults to
                  'Processed_Title'

    Returns:
        summary_df: pandas dataframe with one row per cluster: the value from
                    `category_col`, `count` (number of non-null `text_col`
                    entries in the cluster) and `label` (the label derived by
                    `extract_labels`), sorted by `count` in descending order
    """

    # create dictionary of the numerical category to the generated label
    label_dict = {
        category: extract_labels(list(get_group(df, category_col, category)[text_col]))
        for category in df[category_col].unique()
    }

    # create summary dataframe of numerical labels and counts
    summary_df = (df.groupby(category_col)[text_col].count()
                    .reset_index(name='count')
                    .sort_values('count', ascending=False))

    # apply generated labels; a direct lookup per cluster id, which (unlike a
    # row-wise apply) is also well-defined when the summary has no rows
    summary_df['label'] = summary_df[category_col].map(label_dict)

    return summary_df

In [37]:
def combine_ground_truth(df_clusters, df_ground, key):
    """
    Left-join ground truth labels onto documents with extracted labels

    Arguments:
        df_clusters: pandas dataframe, each row as a document with corresponding extracted label
        df_ground: pandas dataframe, each row as a document with corresponding ground truth label
        key: str, column name present in both tables to merge on

    Returns:
        pandas dataframe with every row of `df_clusters` plus the columns of
        `df_ground` for rows whose `key` matches
    """
    return df_clusters.merge(df_ground, how='left', on=key)

In [38]:
def get_top_category(df_label, df_summary):
    """
    Returns a dataframe comparing a single model's results to ground truth
    labels, to evaluate cluster compositions and derived labels relative to
    the name and count of the most common ground truth category

    Rows are matched on the derived `label`, so documents from different
    clusters that ended up with the same derived label are pooled when the
    top ground truth category is determined.

    Arguments:
        df_label: pandas dataframe, each row as a document with extracted and ground truth labels
                  (result of `combine_ground_truth` function); must contain
                  the columns `label` and `category`
        df_summary: pandas dataframe with model cluster assignment, number
                    of documents in each cluster and derived labels
                    (result from `apply_and_summarize_labels` function); must
                    contain the columns `label` and `count`

    Returns:
        df_result: pandas dataframe with each row of `df_summary` plus
                   `top_ground_category` (most represented ground truth
                   category), `top_cat_count` (its number of documents) and
                   `perc_top_cat` (that count as a rounded percentage of
                   `count`). `perc_top_cat` is a plain integer column; it
                   becomes a nullable integer column with <NA> entries only
                   when some label of `df_summary` has no rows in `df_label`.
    """
    # value_counts() is indexed by the category values themselves, so the top
    # count must be taken by position (.iloc[0]); a plain [0] is a label
    # lookup that fails or picks the wrong row for integer categories.
    df_label_ground = (df_label.groupby('label')
                      .agg(top_ground_category=('category', lambda x: x.value_counts().index[0]),
                           top_cat_count=('category', lambda x: x.value_counts().iloc[0]))
                      .reset_index())

    df_result = pd.merge(df_summary, df_label_ground, on='label', how='left')

    perc = (100 * df_result['top_cat_count'] / df_result['count']).round()
    # Labels without a match after the left merge have no top count; keep them
    # as <NA> instead of raising while converting NaN to int.
    df_result['perc_top_cat'] = perc.astype(int if perc.notna().all() else 'Int64')

    return df_result

## Inspecting the Output

In [39]:
# Inspect cluster 12: print the word counts per bucket and show the derived label
example_category = get_group(data_clustered, 'label_st1', 12)['Processed_Title'].tolist()
extract_labels(example_category, print_word_counts=True)

Counter({'reports': 5, 'confirms': 5, 'coronavirus': 4, 'says': 3, 'rises': 3, 'announces': 2, 'dies': 2, 'learn': 2, 'died': 2, 'cases': 2, 'death': 2, 'italy': 1, 'break': 1, 'head': 1, 'tightens': 1, 'started': 1, 'study': 1, 'tested': 1, 'reported': 1, 'join': 1, 'mastrangelo': 1, 'criticizes': 1, 'false': 1, 'photo': 1, 'scrambles': 1, 'advice': 1, 'marks': 1, 'remains': 1, 'evicted': 1, 'updates': 1, 'adopts': 1, 'wiped': 1, 'launches': 1, 'registers': 1, 'claims': 1, 'kept': 1, 'surge': 1, 'records': 1, 'end': 1, 'donates': 1})
Counter({'death': 10, 'case': 3, 'coronavirus': 2, 'record': 1, 'potency': 1, 'rule': 1, 'rival': 1, 'limit': 1, 'response': 1, 'outbreak': 1, 'measure': 1, 'economy': 1, 'illness': 1, 'fundraiser': 1, 'group': 1, 'fatality': 1, 'tommasi': 1, 'ps90,000': 1, 'pandemic': 1})
Counter({'coronavirus': 37, 'doctor': 5, 'crisis': 3, 'minister': 3, 'case': 3, 'death': 3, 'toll': 3, 'chef': 2, 'patient': 2, 'people': 1, 'hour': 1, 'day': 1, 'song': 1, 'lockdown': 

'reports_death_coronavirus_doctor'

In [40]:
# Rows assigned to cluster 12, shown with their original row numbers
data_clustered.loc[data_clustered['label_st1'] == 12]

Unnamed: 0,Processed_Title,label_st1
200,Italy: 49 People Die of Coronavirus in 24 Hours,12
262,Italy reports record 250 coronavirus deaths in one day,12
401,Italians break out in song amid coronavirus lockdown,12
492,Denmark confirms first coronavirus death,12
685,All of Italy's Regions Infected with Coronavirus,12
734,Cuban doctors head to Italy to fight coronavirus,12
842,"New coronavirus losing potency, top Italian doctor says",12
1002,Denmark Announces Its First Coronavirus Death,12
1017,Denmark Announces First Coronavirus Death,12
1032,Italy tightens workplace rules in coronavirus crisis,12


In [41]:
# Derive a label for every cluster; the summary is sorted by size, so this shows the 20 largest
cluster_summary = apply_and_summarize_labels(data_clustered, 'label_st1')
cluster_summary.head(20)

Unnamed: 0,label_st1,count,label
0,-1,1846,coronavirus_coronavirus_pandemic
85,84,404,coronavirus_coronavirus_case
42,41,376,coronavirus_case_outbreak
46,45,298,tests_test_coronavirus
86,85,297,reports_case_coronavirus_death
98,97,262,coronavirus_coronavirus_outbreak
80,79,214,coronavirus_case_death
65,64,204,coronavirus_coronavirus_player
70,69,182,says_coronavirus_trump
9,8,142,coronavirus_case


## Most Common Scenario

In [44]:
# Attach each cluster's derived label to the rows of that cluster
cluster_labels = cluster_summary[['label_st1', 'label']]
labeled_clusters = data_clustered.merge(cluster_labels, how='left', on='label_st1')
labeled_clusters.head()


Unnamed: 0,Processed_Title,label_st1,label
0,"US coronavirus death toll rises to 14, most cases in Washington",85,reports_case_coronavirus_death
1,"Iran rejects U.S. offer for coronavirus aid, cites conspiracy theory",17,recover_coronavirus_death
2,TSA is making 3 changes to stop coronavirus from spreading on planes,-1,coronavirus_coronavirus_pandemic
3,NBA Players to Take Pay Cut Amidst Coronavirus Crisis,64,coronavirus_coronavirus_player
4,"Coronavirus Cases confirmed in North Korea, say officials",21,vote_kit_coronavirus_case


### With Ground Labels

In [45]:
# The ground labels are read from the same CSV as the cluster assignments
ground_csv = 'D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\sample_clustered99.csv'
data_ground = pd.read_csv(ground_csv).loc[:, ['Processed_Title', 'category']]
data_ground.head()

Unnamed: 0,Processed_Title,category
0,"US coronavirus death toll rises to 14, most cases in Washington",rises_37_coronavirus_death
1,"Iran rejects U.S. offer for coronavirus aid, cites conspiracy theory",refuses_offer_coronavirus_conspiracy
2,TSA is making 3 changes to stop coronavirus from spreading on planes,expands_change_airport_coronavirus
3,NBA Players to Take Pay Cut Amidst Coronavirus Crisis,preparing_cut_coronavirus_game
4,"Coronavirus Cases confirmed in North Korea, say officials",coronavirus_exercise_case_week


In [46]:
# Add the ground label to every labeled row, matching on the headline text.
# NOTE(review): this overwrites `labeled_clusters` with its own merge result, so
# re-running the cell by itself merges `category` a second time; re-run the cell
# that first builds `labeled_clusters` before re-running this one.
labeled_clusters = combine_ground_truth(labeled_clusters, data_ground, 'Processed_Title')
labeled_clusters.sample(10)

Unnamed: 0,Processed_Title,label_st1,label,category
169,"Coronavirus Plays Havoc with Dating, Matchmaking",23,coronavirus_sex_wedding,coronavirus_coronavirus_case
2748,Blizzard Cancels Overwatch Events In China After Coronavirus Outbreak,64,coronavirus_coronavirus_player,cancels_race_coronavirus_outbreak
6659,Coronavirus to have little impact on economy: Trump adviser,69,says_coronavirus_trump,warns_impact_economy_trump
840,Coronavirus didn't originate from China: experts,41,coronavirus_case_outbreak,originate_coronavirus_expert
4038,America Is Not Ready For The Coronavirus,-1,coronavirus_coronavirus_pandemic,worried_shot_coronavirus_flu
1308,Melbourne doctor becomes latest Aussie to test positive to coronavirus,60,confirmed_case_coronavirus,continued_patient_doctor_coronavirus
1022,Protect Your Portfolio From The Coronavirus Crisis: Buy Gilead,-1,coronavirus_coronavirus_pandemic,invest_portfolio_coronavirus_investing
8240,Ultra Music Festival Suspended Over Coronavirus Outbreak,65,canceled_fear_coronavirus,coronavirus_coronavirus_case
5727,Instacart announces coronavirus protections amid threats of a strike,52,coronavirus_weekend_panic,responds_fear_coronavirus_concern
2125,"Coronavirus news and live updates: Coronavirus cases top 300,000",85,reports_case_coronavirus_death,"surpass_100,000_coronavirus_case"


In [47]:
# Rows of cluster 45 with their derived label next to the ground label
labeled_clusters.loc[labeled_clusters['label_st1'] == 45]

Unnamed: 0,Processed_Title,label_st1,label,category
93,LabCorp makes coronavirus test available for ordering in U.S.,45,tests_test_coronavirus,makes_direct_coronavirus_test
125,Coronavirus: More than 600 police officers and staff in isolation,45,tests_test_coronavirus,coronavirus_engineer_police
220,CRPF DG tests negative for coronavirus,45,tests_test_coronavirus,tests_test_coronavirus
221,First coronavirus antibody test given approval by Public Health England,45,tests_test_coronavirus,test_immunity_coronavirus_antibody
238,Person in WA being tested for coronavirus,45,tests_test_coronavirus,tested_coronavirus_person
321,2 Miami University students tested for coronavirus,45,tests_test_coronavirus,tested_coronavirus_student
334,Thousands of coronavirus test results 'disappear',45,tests_test_coronavirus,coronavirus_coronavirus_case
462,"Starbucks employee diagnosed with the coronavirus in Seattle, company says",45,tests_test_coronavirus,tests_positive_employee_coronavirus
463,Drive through coronavirus tests: coming to a store near you,45,tests_test_coronavirus,drive_testing_coronavirus_test
466,Stanford reports undergraduate student has coronavirus,45,tests_test_coronavirus,reports_case_coronavirus_death


### Count and name of most common category of generated labels and clusters

In [48]:
# For every cluster: its size, derived label, and the most common ground label with its count and share
get_top_category(labeled_clusters, cluster_summary)

Unnamed: 0,label_st1,count,label,top_ground_category,top_cat_count,perc_top_cat
0,-1,1846,coronavirus_coronavirus_pandemic,coronavirus_coronavirus_case,205,11
1,84,404,coronavirus_coronavirus_case,coronavirus_coronavirus_case,92,23
2,41,376,coronavirus_case_outbreak,coronavirus_coronavirus_case,90,24
3,45,298,tests_test_coronavirus,coronavirus_coronavirus_case,42,14
4,85,297,reports_case_coronavirus_death,coronavirus_coronavirus_case,41,14
5,97,262,coronavirus_coronavirus_outbreak,coronavirus_coronavirus_case,50,19
6,79,214,coronavirus_case_death,coronavirus_coronavirus_case,34,16
7,64,204,coronavirus_coronavirus_player,coronavirus_coronavirus_case,16,8
8,69,182,says_coronavirus_trump,coronavirus_coronavirus_case,39,21
9,8,142,coronavirus_case,coronavirus_coronavirus_case,44,31


## Save Result

In [49]:
# Persist the labeled rows without writing the dataframe index
labeled_path = 'D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\labeled_clusters99.csv'
labeled_clusters.to_csv(labeled_path, index=False)

In [28]:
# Rows of cluster 3 with their derived label next to the ground label
labeled_clusters.loc[labeled_clusters['label_st1'] == 3]

Unnamed: 0,Processed_Title,label_st1,label,category
498,Former NATO chief Javier Solana has coronavirus source,3,confirms_case_coronavirus_death,confirms_case_coronavirus
536,Director of Mexican Border State Hospital Dies from Coronavirus,3,confirms_case_coronavirus_death,denies_case_coronavirus_death
630,Peru president confirms first coronavirus case,3,confirms_case_coronavirus_death,confirms_case_president_coronavirus
973,Bolivia Registers First Death From Coronavirus Related Complications Health Ministry,3,confirms_case_coronavirus_death,death_death_coronavirus
1223,Hungary suspends issuing visas to Iranians over coronavirus fears,3,confirms_case_coronavirus_death,coronavirus_coronavirus_case
1373,Coronavirus deaths climb as New Mexico sticks with lockdown,3,confirms_case_coronavirus_death,"climb_10,000_death_coronavirus"
1599,"Coronavirus Guatemala bans arrivals from U.S., Canada to fight coronavirus",3,confirms_case_coronavirus_death,stalls_hundred_coronavirus_dispute
1617,Negros Occidental town records 1st coronavirus case,3,confirms_case_coronavirus_death,confirms_case_coronavirus_month
1731,"Mexico frets about coronavirus spread, could restrict border",3,confirms_case_coronavirus_death,ready_stimulus_coronavirus_temper
1801,Mexico to ready fiscal stimulus to temper coronavirus hit official,3,confirms_case_coronavirus_death,ready_stimulus_coronavirus_temper


In [50]:
# Non-null entries per column for every derived label
count = labeled_clusters.groupby('label').count()
print(count)

                                       Processed_Title  label_st1  category
label                                                                      
admitted_symptom_coronavirus_hospital               51         51        51
americans_state_coronavirus_emergency               19         19        19
block_package_coronavirus_stimulus                  44         44        44
brace_plan_coronavirus_home                         31         31        31
buddy_coronavirus_dog                               35         35        35
canceled_fear_coronavirus                          118        118       118
cases_case_coronavirus_wrestler                     66         66        66
closes_app_coronavirus_production                   54         54        54
confirmed_case_coronavirus                         131        131       131
confirms_case_coronavirus                           78         78        78
confirms_case_coronavirus_death                     95         95        95
confirms_per

In [52]:
# Number of rows per derived label, most frequent label first
label_count = (
    labeled_clusters.groupby('label')['label_st1']
    .count()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)
label_count

Unnamed: 0,label,Count
26,coronavirus_coronavirus_pandemic,1846
19,coronavirus_coronavirus_case,469
16,coronavirus_case,424
18,coronavirus_case_outbreak,376
61,reports_case_coronavirus_death,336
81,tests_test_coronavirus,298
25,coronavirus_coronavirus_outbreak,262
60,reports_case_coronavirus,226
17,coronavirus_case_death,214
27,coronavirus_coronavirus_player,204


In [53]:
# index=False: the index here is only the row position left over from the
# groupby, and writing it would come back as an extra "Unnamed: 0" column
# when the file is read again
label_count.to_csv('D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\label_freq99.csv', index=False)

In [19]:
# Load a saved label-frequency table and check its size.
# NOTE(review): this reads label_freq.csv, while the save cell above writes
# label_freq99.csv — confirm which file is intended.
labels = pd.read_csv('D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\label_freq.csv')
labels.shape

(1229, 3)

In [107]:
# First 50 rows of the loaded label-frequency table
labels.head(50)

Unnamed: 0.1,Unnamed: 0,label,Count
0,256,coronavirus_coronavirus_case,1048
1,828,reports_case_coronavirus_death,60
2,192,confirms_case_coronavirus,46
3,827,reports_case_coronavirus,36
4,325,coronavirus_market,34
5,776,race_vaccine_coronavirus,33
6,346,coronavirus_pinch_holiday,29
7,321,coronavirus_lockdown,28
8,387,coronavirus_transmission_glance,26
9,103,canceled_fear_coronavirus_concern,25


In [4]:
# Rows whose derived label is 'confirms_case_coronavirus'.
# Needs `labeled_clusters` in the kernel: run the cells that build it first,
# otherwise this raises NameError.
labeled_clusters[labeled_clusters['label']=='confirms_case_coronavirus']

NameError: name 'labeled_clusters' is not defined