# Assigning Labels to Sentence Clusters

## Imports

In [1]:
import collections
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacy import displacy

# Raise pandas' display limits so long headlines and large tables are not truncated.
pd.options.display.max_rows = 600
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 400

In [2]:
# Request GPU execution before loading the pipeline, then load the
# "en_core_web_md" pipeline as the global `nlp` used by the cells below.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_md")

In [34]:
# Read the clustered headlines and keep only the title text and its cluster id.
data_clustered = pd.read_csv(
    'D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\sample_clustered48(2).csv.csv'
)[['Processed_Title', 'label_st1']]
# Preview ten random rows.
data_clustered.sample(10)

Unnamed: 0,Processed_Title,label_st1
4673,Central govt suspends biometric attendance due to coronavirus threat,-1
3907,PornHub Sees Spike In Viewership Following Coronavirus Outbreak,34
4597,3rd NYPD Member Dies of Coronavirus After Hundreds of Officers Test Positive,10
2530,"China Coronavirus update: Flights canceled, death toll, and how it compares to the flu",17
1173,Murad Shah's brother in law dies due to Coronavirus,20
6544,Trump Vows to Ban Coronavirus Vaccine If Obama Invented It,41
184,UAE doubles stimulus to counter coronavirus impact,4
251,Organizer of NJ coronavirus stay at home protest hit with criminal violation,-1
991,Japan to enforce special measures for coronavirus from Saturday: government,11
4701,New Coronavirus Patients Show No Symptoms: Hospital,38


In [35]:
# Take cluster 31 as a worked example and renumber its rows from 0.
example_category = (
    data_clustered
    .loc[data_clustered['label_st1'].eq(31)]
    .reset_index(drop=True)
)
example_category

Unnamed: 0,Processed_Title,label_st1
0,Coronavirus WAR: Canada urges Trump to keep troops away from border,31
1,Alberta reports 18 new coronavirus cases and no new deaths News,31
2,Toronto opens hotline for passengers on flight with Canada's first coronavirus patient CBC.ca,31
3,BC reports first coronavirus in Vancouver region Lake Country Calendar,31
4,27 possible cases of coronavirus currently under investigation in Ontario News,31
5,Stanford reports undergraduate student has coronavirus,31
6,Coronavirus: 2 food banks in Okanagan changing way they fill out hampers,31
7,Coronavirus: Commuter 'fever check' and Princess Charlotte turns five,31
8,Alberta's first presumptive coronavirus case in Calgary,31
9,Coronavirus: What's happening around the world on Monday,31


In [5]:
# Parse the title at position 1 of the example cluster with the spaCy pipeline.
# NOTE(review): the saved output of this cell appears to come from an earlier
# run against a different `example_category` - re-run after the cell above to confirm.
example_doc = nlp(list(example_category['Processed_Title'])[1])

print(f'{example_doc}\n')

# One line per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, and stop-word flag.
for token in example_doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_ , token.is_stop)

Woman knocked unconscious defending Chinese friend over coronavirus harassment

Woman woman NOUN NN nsubj False
knocked knock VERB VBD ROOT False
unconscious unconscious ADJ JJ amod False
defending defend VERB VBG amod False
Chinese chinese ADJ JJ amod False
friend friend NOUN NN dobj False
over over ADP IN prep True
coronavirus coronavirus NOUN NN compound False
harassment harassment NOUN NN pobj False


In [6]:
# Render the dependency parse of the example title with displaCy.
displacy.render(example_doc, style="dep")

In [6]:
# With jupyter=False the rendered markup is kept in `fig` (it is written to disk
# as text below) rather than only being displayed.
fig = displacy.render(example_doc, style="dep", jupyter=False)
output_path = Path("D:\\NLP\\Frame_NLP\\archive\\Dependency_Figure\\dependency_plot.svg") # you can keep there only "dependency_plot.svg" if you want to save it in the same folder where you run the script 
# write_text opens, writes and closes the file in one call (the previous
# open(...).write(...) left the handle unclosed); it returns the number of
# characters written, so the cell's displayed value is unchanged.
output_path.write_text(fig, encoding="utf-8")

7222

## Helper Functions

In [36]:
def get_group(df, category_col, category):
    """
    Select the documents that belong to one category.

    Arguments:
        df: pandas dataframe of documents
        category_col: str, name of the column holding the category or cluster ids
        category: value of `category_col` to keep (e.g. an int cluster number)

    Returns:
        pandas dataframe with only the rows whose `category_col` equals
        `category`, re-indexed from 0
    """
    in_category = df[category_col] == category
    return df[in_category].reset_index(drop=True)

In [37]:
def most_common(lst, n_words):
    """
    Find the most frequent words in a list of words.

    Arguments:
        lst: list, each element is a word
        n_words: int, how many of the top words to return

    Returns:
        list of (word, count) tuples for the `n_words` most frequent words,
        ordered from most to least frequent
    """
    frequencies = collections.Counter()
    frequencies.update(lst)
    return frequencies.most_common(n_words)

In [38]:
def extract_labels(category_docs, print_word_counts=False):
    """
    Extract a label from documents in the same cluster by concatenating
    the most common root word, direct object, and nouns.

    Each document is parsed with the module-level spaCy pipeline `nlp`.
    Stop words are skipped; every other token is counted in at most one
    bucket, checked in this order:
        1. dependency ROOT     -> "verb" bucket (lower-cased token text;
                                  the token is not required to be a verb)
        2. dependency dobj     -> object bucket (lower-cased lemma)
        3. part of speech NOUN -> noun bucket (lower-cased lemma)
        4. part of speech ADJ  -> adjective bucket (lower-cased lemma,
                                  only used for the optional printout)

    Arguments:
        category_docs: iterable of documents (str), all from the same
                       category or cluster
        print_word_counts: bool, True will print the word counts of each
                           bucket (verbs, objects, nouns, adjectives)

    Returns:
        label: str, the most common verb, object, and up to two most common
               nouns joined by '_'. Missing components are left out, and a
               noun is skipped when it repeats a word already in the label.
               The verb and object are both kept even when they are the
               same word. Empty string when nothing could be extracted.
    """

    verbs = []
    dobjs = []
    nouns = []
    adjs = []

    # Iterate over the documents directly (no positional indexing), so any
    # iterable of strings works, not only 0-indexed sequences.
    for text in category_docs:
        for token in nlp(text):
            if token.is_stop:
                continue

            if token.dep_ == 'ROOT':
                verbs.append(token.text.lower())
            elif token.dep_ == 'dobj':
                dobjs.append(token.lemma_.lower())
            elif token.pos_ == 'NOUN':
                nouns.append(token.lemma_.lower())
            elif token.pos_ == 'ADJ':
                adjs.append(token.lemma_.lower())

    verb_counts, dobj_counts, noun_counts, adj_counts = (
        collections.Counter(words) for words in (verbs, dobjs, nouns, adjs)
    )

    # for printing out for inspection purposes
    if print_word_counts:
        for counter in (verb_counts, dobj_counts, noun_counts, adj_counts):
            print(counter)

    # take the most common word of each form ('' when the bucket is empty)
    verb = verb_counts.most_common(1)[0][0] if verb_counts else ''
    dobj = dobj_counts.most_common(1)[0][0] if dobj_counts else ''
    top_nouns = [word for word, _ in noun_counts.most_common(2)]

    # concatenate the most common verb-dobj-noun1-noun2 (if they exist)
    label_words = [verb, dobj]
    for noun in top_nouns:
        if noun not in label_words:
            label_words.append(noun)

    # Drop *every* missing component. Removing only the first '' left a
    # leading underscore when both the verb and the object were missing.
    return '_'.join(word for word in label_words if word)

In [39]:
def apply_and_summarize_labels(df, category_col, text_col='Processed_Title'):
    """
    Generate a text label for every cluster and summarize cluster sizes

    Arguments:
        df: pandas dataframe of original documents of interest, one row
            per document
        category_col: str, column name corresponding to categories or clusters
        text_col: str, column name holding the document text
                  (defaults to 'Processed_Title')

    Returns:
        summary_df: pandas dataframe with one row per cluster containing the
                    cluster id (`category_col`), the number of documents with
                    non-missing text in it ('count'), and the label derived
                    by `extract_labels` ('label'), sorted by 'count' in
                    descending order
    """

    # Group once and reuse the grouping for both labels and counts, instead
    # of re-filtering the whole dataframe for every cluster.
    texts_by_category = df.groupby(category_col)[text_col]

    # create dictionary of the numerical category to the generated label;
    # missing texts are dropped because they cannot be parsed (and are not
    # included in 'count' either)
    label_dict = {
        category: extract_labels(texts.dropna().tolist())
        for category, texts in texts_by_category
    }

    # create summary dataframe of numerical labels and counts
    summary_df = (texts_by_category.count()
                    .reset_index()
                    .rename(columns={text_col: 'count'})
                    .sort_values('count', ascending=False))

    # apply generated labels (a column-wise map also works on an empty summary)
    summary_df['label'] = summary_df[category_col].map(label_dict)

    return summary_df

In [40]:
def combine_ground_truth(df_clusters, key, df_ground=None):
    """
    Combines dataframes of documents with extracted and ground truth labels

    Arguments:
        df_clusters: pandas dataframe, each row as a document with corresponding extracted label
        key: str, key to merge tables on
        df_ground: pandas dataframe, each row as a document with corresponding ground truth label.
                   Required; it is the last parameter (with a None default) only so that
                   the original positional order (df_clusters, key) keeps working.

    Returns:
        df_combined: pandas dataframe, each row as a document with extracted and ground truth labels
                     (left join: every row of df_clusters is kept)

    Raises:
        TypeError: if df_ground is not supplied
    """
    # The merge needs a right-hand frame; previously none was passed, so the
    # call always failed inside pandas with a missing-argument TypeError.
    if df_ground is None:
        raise TypeError("combine_ground_truth() requires the `df_ground` dataframe to merge with")

    df_combined = pd.merge(df_clusters, df_ground, on=key, how='left')
    return df_combined

In [41]:
def get_top_category(df_label, df_summary):
    """
    Returns a dataframe comparing a single model's results to ground truth
    labels, to evaluate cluster composition and the derived label relative to
    the most common ground truth category of each cluster

    Arguments:
        df_label: pandas dataframe, each row as a document with extracted and ground truth labels
                  (result of `combine_ground_truth` function). Currently not used by this function.
        df_summary: pandas dataframe with one row per cluster; must contain the
                    columns 'count' and 'top_cat_count'.
                    NOTE(review): `apply_and_summarize_labels` provides 'count' but not
                    'top_cat_count' - confirm where that column is meant to be added.

    Returns:
        df_result: copy of `df_summary` with an added integer column
                   'perc_top_cat' = round(100 * top_cat_count / count)

    Raises:
        KeyError: if `df_summary` lacks 'count' or 'top_cat_count'
    """
    required = ['count', 'top_cat_count']
    missing = [col for col in required if col not in df_summary.columns]
    if missing:
        raise KeyError(f"df_summary is missing required column(s): {missing}")

    # Work on a copy of the argument. The previous version ignored `df_summary`,
    # read the notebook-level `cluster_summary` instead, and modified it in place.
    df_result = df_summary.copy()
    df_result['perc_top_cat'] = (
        (100 * df_result['top_cat_count'] / df_result['count']).round().astype(int)
    )

    return df_result

## Inspecting the Output

In [42]:
# Generate the label for cluster 31 and print the word counts behind it.
# NOTE: this rebinds `example_category` from the DataFrame assigned in an
# earlier cell to a plain list of titles.
example_category = list(get_group(data_clustered, 'label_st1', 31)['Processed_Title'])
extract_labels(example_category, True)

Counter({'coronavirus': 24, 'reports': 12, 'case': 7, 'confirmed': 7, 'cases': 5, 'happening': 4, 'provide': 4, 'says': 4, 'timeline': 3, 'urges': 2, 'reach': 2, 'announces': 2, 'confirms': 2, 'records': 2, 'confirm': 2, 'report': 2, 'testing': 2, 'draws': 2, 'opens': 1, 'turns': 1, 'continues': 1, 'news': 1, 'remains': 1, 'covid': 1, 'updated': 1, 'feel': 1, 'outbreak': 1, 'awaits': 1, 'prepares': 1, 'allows': 1, 'symptoms': 1, 'canada': 1, 'rise': 1, 'recovered': 1, 'breaking': 1, 'patient': 1, 'q&a': 1, 'determined': 1, 'things': 1, 'surpasses': 1, 'minister': 1, 'mum': 1, 'invest': 1, 'scotia': 1, 'makes': 1, 'responds': 1, 'suspends': 1, 'open': 1, 'travel': 1, 'government': 1, 'park': 1, 'update': 1, 'poses': 1, 'nhs': 1, 'carries': 1, 'protests': 1, 'needed': 1, 'continue': 1, 'works': 1, 'ministry': 1, 'queen': 1, 'sees': 1, 'hits': 1, 'student': 1, 'detected': 1, 'shifted': 1, 'backtracks': 1, 'ruled': 1, 'deepens': 1, 'look': 1, 'resumes': 1, 'spike': 1})
Counter({'coronaviru

'coronavirus_coronavirus_case'

In [43]:
# Show the rows of cluster 31, keeping their original index in `data_clustered`.
data_clustered[data_clustered['label_st1']==31]

Unnamed: 0,Processed_Title,label_st1
57,Coronavirus WAR: Canada urges Trump to keep troops away from border,31
85,Alberta reports 18 new coronavirus cases and no new deaths News,31
160,Toronto opens hotline for passengers on flight with Canada's first coronavirus patient CBC.ca,31
309,BC reports first coronavirus in Vancouver region Lake Country Calendar,31
335,27 possible cases of coronavirus currently under investigation in Ontario News,31
466,Stanford reports undergraduate student has coronavirus,31
472,Coronavirus: 2 food banks in Okanagan changing way they fill out hampers,31
571,Coronavirus: Commuter 'fever check' and Princess Charlotte turns five,31
637,Alberta's first presumptive coronavirus case in Calgary,31
683,Coronavirus: What's happening around the world on Monday,31


In [44]:
# Build the per-cluster summary (document count and generated label),
# sorted by count in descending order, and show the first 20 rows.
cluster_summary = apply_and_summarize_labels(data_clustered, 'label_st1')
cluster_summary.head(20)

Unnamed: 0,label_st1,count,label
35,34,1842,coronavirus_coronavirus_case
0,-1,1510,coronavirus_coronavirus_case
41,40,427,coronavirus_case
18,17,418,coronavirus_case_outbreak
28,27,394,coronavirus_impact_stock
7,6,217,coronavirus_coronavirus_player
39,38,211,tests_positive_coronavirus_test
38,37,175,coronavirus_case_death
29,28,154,canceled_fear_coronavirus
2,1,140,reports_case_coronavirus


## Most Common Scenario

In [45]:
# Attach each cluster's generated label to every document in that cluster
# (left join on the cluster id, so every document row is kept).
labeled_clusters = data_clustered.merge(
    cluster_summary[['label_st1', 'label']],
    how='left',
    on='label_st1',
)
labeled_clusters.head()

Unnamed: 0,Processed_Title,label_st1,label
0,"US coronavirus death toll rises to 14, most cases in Washington",34,coronavirus_coronavirus_case
1,"Iran rejects U.S. offer for coronavirus aid, cites conspiracy theory",-1,coronavirus_coronavirus_case
2,TSA is making 3 changes to stop coronavirus from spreading on planes,43,coronavirus_coronavirus_travel
3,NBA Players to Take Pay Cut Amidst Coronavirus Crisis,6,coronavirus_coronavirus_player
4,"Coronavirus Cases confirmed in North Korea, say officials",11,reports_case_coronavirus


In [46]:
# Save the labelled documents without the DataFrame index. `index` is a
# boolean option; the previous `index=None` only worked because None is falsy.
labeled_clusters.to_csv('D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\labeled_clusters48.csv', index=False)

In [47]:
# Show the documents of cluster 3 together with their generated label.
labeled_clusters[labeled_clusters['label_st1']==3]

Unnamed: 0,Processed_Title,label_st1,label
498,Former NATO chief Javier Solana has coronavirus source,3,confirms_case_coronavirus_death
536,Director of Mexican Border State Hospital Dies from Coronavirus,3,confirms_case_coronavirus_death
630,Peru president confirms first coronavirus case,3,confirms_case_coronavirus_death
973,Bolivia Registers First Death From Coronavirus Related Complications Health Ministry,3,confirms_case_coronavirus_death
1223,Hungary suspends issuing visas to Iranians over coronavirus fears,3,confirms_case_coronavirus_death
1373,Coronavirus deaths climb as New Mexico sticks with lockdown,3,confirms_case_coronavirus_death
1599,"Coronavirus Guatemala bans arrivals from U.S., Canada to fight coronavirus",3,confirms_case_coronavirus_death
1617,Negros Occidental town records 1st coronavirus case,3,confirms_case_coronavirus_death
1731,"Mexico frets about coronavirus spread, could restrict border",3,confirms_case_coronavirus_death
1801,Mexico to ready fiscal stimulus to temper coronavirus hit official,3,confirms_case_coronavirus_death


In [48]:
# Count the non-null values of every other column per generated label.
# Several clusters can share one label, so a row here may cover more than one cluster.
count = labeled_clusters.groupby(['label']).count()
print(count)

                                        Processed_Title  label_st1
label                                                             
admitted_symptom_coronavirus_hospital                52         52
canceled_fear_coronavirus                           154        154
confirmed_restriction_coronavirus_case              132        132
confirms_case_coronavirus                            44         44
confirms_case_coronavirus_death                      55         55
coronavirus_case                                    568        568
coronavirus_case_death                              175        175
coronavirus_case_outbreak                           418        418
coronavirus_coronavirus_case                       3649       3649
coronavirus_coronavirus_death                       181        181
coronavirus_coronavirus_drug                         53         53
coronavirus_coronavirus_patient                      44         44
coronavirus_coronavirus_player                      217       

In [49]:
# Number of documents per generated label, most frequent label first.
label_count = (
    labeled_clusters
    .groupby(['label'])['label_st1']
    .count()
    .reset_index(name='Count')
    .sort_values(['Count'], ascending=False)
)

In [53]:
# Save the label frequencies. No `index` argument is given, so the DataFrame
# index is written as an extra first column (it shows up as "Unnamed: 0" when
# the file is read back).
# NOTE(review): consider index=False if downstream readers do not need that column.
label_count.to_csv('D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\label_freq.csv')

In [3]:
# Load a labelled-clusters file from disk and check its size.
# NOTE(review): this file name (labeled_clusters1229.csv) differs from the one
# written earlier in this notebook (labeled_clusters48.csv) - confirm which run produced it.
labels = pd.read_csv('D:\\NLP\\Frame_NLP\\archive\\Sent_Cluster\\labeled_clusters1229.csv')
labels.shape

(8376, 3)

In [52]:
# Preview the first 50 rows of the loaded file.
labels.head(50)

Unnamed: 0.1,Unnamed: 0,Processed_Title,label_st1
0,0,"US coronavirus death toll rises to 14, most cases in Washington",59
1,1,"Iran rejects U.S. offer for coronavirus aid, cites conspiracy theory",19
2,2,TSA is making 3 changes to stop coronavirus from spreading on planes,34
3,3,NBA Players to Take Pay Cut Amidst Coronavirus Crisis,-1
4,4,"Coronavirus Cases confirmed in North Korea, say officials",18
5,5,Mayor Greg Fischer more coronavirus cases than thought,-1
6,6,Number Of Confirmed Coronavirus Cases In Southern Germany Rises To 4 Health Ministry,58
7,7,Coronavirus live updates: US President Donald Trump declares coronavirus pandemic a national emergency,1
8,8,South Africa Starts Easing Coronavirus Lockdown,53
9,9,Coronavirus: Ekiti state government declares 14 day curfew,7


In [58]:
# Show the documents whose generated label is 'coronavirus_market'.
# NOTE(review): the `labels` frame previewed by `labels.head(50)` shows no
# `label` column - confirm which file/kernel state this cell was run against.
labels[labels['label']=='coronavirus_market']

Unnamed: 0,Processed_Title,label_st1,label
19,Coronavirus Fears Spread: What Are The Market Implications ?,572,coronavirus_market
47,When The Coronavirus Outbreak Creates A Panic Buying Boom For Your Product,572,coronavirus_market
257,European yields extend falls as investors assess coronavirus impact,572,coronavirus_market
692,Commercial brokers say coronavirus not having a big impact yet,572,coronavirus_market
737,Covid 19 Coronavirus: NZ stockmarket dips back on the rollercoaster,572,coronavirus_market
1348,Tesla Faces Coronavirus Cash Crunch,572,coronavirus_market
1496,Warren Buffett Sells Airline Stocks Amid Coronavirus: 'I Made A Mistake',572,coronavirus_market
1498,Two Scenarios For How Coronavirus Will Impact The Market,572,coronavirus_market
1798,Agricultural Commodities May Be Misreading The Coronavirus,572,coronavirus_market
1855,Global Stocks Jump On Easing Coronavirus Crisis,572,coronavirus_market


In [9]:
# Remove the row limit so complete tables are shown (overrides the limit of 600 set at the top).
pd.options.display.max_rows = None

In [10]:
# Read the saved label frequencies back from disk and print the full table.
label_freq = pd.read_csv('D://NLP/Frame_NLP/archive/Sent_Cluster/label_freq.csv')
print(label_freq)

      Unnamed: 0                                             label  Count
0            256                      coronavirus_coronavirus_case   1048
1            828                    reports_case_coronavirus_death     60
2            192                         confirms_case_coronavirus     46
3            827                          reports_case_coronavirus     36
4            325                                coronavirus_market     34
5            776                          race_vaccine_coronavirus     33
6            346                         coronavirus_pinch_holiday     29
7            321                              coronavirus_lockdown     28
8            387                   coronavirus_transmission_glance     26
9            103                 canceled_fear_coronavirus_concern     25
10           167                         closes_school_coronavirus     25
11          1182                    urges_people_coronavirus_scare     23
12           196                confir

In [17]:
# (rows, columns) of the label-frequency table; there is one row per label.
label_freq.shape

(1229, 3)