# Text Classification Using Heuristic Approach with Keyword Matching

This notebook applies a keyword-matching heuristic to classify publication records as RCT (Randomized Controlled Trial) or Other.

Author: Jenna Kim  
Created: 2022/2/20  
Last Modified: 2022/2/20  



# 1. Set up

In [None]:
import timeit
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Silence all warning messages so they do not clutter the notebook output
import warnings
warnings.filterwarnings(action='ignore')

# Display every dataframe column when printing (no column truncation)
pd.options.display.max_columns = None

# 2. Functions

In [None]:
def load_data(filename, colname, record):

    """
    Read in input file and build the dataframe used for classification

    filename: csv file containing 'pmid', 'title', 'abstract' and 'pubtype' columns
    colname: which text to use as input data:
             "title" (title), "abs" (abstract) or "mix" (title + " " + abstract)
    record: writable text file object; every summary line printed to the
            screen is also written to it

    return: dataframe with columns 'pmid', 'sentence' (selected text) and
            'label' (1 for pubtype 'RCT', 0 for pubtype 'Other')

    raises: ValueError if colname is not "title", "abs" or "mix"

    """

    # Fail early with a clear message instead of crashing later on a
    # missing 'label' column
    if colname not in ("title", "abs", "mix"):
        raise ValueError(
            "colname must be 'title', 'abs' or 'mix', got {!r}".format(colname))

    def report(*args):
        # Write the same line to the record file and to the screen
        print(*args, file=record)
        print(*args)

    df = pd.read_csv(filename, encoding='utf-8')

    # No of rows and columns
    report("No of Rows (Raw data): {}".format(df.shape[0]))
    report("No of Columns: {}".format(df.shape[1]))

    # Select data needed for processing & convert labels.
    # Work on an explicit copy so the assignment below never writes
    # into a view of the raw dataframe.
    df = df[['pmid', 'title', 'abstract', 'pubtype']].copy()
    df['pubtype'] = df['pubtype'].map({'RCT': 1, 'Other': 0})

    # Remove null values (this also drops rows whose pubtype was neither
    # 'RCT' nor 'Other', since map() turned those into NaN), then store
    # the labels as integers
    df = df.dropna().astype({'pubtype': int})

    report("No of rows (After removing null): {}".format(df.shape[0]))
    report("No of columns: {}".format(df.shape[1]))

    # Select text columns
    if colname == "title":
        text = df['title']
    elif colname == "abs":
        text = df['abstract']
    else:
        # "mix": title and abstract joined by a single space
        text = pd.Series(
            ['{} {}'.format(title, abstract)
             for title, abstract in zip(df['title'], df['abstract'])],
            index=df.index, dtype=object)

    df = pd.DataFrame({'pmid': df['pmid'], 'sentence': text, 'label': df['pubtype']})

    # Check the first few instances
    report("\n<Data View: First Few Instances>")
    report("\n", df.head())

    # No of labels and rows
    report('\nClass Counts(label, row): Total')
    report(df.label.value_counts())

    return df

In [None]:
def find_exact_match(string, keywords):
    """
    Search a text for whole-term, case-insensitive occurrences of keywords

    string: text string
    keywords: a list of terms used as keyword; each term is matched
              literally (regex metacharacters have no special meaning)

    return: a list of matched terms as they appear in the text, grouped
            in keyword order; a term occurring several times is listed
            once per occurrence

    """

    items = []
    for keyword in keywords:
        # re.escape keeps characters such as '.' or '(' literal.
        # The lookarounds require that the term is not glued to a word
        # character on either side; for terms that start and end with a
        # word character this is the same as wrapping them in \b.
        term = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
        items.extend(re.findall(term, string, flags=re.IGNORECASE))

    return items

In [None]:
def convert_match_to_label(df_data, keywords):

    """
    Identify strings that match keywords in texts
    and convert to label if an instance includes any matched term

    df_data: input dataframe with a 'sentence' text column; it is modified
             in place (three columns are added) and also returned
    keywords: a list of terms used as keyword

    return: the same dataframe with added columns
            'sent_process' (text without '!', '?' and ','),
            'match' (list of matched terms) and
            'pred' (1 if any term matched, otherwise 0)

    """

    # 1. Remove punctuation from texts.
    # regex=True is required: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, so '[!?,]' would only match
    # that exact five-character sequence and nothing would be removed.
    df_data["sent_process"] = df_data["sentence"].str.replace('[!?,]', '', regex=True)

    # 2. Detect keyword terms in each text
    df_data["match"] = df_data["sent_process"].apply(lambda x: find_exact_match(x, keywords))

    # 3. Label each match: an empty match list means no keyword was found
    df_data["pred"] = df_data["match"].apply(lambda x: 1 if x else 0)

    return df_data

In [None]:
def evaluate_model(y_test, y_pred, record, eval_model=0):
    """
      Report model performance as a confusion matrix and a classification report

      y_test: true labels
      y_pred: predicted labels
      record: text file to save output (the same text is also printed on screen)
      eval_model: indicator if this function is on or off; nothing is
                  computed or printed when it is falsy

    """

    # Evaluation switched off: nothing to do
    if not eval_model:
        return

    def emit(content):
        # First to the record file, then to the screen
        # (print() falls back to standard output when file is None)
        for stream in (record, None):
            print(content, file=stream)

    emit('\nConfusion Matrix:\n')
    emit(confusion_matrix(y_test, y_pred))

    emit('\nClassification Report:\n')
    emit(classification_report(y_test, y_pred, digits=4))

# 3. Main Function

In [None]:
def main(input_file,
         colname,
         keywords,
         eval_on,
         match_file,
         result_file):

    """
       Main function for loading data, splitting, keyword matching, and evaluation

       input_file: input csv file
       colname: column selection passed to load_data ("title", "abs" or "mix")
       keywords: a list of terms used for keyword matching
       eval_on: indicator of model evaluation on or off
       match_file: name of csv file to save the matched/predicted test data
       result_file: name of text file the run log and evaluation are appended to

       The data is split 80/10/10 into train/validation/test with a fixed
       random state, stratified by label; only the test split is run
       through keyword matching.

    """

    ## 0. open result file for records
    # The context manager closes the file even when a later step raises;
    # utf-8 is set explicitly because the logged text is read as utf-8.
    with open(result_file, "a", encoding="utf-8") as f:

        def log(*args):
            # Write the same line to the result file and to the screen
            print(*args, file=f)
            print(*args)

        ## 1. Load data

        log("\n************** Loading Data **************\n")
        df = load_data(input_file, colname, record=f)

        # testing
        # Positional access: after null rows are dropped the first
        # remaining row does not necessarily carry index label 0.
        log("\n<First Sentence>\n{}".format(df.sentence.iloc[0]))

        ## 2. Train and test split

        log("\n************** Spliting Data **************\n")

        df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df.label)
        df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42, stratify=df_test.label)

        log("Train Data: {}".format(df_train.shape))
        log("Val Data: {}".format(df_val.shape))
        log("Test Data: {}".format(df_test.shape))

        # Class counts per split go to the result file only
        print('\nClass Counts(label, row): Train', file=f)
        print(df_train.label.value_counts(), file=f)
        print('\nClass Counts(label, row): Val', file=f)
        print(df_val.label.value_counts(), file=f)
        print('\nClass Counts(label, row): Test', file=f)
        print(df_test.label.value_counts(), file=f)

        log("\nTest Data: First Few Instances")
        log(df_test.head())

        # Reset index
        df_train = df_train.reset_index(drop=True)
        df_val = df_val.reset_index(drop=True)
        df_test = df_test.reset_index(drop=True)

        log("\n************** Processing Data **************")

        log("Test Data: {}".format(df_test.shape))
        print('\nClass Counts(label, row): Test', file=f)
        print(df_test.label.value_counts(), file=f)
        log("\nTest Data: First Few Instances")
        log(df_test.head())

        ## 3. Heuristic Method: keyword matching

        log("\n************** Heuristic Method: Keyword Match **************")

        df_matched = convert_match_to_label(df_test, keywords)

        print("Output Data: {}".format(df_matched.shape), file=f)
        print("\nOutput Data: {}".format(df_matched.shape))

        log("\nOutput Data: First Few Instances")
        log(df_matched.head())

        ## Save output
        df_matched.to_csv(match_file, encoding='utf-8', index=False, header=True)

        ## 4. Evaluating performance
        log("\n************** Evaluating performance **************")

        y_test = df_matched["label"]
        y_pred = df_matched["pred"]

        evaluate_model(y_test, y_pred, record=f, eval_model=eval_on)

        log("\nOutput file:'" + result_file + "' Created")

# 4. Run code for implementation


In [None]:
%%time

if __name__ == "__main__":

    ###### 1. Set Parameter Values ######

    #### 1-1. Input file name & which column
    input_filename = "rct_sample.csv"
    column_name = "mix"                                        # 'title' for title text; 'abs' for abstract; 'mix' for title + abstract

    #### 1-2. Evaluation applied?
    eval_on = 1                                                # 0 for no; 1 for yes (confusion matrix/classification report)

    #### 1-3. Term list for keyword matching
    # American and British spellings, singular and plural forms.
    # ('randomized clinical' was previously misspelled as 'radomized clinical'.)
    keyword_list = ['RCT', 'RCTs',
                    'randomized controlled trial', 'randomized controlled trials', 'randomised controlled trial', 'randomised controlled trials',
                    'randomized trial', 'randomized trials', 'randomised trial', 'randomised trials',
                    'randomized clinical trial', 'randomized clinical trials', 'randomised clinical trial', 'randomised clinical trials',
                    'randomized controlled', 'randomised controlled', 'randomized clinical', 'randomised clinical',
                    'randomized', 'randomised', 'clinical trial', 'clinical trials', 'controlled trial', 'controlled trials']

    ###### 2. Run Main Function ######
    # Output names carry the selected text column so runs do not overwrite each other
    output_file = "result_baseline_heuristic_" + column_name + ".csv"
    eval_file = "eval_baseline_heuristic_" + column_name + ".txt"

    main(input_file=input_filename,
         colname=column_name,
         keywords=keyword_list,
         eval_on=eval_on,
         match_file=output_file,
         result_file=eval_file
         )

    print("\n************** Processing Completed **************\n")


************** Loading Data **************

No of Rows (Raw data): 28776
No of Columns: 7
No of rows (After removing null): 26174
No of columns: 4

<Data View: First Few Instances>

        pmid                                           sentence  label
0  31887537  Outcomes based on age in the phase III METEOR ...      1
1  31886693  Values clarification and parental decision mak...      1
2  31883418  Avelumab plus axitinib vs sunitinib for advanc...      1
3  31882508  Reducing Radiation Dermatitis Using a Film-for...      1
4  31881877  The influence of a community intervention on i...      1

Class Counts(label, row): Total
1    13901
0    12273
Name: label, dtype: int64

<First Sentence>
Outcomes based on age in the phase III METEOR trial of cabozantinib versus everolimus in patients with advanced renal cell carcinoma. Cabozantinib improved progression-free survival (PFS), overall survival (OS) and objective response rate (ORR) compared with everolimus in patients with advanced r