### Κατηγοριοποίηση ιατρικών κειμένων

* Σκοπός είναι η κατηγοριοποίηση ιατρικών κειμένων (επιβλεπόμενη μάθηση) σε συγκεκριμένες κατηγορίες (Discharge Summary, Neurosurgery, ENT).

* Χρησιμοποιούμε N-grams για feature extraction αμέσως μετά την προεπεξεργασία των κειμένων του dataset.

### N-grams

* Είναι μια από τις πρώτες μεθοδολογίες εξαγωγής χαρακτηριστικών για κείμενα (προτάθηκε από τον Markov το 1913).

* Ένα N-gram είναι μια αλληλουχία N χαρακτήρων η οποία προέρχεται από ένα μεγαλύτερο αλφαριθμητικό (κείμενο).

* Το σκεπτικό είναι ότι αντί να χρησιμοποιείται ολόκληρη η συλλογή κειμένων (corpus) για προβλέψεις, ένα μοντέλο, μπορεί να κάνει ικανοποιητικές εκτιμήσεις χρησιμοποιώντας μόνο συνεχόμενες ακολουθίες N λέξεων.

* Για παράδειγμα, για την πρόταση: “The student is alone happily” έχουμε τα παρακάτω πιθανά n-grams:


| N-gram | Πρόταση μετά από N-gram feature extraction | Αριθμός χαρακτηριστικών N-gram |
|----------|----------|:----------:|
| Unigram 1-gram    | "The", "student", "is", "alone", "happily"     | 5     |
| Bigram 2-gram    | "The student", "student is", "is alone", "alone happily"     | 4     |
| Trigram 3-gram    | "The student is", "student is alone", "is alone happily"     | 3     |
| Quandrigram 4-gram    | "The student is alone", "student is alone happily"     | 2     |

In [1]:
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import re
import warnings
import matplotlib
import matplotlib.pyplot as plt
import warnings
import nltk
import os

warnings.filterwarnings('ignore')

random_state_number = 8888

### Αρχικοποίηση σημαντικών συναρτήσεων

In [2]:
def trim_dataframe(df):
    # Strip whitespace from the column names
    df.columns = df.columns.str.strip()
    
    # Remove duplicate rows from the DataFrame
    df = df.drop_duplicates()
    
    # Convert all column names to lowercase
    df.columns = df.columns.str.lower()
    
    # Replace spaces in the column names with underscores
    df.columns = df.columns.str.replace(' ','_')
    
    # Select columns that are of type 'object' (usually strings)
    df_obj = df.select_dtypes(['object'])
    
    # Strip whitespace from the beginning and end of strings in object-type columns
    df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())
    
    print("All column names have been stripped, made lowercase, and spaces replaced with underscores if any")
    print("Dropped duplicated instances if any")
    print("Categorical instances have been stripped")
    
    # Return the cleaned DataFrame
    return df


def value_counts(df, column, r=False):
    # Group by the specified column and calculate the count of each unique value
    vc_df = df.reset_index().groupby([column]).size().to_frame('count')
    
    # Calculate the percentage of each unique value relative to the total
    vc_df['percentage (%)'] = vc_df['count'].div(sum(vc_df['count'])).mul(100)
    
    # Sort the value counts by percentage in descending order
    vc_df = vc_df.sort_values(by=['percentage (%)'], ascending=False)
    
    # Check if the result should be returned or printed
    if r:
        # Return the DataFrame with value counts and percentages
        return vc_df
    else:
        # Print status message and display the DataFrame
        print(f'STATUS: Value counts of "{column}"...')
        display(vc_df)


def calculate_word_counts(df, category_column, text_column):
    """
    This function calculates the total word count for the text in one column of a DataFrame,
    grouped by unique categories in another column.

    Parameters:
    df (pandas.DataFrame): The DataFrame to process.
    category_column (str): The name of the column containing the categories to group by.
    text_column (str): The name of the column containing the text to count words in.

    Returns:
    pandas.DataFrame: A DataFrame with each category and its corresponding word count.
    """
    # Initialize empty lists to store the categories and their corresponding word counts
    category_list = []
    word_count_list = []

    # Loop through each unique category in the DataFrame
    for category in df[category_column].unique():
        # Filter the DataFrame for the current category
        df_filtered = df.loc[df[category_column] == category]
        
        # Calculate the total word count for the text column for the current category
        word_count = df_filtered[text_column].str.split().str.len().sum()
        
        # Append the current category and word count to their respective lists
        category_list.append(category)
        word_count_list.append(word_count)

    # Create a new DataFrame from the lists with columns for category and word count
    word_count_df = pd.DataFrame({category_column: category_list, 'Word Count': word_count_list})

    # Convert the 'Word Count' column to integers
    word_count_df['Word Count'] = word_count_df['Word Count'].astype('int')

    # Sort the DataFrame by 'Word Count' in descending order
    word_count_df = word_count_df.sort_values('Word Count', ascending=False)

    # Reset the index of the DataFrame without adding a new column
    word_count_df = word_count_df.reset_index(drop=True)

    # Return the resulting DataFrame
    return word_count_df


# To convert transcription into lowercase
def convert2lower(df, attribute):
    """
    Function made to convert text into lower-case
    """
    df.loc[:,attribute] = df[attribute].apply(lambda x : str.lower(x))
    return df


def remove_punc_num(df, attribute):
    # Remove punctuation from each entry in the specified attribute column
    df.loc[:, attribute] = df[attribute].apply(lambda x: " ".join(re.findall('[\w]+', x)))

    # Remove digits from each entry in the specified attribute column
    df[attribute] = df[attribute].str.replace('\d+', '')
    return df


def tokenise_data(df, attribute):
    # Instantiate the WhitespaceTokenizer
    tk = WhitespaceTokenizer()
    
    # Apply the tokenizer to each entry in the specified attribute column and create a new 'tokenised' column
    df['tokenised'] = df.apply(lambda row: tk.tokenize(str(row[attribute])), axis=1)
    # Explanation:
    # - df.apply(...) applies a function along the given axis (axis=1 applies the function to each row).
    # - lambda row: tk.tokenize(str(row[attribute])) defines a lambda function that converts the 
    #   value of the row's attribute to a string (to ensure compatibility with the tokenizer) and then
    #   applies the WhitespaceTokenizer to this string.
    # - The result, which is a list of tokens, is then assigned to the new 'tokenised' column in the DataFrame.

    # Return the modified DataFrame with the new 'tokenised' column
    return df


def stemming(df, attribute):
    """
    Apply stemming to all words in the specified column of the DataFrame.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the text data.
    - attribute (str): The name of the column with text to be stemmed.

    Returns:
    - pandas.DataFrame: The DataFrame with an additional column for stemmed words.
    """
    # Instantiate an English language Snowball stemmer
    stemmer = SnowballStemmer("english")
    
    # Apply stemming to each word in the specified column and create a new 'stemmed' column
    df['stemmed'] = df[attribute].apply(lambda x: [stemmer.stem(y) for y in x])
    # Return the DataFrame with the newly added 'stemmed' column
    return df


def remove_stop_words(df, attribute):
    """
    Remove stop words from the specified column of the DataFrame.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the text data.
    - attribute (str): The name of the column with text to be processed.

    Returns:
    - pandas.DataFrame: The DataFrame with an additional column excluding stop words.
    """
    # Load the list of English stop words
    stop = stopwords.words('english')
    
    # Create a new column 'stemmed_without_stop' excluding stop words from the specified column
    df['stemmed_without_stop'] = df[attribute].apply(
        lambda x: ' '.join([word for word in x if word not in (stop)])
    )
    # Explanation:
    # - df[attribute].apply(lambda x: ...) applies a function to each element in the column.
    # - The lambda function constructs a list comprehension that iterates over each word in the element 'x'.
    # - Each word is included in the resulting list only if it is not in the stop words list.
    # - ' '.join(...) joins the words in the list into a single string, with spaces in between.

    # Return the DataFrame with the newly added column 'stemmed_without_stop'
    return df


def flat_list(unflat_list):
    """
    Flatten a list of lists into a single list.

    Parameters:
    - unflat_list (list of list): A list where each item is itself a list.

    Returns:
    - list: A single list containing all the elements from the nested lists.
    """
    # Use a list comprehension to iterate through each sublist in the main list
    # and then iterate through each item in the sublist.
    # Each item is then added to the new list 'flatted'.
    flatted = [item for sublist in unflat_list for item in sublist]
    
    # Return the new flattened list
    return flatted


def to_list(df, attribute):
    """
    Convert a column of lists from a DataFrame into a single flat list.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the data.
    - attribute (str): The name of the column which contains lists to be flattened.

    Returns:
    - list: A single list containing all the elements from each list in the DataFrame's column.
    """
    # Select the column from the DataFrame and convert it to a new DataFrame 'df_transcription'
    df_transcription = df[[attribute]]
    
    # Convert the DataFrame into a list of lists 'unflat_list_transcription'
    unflat_list_transcription = df_transcription.values.tolist()
    # Explanation:
    # - The .values attribute returns the data in the DataFrame as an ndarray.
    # - The .tolist() method converts this ndarray into a list of lists.

    # Use the previously defined 'flat_list' function to flatten the list of lists
    flat_list_transcription = flat_list(unflat_list_transcription)
    
    # Return the flattened list
    return flat_list_transcription


def generate_n_gram_features(flat_list_transcription, n_gram_features):
    """
    Generate n-gram features from a flat list of text data.

    Parameters:
    - flat_list_transcription (list): A flat list of text data.

    Returns:
    - list: A list of feature vectors corresponding to the n-grams of the text data.
    """
    # Initialize an empty list to store the CountVectorizer feature vectors
    temp = []

    # Iterate over the items in the global 'n_gram_features' dictionary
    for key, values in n_gram_features.items():
        # Create a CountVectorizer object with the n-gram range specified by the current dictionary entry
        vectorizer = CountVectorizer(ngram_range=values)
        
        # Fit the CountVectorizer to the flat list of text data
        vectorizer.fit(flat_list_transcription)
        
        # Transform the text data into n-gram feature vectors and append to the 'temp' list
        temp.append(vectorizer.transform(flat_list_transcription))
    
    # Return the list of n-gram feature vectors
    return temp


def get_best_vector_clf(classifier_result):

    temp = classifier_result[classifier_result['Metric'] =='f1_macro']
    temp2 = temp.iloc[temp['Best CV Metric Score'].idxmax()].to_frame().T
    best_vector = temp2['Vector'].values[0]
    best_clf = temp2['Calibrated Estimator'].values[0]\
    
    return best_vector, best_clf


def shape(df,df_name):
    print(f'STATUS: Dimension of "{df_name}" = {df.shape}')

### Συνάρτηση αξιολόγησης μοντέλου μηχανικής μάθησης

In [3]:
def get_performance(param_grid, base_estimator, dataframes, label, metrics):
    # Initialize lists to collect various results
    df_name_list =[]; best_estimator_list=[]; best_score_list=[]; test_predict_result_list=[];
    metric_list = [];
    
    # Iterate over each feature vector dataframe
    for df_name, df in dataframes.items():
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(df, label, test_size=0.2, random_state=random_state_number)
        
        # Iterate over each evaluation metric
        for _, metric_dict in metrics.items():
            # Perform halving grid search with cross-validation
            sh = HalvingGridSearchCV(base_estimator, param_grid, cv=5, scoring=metric_dict[1],
                                     random_state=random_state_number, factor=2).fit(X_train, y_train)

            # Fit the best estimator to the training data
            best_estimator = sh.best_estimator_
            clf = best_estimator.fit(X_train, y_train)
            
            # Make predictions on the test set
            prediction = clf.predict(X_test)
            
            # Evaluate predictions using the current metric
            test_predict_result = metric_dict[0](y_test, prediction, average='macro')

            # Append results to the lists
            df_name_list.append(df_name)
            best_estimator_list.append(best_estimator)
            best_score_list.append(sh.best_score_)
            test_predict_result_list.append(test_predict_result)
            metric_list.append(metric_dict[1])
            
    # Create a DataFrame to summarize the model results
    model_result = pd.DataFrame({
        'Vector':df_name_list,
        'Metric':metric_list,
        'Calibrated Estimator':best_estimator_list,
        'Best CV Metric Score':best_score_list,
        'Test Predict Metric Score': test_predict_result_list
    })
    
    # Return the summary DataFrame
    return model_result

### Συνάρτηση οπτικοποίησης αποτελεσμάτων ταξινομητή

In [4]:
font = {'family' : 'Tahoma',
        'weight' : 'bold',
        'size'   : 12}
matplotlib.rc('font', **font)

def vis_classification(dataframes, label, vector_type = 'unigram', estimator = KNeighborsClassifier(n_neighbors=9)):
    pca = PCA(n_components=2)
    # Convert to numpy array with np.asarray
    df1 = pca.fit_transform(np.asarray(dataframes[vector_type].todense()))
    X_train, X_test, y_train, y_test = train_test_split(df1, label, test_size=0.2, random_state=random_state_number)
    
    
    # get training set
    df2 = pd.DataFrame({'pca1':X_train[:,1], 'pca2': X_train[:,0], 'y':le.inverse_transform(y_train)})
    min_1, max_1 = df2['pca1'].min(), df2['pca1'].max()
    min_2, max_2 = df2['pca2'].min(), df2['pca2'].max()
    
    # generate dimension reduced, but extended data
    pca1_range = np.linspace(min_1,max_1,30)
    pca2_range = np.linspace(min_2,max_2,30)
    
    # shuffle
    np.random.shuffle(pca1_range) ; np.random.shuffle(pca2_range)
    
    # to dataframe
    prediction_test = pd.DataFrame({'pca1':pca1_range, 'pca2':pca2_range})

    best_estimator = estimator
    
    # fit training set and predict extended data
    clf = best_estimator.fit(X_train, y_train)

    fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize=(15,6))
    cmap = plt.cm.get_cmap('tab10', 4)
    fig.suptitle(f"Visualising {type(estimator).__name__} on {vector_type.capitalize()} Vector", fontsize=14,fontweight='bold')


    def plot_scatter(ax, predictor_set, target, title):
        
        # plot area classifier
        clf = best_estimator.fit(X_train, y_train)
        axs[0].tricontourf(X_train[:,0], X_train[:,1], clf.predict(X_train), levels=np.arange(-0.5, 4), zorder=10, alpha=0.3, cmap=cmap, edgecolors="k")
        
        axs[1].tricontourf(X_test[:,0], X_test[:,1], clf.predict(X_test), levels=np.arange(-0.5, 4), zorder=10, alpha=0.3, cmap=cmap, edgecolors="k")
        
        # plot scatter
        df3 = pd.DataFrame({'pca1':predictor_set[:,1], 'pca2': predictor_set[:,0], 'y':le.inverse_transform(target)})
        for y_label in df3['y'].unique():
            df_filter = df3[df3['y']==y_label]
            ax.scatter(df_filter['pca1'], df_filter['pca2'], alpha=1,label=f"{y_label}")
        ax.legend(loc='upper left', bbox_to_anchor=(0, 1))
        ax.set_title(f'{title} ({predictor_set.shape[0]} Samples)',fontweight='bold')
    plot_scatter(axs[0], X_train, y_train, 'Training Set')
    plot_scatter(axs[1], X_test, y_test, 'Testing Set')
    axs[0].sharey(axs[1])
    return plt.show()

In [5]:
pd.set_option('display.max_colwidth', 255)
df =pd.read_csv('./data/kaggle_medical_transcriptions/mtsamples.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df = trim_dataframe(df)
df.head(3)

All column names have been stripped, made lowercase, and spaces replaced with underscores if any
Dropped duplicated instances if any
Categorical instances have been stripped


Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with complaint of allergies.,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time b...","allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic,"
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor. He exercises three times a week at home and does cardio. He has difficulty walking two b...","bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, body weight, laparoscopic gastric, weight loss, pounds, months, weight, laparoscopic, band, loss, diets, overweight, lost"
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 pounds. He is 5'9"". He has a BMI of 51. He has been overweight for ten years since the age of 33, at his highest he was 358 pounds, at hi...","bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complications, sleep study, weight loss, gastric bypass, anastomosis, loss, sleep, laparoscopic, gastric, bypass, heart, pounds, weight,"


In [6]:
shape(df,'df')

STATUS: Dimension of "df" = (4999, 5)


* Φιλτράρουμε το dataframe έτσι ώστε να μείνουν κείμενα που αφορούν σε 3 μόνο κατηγορίες (Neurosurgery, ENT - Otolaryngology, Discharge Summary)

In [7]:
df = df[df['medical_specialty'].isin(['Neurosurgery','ENT - Otolaryngology','Discharge Summary'])]
shape(df,'df')

STATUS: Dimension of "df" = (300, 5)


In [8]:
word_count_df = calculate_word_counts(df, 'medical_specialty', 'transcription')
word_count_df

Unnamed: 0,medical_specialty,Word Count
0,Neurosurgery,54233
1,Discharge Summary,43103
2,ENT - Otolaryngology,42032


* Μπορούμε να δούμε πόσες λέξεις έχουμε συνολικά για όλα τα κείμενα

In [9]:
total_word_count = df['transcription'].str.split().str.len().sum()
print(f'The word count of all transcriptions is: {int(total_word_count)}')

The word count of all transcriptions is: 139368


* Βλέπουμε πόσα κείμενα έχουμε σε κάθε μια από τις τρεις κατηγορίες

In [10]:
value_counts(df, 'medical_specialty')

STATUS: Value counts of "medical_specialty"...


Unnamed: 0_level_0,count,percentage (%)
medical_specialty,Unnamed: 1_level_1,Unnamed: 2_level_1
Discharge Summary,108,36.0
ENT - Otolaryngology,98,32.666667
Neurosurgery,94,31.333333


In [11]:
# to print data shape
print(f'data shape is: {df.shape}')

# to identify the null values by descending order
df.isnull().sum().sort_values(ascending = False)

data shape is: (300, 5)


keywords             56
transcription         2
description           0
medical_specialty     0
sample_name           0
dtype: int64

* Αφαιρούμε τις γραμμές που είναι κενές (με NaN τιμές)

In [12]:
# to remove transcription rows that is empty
df = df[df['transcription'].notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298 entries, 2656 to 3994
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   description        298 non-null    object
 1   medical_specialty  298 non-null    object
 2   sample_name        298 non-null    object
 3   transcription      298 non-null    object
 4   keywords           242 non-null    object
dtypes: object(5)
memory usage: 14.0+ KB


* Αφαιρούμε στήλες που δεν χρειάζονται για την μελέτη

In [13]:
df =df.drop(['description','sample_name','keywords'], axis=1)
df.head(2)

Unnamed: 0,medical_specialty,transcription
2656,Neurosurgery,"TITLE OF OPERATION:, A complex closure and debridement of wound.,INDICATION FOR SURGERY:, The patient is a 26-year-old female with a long history of shunt and hydrocephalus presenting with a draining wound in the right upper quadrant, just below the..."
2657,Neurosurgery,"TITLE OF OPERATION: , Placement of right new ventriculoperitoneal (VP) shunts Strata valve and to removal of right frontal Ommaya reservoir.,INDICATION FOR SURGERY: , The patient is a 2-month-old infant, born premature with intraventricular hemorrhage..."


* Ας δούμε ξανά το shape του dataframe

In [14]:
shape(df,'df')

STATUS: Dimension of "df" = (298, 2)


In [15]:
df = convert2lower(df,'transcription')
df.head(3)

Unnamed: 0,medical_specialty,transcription
2656,Neurosurgery,"title of operation:, a complex closure and debridement of wound.,indication for surgery:, the patient is a 26-year-old female with a long history of shunt and hydrocephalus presenting with a draining wound in the right upper quadrant, just below the..."
2657,Neurosurgery,"title of operation: , placement of right new ventriculoperitoneal (vp) shunts strata valve and to removal of right frontal ommaya reservoir.,indication for surgery: , the patient is a 2-month-old infant, born premature with intraventricular hemorrhage..."
2658,Neurosurgery,"preoperative diagnosis: , aqueductal stenosis.,postoperative diagnosis:, aqueductal stenosis.,title of procedure: ,endoscopic third ventriculostomy.,anesthesia: , general endotracheal tube anesthesia.,devices:, bactiseal ventricular catheter with a..."


* Αφαιρούμε σημεία στίξης και αριθμούς

In [16]:
df =remove_punc_num(df, 'transcription')
df_no_punc =df.copy()
df.head(3)

Unnamed: 0,medical_specialty,transcription
2656,Neurosurgery,title of operation a complex closure and debridement of wound indication for surgery the patient is a year old female with a long history of shunt and hydrocephalus presenting with a draining wound in the right upper quadrant just below the costal ma...
2657,Neurosurgery,title of operation placement of right new ventriculoperitoneal vp shunts strata valve and to removal of right frontal ommaya reservoir indication for surgery the patient is a month old infant born premature with intraventricular hemorrhage and ommaya...
2658,Neurosurgery,preoperative diagnosis aqueductal stenosis postoperative diagnosis aqueductal stenosis title of procedure endoscopic third ventriculostomy anesthesia general endotracheal tube anesthesia devices bactiseal ventricular catheter with an aesculap burr hol...


* Το whitespace tokenization μια απλή μέθοδος μέσω της οποίας μπορούμε να "σπάσουμε" ένα string σε μικρότερα μέρη (components). Το χώρισμα βασίζεται σε χαρακτήρες τύπου whitespace (πρόκεται για οποιονδήποτε χαρακτήρα μπορεί να χρησιμοποιηθεί έτσι ώστε να χωρίσει οπτικά λέξεις).

* Π.χ: το κείμενο: "The quick brown fox jumps", χωρίζεται στα εξής tokens: "The", "quick", "brown", "fox", and "jumps".

* Είναι συνήθως ένα από τα πρώτα βήματα στην προεπεξεργασία κειμένου.

In [17]:
df =tokenise_data(df, 'transcription')
df_experiment =df.copy()
df.head(3)

Unnamed: 0,medical_specialty,transcription,tokenised
2656,Neurosurgery,title of operation a complex closure and debridement of wound indication for surgery the patient is a year old female with a long history of shunt and hydrocephalus presenting with a draining wound in the right upper quadrant just below the costal ma...,"[title, of, operation, a, complex, closure, and, debridement, of, wound, indication, for, surgery, the, patient, is, a, year, old, female, with, a, long, history, of, shunt, and, hydrocephalus, presenting, with, a, draining, wound, in, the, right, upp..."
2657,Neurosurgery,title of operation placement of right new ventriculoperitoneal vp shunts strata valve and to removal of right frontal ommaya reservoir indication for surgery the patient is a month old infant born premature with intraventricular hemorrhage and ommaya...,"[title, of, operation, placement, of, right, new, ventriculoperitoneal, vp, shunts, strata, valve, and, to, removal, of, right, frontal, ommaya, reservoir, indication, for, surgery, the, patient, is, a, month, old, infant, born, premature, with, intra..."
2658,Neurosurgery,preoperative diagnosis aqueductal stenosis postoperative diagnosis aqueductal stenosis title of procedure endoscopic third ventriculostomy anesthesia general endotracheal tube anesthesia devices bactiseal ventricular catheter with an aesculap burr hol...,"[preoperative, diagnosis, aqueductal, stenosis, postoperative, diagnosis, aqueductal, stenosis, title, of, procedure, endoscopic, third, ventriculostomy, anesthesia, general, endotracheal, tube, anesthesia, devices, bactiseal, ventricular, catheter, w..."


* Το stemming είναι μια διαδικασία κατά την οποία "κόβονται" μέρη από λέξεις έτσι ώστε αυτές να φτάσουν στην γονική μορφή τους (root form).

* Η "γονική" μορφή μιας λέξης (stem) εκπροσωπεί το βασικό της νόημα. Η ιδέα είναι ότι διαφορετικές μορφές της ίδιας "γονικής" λέξης, πρέπει να θεωρούνται ισότιμες όσον αφορά στο νόημα τους.

* Προσοχή: είναι ευριστική διαδικασία.

* Είναι χρήσιμο διότι συμβάλλει στην απλοποίηση των δεδομένων.

In [18]:
df =stemming(df_experiment, 'tokenised')
df.head(2)

Unnamed: 0,medical_specialty,transcription,tokenised,stemmed
2656,Neurosurgery,title of operation a complex closure and debridement of wound indication for surgery the patient is a year old female with a long history of shunt and hydrocephalus presenting with a draining wound in the right upper quadrant just below the costal ma...,"[title, of, operation, a, complex, closure, and, debridement, of, wound, indication, for, surgery, the, patient, is, a, year, old, female, with, a, long, history, of, shunt, and, hydrocephalus, presenting, with, a, draining, wound, in, the, right, upp...","[titl, of, oper, a, complex, closur, and, debrid, of, wound, indic, for, surgeri, the, patient, is, a, year, old, femal, with, a, long, histori, of, shunt, and, hydrocephalus, present, with, a, drain, wound, in, the, right, upper, quadrant, just, belo..."
2657,Neurosurgery,title of operation placement of right new ventriculoperitoneal vp shunts strata valve and to removal of right frontal ommaya reservoir indication for surgery the patient is a month old infant born premature with intraventricular hemorrhage and ommaya...,"[title, of, operation, placement, of, right, new, ventriculoperitoneal, vp, shunts, strata, valve, and, to, removal, of, right, frontal, ommaya, reservoir, indication, for, surgery, the, patient, is, a, month, old, infant, born, premature, with, intra...","[titl, of, oper, placement, of, right, new, ventriculoperiton, vp, shunt, strata, valv, and, to, remov, of, right, frontal, ommaya, reservoir, indic, for, surgeri, the, patient, is, a, month, old, infant, born, prematur, with, intraventricular, hemorr..."


* Βλέπουμε ενδοιάμεσες λέξεις της αγγλικής γλώσσας (english stop words)

In [19]:
# Showing the list of the English stop words, it has a number of 179 stop words in this list
# nltk.download('stopwords') #must run this code line too the first time the notebook is used
stop = stopwords.words('english')
print(f"There are {len(stop)} stop words \n")
print(stop)

There are 179 stop words 

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'ow

* Αφαιρούμε τις ενδοιάμεσες λέξεις από το κείμενο (stop words).

* Αφαιρώντας τις ενδοιάμεσες λέξεις μένουν στο κείμενο μόνο οι λέξεις που έχουν σημαντικό νόημα για το πρόβλημα που καλούμαστε να επιλύσουμε.

In [20]:
df = remove_stop_words(df, 'stemmed')
df.head(2)

Unnamed: 0,medical_specialty,transcription,tokenised,stemmed,stemmed_without_stop
2656,Neurosurgery,title of operation a complex closure and debridement of wound indication for surgery the patient is a year old female with a long history of shunt and hydrocephalus presenting with a draining wound in the right upper quadrant just below the costal ma...,"[title, of, operation, a, complex, closure, and, debridement, of, wound, indication, for, surgery, the, patient, is, a, year, old, female, with, a, long, history, of, shunt, and, hydrocephalus, presenting, with, a, draining, wound, in, the, right, upp...","[titl, of, oper, a, complex, closur, and, debrid, of, wound, indic, for, surgeri, the, patient, is, a, year, old, femal, with, a, long, histori, of, shunt, and, hydrocephalus, present, with, a, drain, wound, in, the, right, upper, quadrant, just, belo...",titl oper complex closur debrid wound indic surgeri patient year old femal long histori shunt hydrocephalus present drain wound right upper quadrant costal margin lanc general surgeri resolv howev continu drain evid fever crp normal shunt ct normal th...
2657,Neurosurgery,title of operation placement of right new ventriculoperitoneal vp shunts strata valve and to removal of right frontal ommaya reservoir indication for surgery the patient is a month old infant born premature with intraventricular hemorrhage and ommaya...,"[title, of, operation, placement, of, right, new, ventriculoperitoneal, vp, shunts, strata, valve, and, to, removal, of, right, frontal, ommaya, reservoir, indication, for, surgery, the, patient, is, a, month, old, infant, born, premature, with, intra...","[titl, of, oper, placement, of, right, new, ventriculoperiton, vp, shunt, strata, valv, and, to, remov, of, right, frontal, ommaya, reservoir, indic, for, surgeri, the, patient, is, a, month, old, infant, born, prematur, with, intraventricular, hemorr...",titl oper placement right new ventriculoperiton vp shunt strata valv remov right frontal ommaya reservoir indic surgeri patient month old infant born prematur intraventricular hemorrhag ommaya reservoir recommend remov replac new vp shunt preop diagno...


* Αφαιρούμε όλες τις στήλες που δεν χρειάζονται για την ανάλυση

In [21]:
df =df.drop(['transcription','stemmed', 'tokenised'], axis=1)
df.head()

Unnamed: 0,medical_specialty,stemmed_without_stop
2656,Neurosurgery,titl oper complex closur debrid wound indic surgeri patient year old femal long histori shunt hydrocephalus present drain wound right upper quadrant costal margin lanc general surgeri resolv howev continu drain evid fever crp normal shunt ct normal th...
2657,Neurosurgery,titl oper placement right new ventriculoperiton vp shunt strata valv remov right frontal ommaya reservoir indic surgeri patient month old infant born prematur intraventricular hemorrhag ommaya reservoir recommend remov replac new vp shunt preop diagno...
2658,Neurosurgery,preoper diagnosi aqueduct stenosi postop diagnosi aqueduct stenosi titl procedur endoscop third ventriculostomi anesthesia general endotrach tube anesthesia devic bactis ventricular cathet aesculap burr hole port skin prepar chloraprep complic none sp...
2661,Neurosurgery,procedur placement left ventriculostomi via twist drill preoper diagnosi massiv intraventricular hemorrhag hydrocephalus increas intracrani pressur postop diagnosi massiv intraventricular hemorrhag hydrocephalus increas intracrani pressur indic proced...
2662,Neurosurgery,preoper diagnos increas intracrani pressur cerebr edema due sever brain injuri postop diagnos increas intracrani pressur cerebr edema due sever brain injuri procedur burr hole insert extern ventricular drain cathet anesthesia bedsid sedat procedur sca...


* Βλέπουμε πόσες λέξεις έμειναν στα κείμενα του dataset μετά την επεξεργασία.

In [22]:
total_word_count_normalised = df['stemmed_without_stop'].str.split().str.len().sum()
print(f'The word count of transcription after normalised is: {int(total_word_count_normalised)}')
print(f'{round((total_word_count - total_word_count_normalised)/total_word_count*100, 2)}% less words')

The word count of transcription after normalised is: 83160
40.33% less words


* Δημιουργούμε μια αριθμητική μεταβλητή εξόδου (target variable) από την στήλη "medical_specialty"

In [23]:
# Instantiate a LabelEncoder object
le = preprocessing.LabelEncoder()

# Fit the LabelEncoder to the 'medical_specialty' column of the DataFrame
le.fit(df['medical_specialty'])
# Explanation:
# - The fit method of the LabelEncoder finds all unique labels (categories) in the 'medical_specialty' column,
#   and assigns a unique integer to each label.

# Transform the 'medical_specialty' labels into integers and store them in a new column 'encoded_target'
df['encoded_target'] = le.transform(df['medical_specialty'])
# Explanation:
# - The transform method then takes the 'medical_specialty' column and replaces each label with the integer
#   that was assigned to it during the fit method.
df.head()

Unnamed: 0,medical_specialty,stemmed_without_stop,encoded_target
2656,Neurosurgery,titl oper complex closur debrid wound indic surgeri patient year old femal long histori shunt hydrocephalus present drain wound right upper quadrant costal margin lanc general surgeri resolv howev continu drain evid fever crp normal shunt ct normal th...,2
2657,Neurosurgery,titl oper placement right new ventriculoperiton vp shunt strata valv remov right frontal ommaya reservoir indic surgeri patient month old infant born prematur intraventricular hemorrhag ommaya reservoir recommend remov replac new vp shunt preop diagno...,2
2658,Neurosurgery,preoper diagnosi aqueduct stenosi postop diagnosi aqueduct stenosi titl procedur endoscop third ventriculostomi anesthesia general endotrach tube anesthesia devic bactis ventricular cathet aesculap burr hole port skin prepar chloraprep complic none sp...,2
2661,Neurosurgery,procedur placement left ventriculostomi via twist drill preoper diagnosi massiv intraventricular hemorrhag hydrocephalus increas intracrani pressur postop diagnosi massiv intraventricular hemorrhag hydrocephalus increas intracrani pressur indic proced...,2
2662,Neurosurgery,preoper diagnos increas intracrani pressur cerebr edema due sever brain injuri postop diagnos increas intracrani pressur cerebr edema due sever brain injuri procedur burr hole insert extern ventricular drain cathet anesthesia bedsid sedat procedur sca...,2


* Δημιουργούμε μια λίστα με τα επεξεργασμένα κείμενα (ένα corpus).

In [24]:
flat_list_transcription = to_list(df, 'stemmed_without_stop')
flat_list_transcription

['titl oper complex closur debrid wound indic surgeri patient year old femal long histori shunt hydrocephalus present drain wound right upper quadrant costal margin lanc general surgeri resolv howev continu drain evid fever crp normal shunt ct normal thought insidi fistula versus tract recommend excis tract preop diagnosi possibl cerebrospin fluid versus wound fistula postop diagnosi possibl cerebrospin fluid versus wound fistula procedur detail patient brought oper room induct laryng mask airway posit supin right side prep drape usual steril fashion next work fistula ellipt excis onc excis follow fistul tract complet remov csf drainag cathet visual although adequ proper onc excis irrig close multipl layer use vicryl deep layer caprosyn indermil dri steril dress appli patient revers extub transfer recoveri room stabl condit multipl cultur sent well tract sent patholog spong needl count correct',
 'titl oper placement right new ventriculoperiton vp shunt strata valv remov right frontal 

In [25]:
# Define a dictionary with n-gram feature configurations.
n_gram_features = {
    'unigram': (1, 1),            # Single words
    'unigram_bigram': (1, 2),     # Single words and pairs of consecutive words
    'bigram': (2, 2),             # Pairs of consecutive words
    'bigram_trigram': (2, 3),     # Pairs and triples of consecutive words
    'trigram': (3, 3)             # Triples of consecutive words
}

# feature_name = []  # Initialize an empty list to hold feature names.
# feats = []          # Initialize a temporary list.

# # Iterate over the dictionary items.
# for key, values in n_gram_features.items():
#     feats.append(key)             # Append the key (feature name) to the temp list.
#     feature_name.append(key)     # Append the key (feature name) to the feature_name list.

# feats  # Print the temp list, which now contains the keys of the n_gram_features dictionary.

In [26]:
n_gram_feats = generate_n_gram_features(flat_list_transcription, n_gram_features)

In [27]:
# Assuming n_gram_feats is a list of sparse matrix representations of n-gram features,
# where each entry in the list corresponds to a different type of n-gram feature.
dataframes = {
    'unigram': n_gram_feats[0], 
    'unigram_bigram': n_gram_feats[1], 
    'bigram': n_gram_feats[2], 
    'bigram_trigram': n_gram_feats[3], 
    'trigram': n_gram_feats[4]
}

# Initialize empty lists to store the names of the feature vectors and their shapes.
feature_vector = [] 
feature_vector_shape = []

# Iterate over the items in the 'dataframes' dictionary.
for key in dataframes:
    # Append the name of the feature vector (key) to the 'feature_vector' list.
    feature_vector.append(key)
    # Append the shape of the feature vector to the 'feature_vector_shape' list.
    feature_vector_shape.append(dataframes[key].shape)

# Create a DataFrame that summarizes the feature vectors and their shapes.
n_gram_df = pd.DataFrame({
    'N-Gram Feature Vector': feature_vector, 
    'Data Dimension': feature_vector_shape
})

# Display the DataFrame
n_gram_df

Unnamed: 0,N-Gram Feature Vector,Data Dimension
0,unigram,"(298, 5604)"
1,unigram_bigram,"(298, 54038)"
2,bigram,"(298, 48434)"
3,bigram_trigram,"(298, 115329)"
4,trigram,"(298, 66895)"


In [28]:
n_gram_df[0]

KeyError: 0

* Προσοχή: για τα unigrams το μέγεθος του διανύσματος ισούται με τον αριθμό μοναδικών λέξεων (tokenized).

* Για τα bigrams και τα trigrams, το μέγεθος αντιπροσωπεύει συνδυασμούς λέξεων.


In [32]:
# to retrieve a unigram feature vector
dataframes['unigram'].todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
df_target =df[['encoded_target']].values.ravel()
df_target

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [34]:
metrics = {
    'f1':[f1_score, 'f1_macro'], 
    'precision': [precision_score, 'precision_macro'], 
    'recall': [recall_score, 'recall_macro']
}

param_grid = {'max_depth': [None,30,32,35,37,38,39,40],'min_samples_split': [2,150,170,180,190,200]}
base_estimator = RandomForestClassifier(random_state=random_state_number)
rfc_result = get_performance(param_grid, base_estimator, dataframes, df_target, metrics)
rfc_result

KeyboardInterrupt: 

In [None]:
# konlazaros@gmail.com

In [None]:
best_vector, best_clf =  get_best_vector_clf(rfc_result)


In [None]:
vis_classification(dataframes, df_target, vector_type = best_vector, estimator = best_clf)

In [None]:
from matplotlib.lines import Line2D
from sklearn import preprocessing

# Ensure you have defined these variables outside this function:
# random_state_number, le (LabelEncoder instance)

def vis_classification2(dataframes, label, vector_type='unigram', estimator=KNeighborsClassifier(n_neighbors=9)):
    pca = PCA(n_components=2)
    df1 = pca.fit_transform(np.asarray(dataframes[vector_type].todense()))
    X_train, X_test, y_train, y_test = train_test_split(df1, label, test_size=0.2, random_state=random_state_number)

    clf = estimator.fit(X_train, y_train)
    
    # Create a mesh grid for plotting decision boundaries
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    # Predict on the mesh grid
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plotting
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    cmap = plt.cm.get_cmap('tab10', np.unique(y_train).size)  # Define the colormap

    # Plot decision boundaries on both subplots
    axs[0].contourf(xx, yy, Z, alpha=0.3, cmap=cmap)
    axs[1].contourf(xx, yy, Z, alpha=0.3, cmap=cmap)

    # Plot training set points
    for i, class_value in enumerate(np.unique(y_train)):
        ix = np.where(y_train == class_value)
        axs[0].scatter(X_train[ix, 0], X_train[ix, 1], c=[cmap(i)], label=str(le.inverse_transform([class_value])[0]))

    # Plot testing set points
    for i, class_value in enumerate(np.unique(y_test)):
        ix = np.where(y_test == class_value)
        axs[1].scatter(X_test[ix, 0], X_test[ix, 1], c=[cmap(i)], label=str(le.inverse_transform([class_value])[0]))

    # Create legends from custom handles
    handles = [Line2D([0], [0], marker='o', color=cmap(i), linestyle='', label=str(le.inverse_transform([i])[0]))
               for i in range(np.unique(y_train).size)]
    axs[0].legend(handles=handles, loc='best')
    axs[1].legend(handles=handles, loc='best')

    # Set titles for subplots
    axs[0].set_title('Training Set')
    axs[1].set_title('Testing Set')
    fig.suptitle(f'Visualising {type(estimator).__name__} on {vector_type.capitalize()} Vector', fontsize=14, fontweight='bold')

    # Show plot
    plt.show()

In [None]:
vis_classification2(dataframes, df_target, vector_type = best_vector, estimator = best_clf)