## <center>Text Representations</center>

In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec
import pandas as pd

### Importing data and preprocessing

For this task, the folowing dataset was used: Medical Text Dataset -Cancer Doc Classification (https://www.kaggle.com/datasets/falgunipatel19/biomedical-text-publication-classification).
This dataset comprises 7569 publications, categorized into three distinct classes: colon cancer, lung cancer, and thyroid cancer.

In [5]:
# Load dataset
df = pd.read_csv('medical_text.csv', encoding='ISO-8859-1')

In [6]:
df.columns

Index(['Unnamed: 0', '0', 'a'], dtype='object')

In [7]:
df.isnull().sum()

Unnamed: 0    0
0             0
a             0
dtype: int64

In [8]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,0,a
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
5,5,Thyroid_Cancer,This study was performed to explore the effec...
6,6,Thyroid_Cancer,This study was performed assess the clinical ...
7,7,Thyroid_Cancer,Journal of International Medical Research  Th...
8,8,Thyroid_Cancer,Gastric cancer GC persists as a worldwide pub...
9,9,Thyroid_Cancer,Scars Burns HealingVolume  reuse guideli...


In [9]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df = df.rename(columns={'0': 'label', 'a': 'text'})

df.head(10)

Unnamed: 0,label,text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
5,Thyroid_Cancer,This study was performed to explore the effec...
6,Thyroid_Cancer,This study was performed assess the clinical ...
7,Thyroid_Cancer,Journal of International Medical Research  Th...
8,Thyroid_Cancer,Gastric cancer GC persists as a worldwide pub...
9,Thyroid_Cancer,Scars Burns HealingVolume  reuse guideli...


In [10]:
# Encode labels
df['label'] = LabelEncoder().fit_transform(df['label'])

In [11]:
df.head(10)

Unnamed: 0,label,text
0,2,Thyroid surgery in children in a single insti...
1,2,""" The adopted strategy was the same as that us..."
2,2,coronary arterybypass grafting thrombosis ï¬b...
3,2,Solitary plasmacytoma SP of the skull is an u...
4,2,This study aimed to investigate serum matrix ...
5,2,This study was performed to explore the effec...
6,2,This study was performed assess the clinical ...
7,2,Journal of International Medical Research  Th...
8,2,Gastric cancer GC persists as a worldwide pub...
9,2,Scars Burns HealingVolume  reuse guideli...


In [12]:
value_counts = df['label'].value_counts()

print(value_counts)

label
2    2810
0    2580
1    2180
Name: count, dtype: int64


### Bag of Words

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [14]:
vectorizer_bow = CountVectorizer()

X_train_bow = vectorizer_bow.fit_transform(X_train).toarray()
X_test_bow = vectorizer_bow.transform(X_test).toarray()

In [15]:
clf_bow = MultinomialNB()
clf_bow.fit(X_train_bow, y_train)

y_pred_bow = clf_bow.predict(X_test_bow)

In [16]:
bow_acc = accuracy_score(y_test, y_pred_bow)

### TF-IDF

In [17]:
vectorizer_tfidf = TfidfVectorizer()

X_train_tfidf = vectorizer_tfidf.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer_tfidf.transform(X_test).toarray()

In [18]:
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [19]:
tf_idf_acc = accuracy_score(y_test, y_pred_tfidf)

### Word2Vec

In [20]:
sentences = [text.split() for text in X_train]

model_sg = Word2Vec(sentences, vector_size=100, window=5, sg=1, min_count=1) # skip-gram
model_cbow = Word2Vec(sentences, vector_size=100, window=5, sg=0, min_count=1) #cbow

In [21]:
# Function to get the average word vector for each sentence
def get_avg_vector(text, model):
    words = text.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

In [22]:
X_train_sg = np.array([get_avg_vector(text, model_sg) for text in X_train])
X_test_sg = np.array([get_avg_vector(text, model_sg) for text in X_test])

X_train_cbow = np.array([get_avg_vector(text, model_cbow) for text in X_train])
X_test_cbow = np.array([get_avg_vector(text, model_cbow) for text in X_test])

In [23]:
log_reg_model = LogisticRegression(max_iter=100)

In [24]:
log_reg_model.fit(X_train_sg, y_train)
y_pred_sg = log_reg_model.predict(X_test_sg)

accuracy_sg = accuracy_score(y_test, y_pred_sg)

In [25]:
log_reg_model.fit(X_train_cbow, y_train)
y_pred_cbow = log_reg_model.predict(X_test_cbow)

accuracy_cbow = accuracy_score(y_test, y_pred_cbow)

### Accuracies

In [26]:
print(f"BoW Accuracy: {(bow_acc):.4f}")
print(f"TF-IDF Accuracy: {(tf_idf_acc):.4f}")
print(f"Word2Vec Skip-Gram Accuracy: {accuracy_sg}")
print(f"Word2Vec CBOW Accuracy: {accuracy_cbow}")

BoW Accuracy: 0.9320
TF-IDF Accuracy: 0.9254
Word2Vec Skip-Gram Accuracy: 1.0
Word2Vec CBOW Accuracy: 1.0
