In [30]:
import os
import re
import nltk
from sklearn.svm import SVC
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JonathanChackoPattas\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JonathanChackoPattas\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
script_dir = os.getcwd()

# === Load labeled dataset ===
labeled_dir = os.path.join(
    script_dir,
    # "ignore",
    "dataset",
    "labeled_dataset"
)
labeled_corpus_raw = []

for file in os.listdir(labeled_dir):
    if file.endswith(".txt"):
        with open(os.path.join(labeled_dir, file), 'r', encoding='utf-8') as f:
            labeled_corpus_raw.extend(f.readlines())

# Clean up (strip whitespace, remove empty lines)
labeled_corpus_raw = [line.strip() for line in labeled_corpus_raw if line.strip()]

# === Load unlabeled dataset ===
unlabeled_dir = os.path.join(
    script_dir,
    # "ignore",
    "dataset",
    "unlabeled_dataset"
)
unlabeled_corpus_raw = []

for file in os.listdir(unlabeled_dir):
    if file.endswith(".txt"):
        with open(os.path.join(unlabeled_dir, file), 'r', encoding='utf-8') as f:
            unlabeled_corpus_raw.extend(f.readlines())

# Clean unlabeled corpus as well
unlabeled_corpus_raw = [line.strip() for line in unlabeled_corpus_raw if line.strip()]

# Preview
labeled_corpus_raw[:5]

['### abstract ###',
 'MISC\talthough the internet as level topology has been extensively studied over the past few years  little is known about the details of the as taxonomy',
 'MISC\tan as  node  can represent a wide variety of organizations  e g   large isp  or small private business  university  with vastly different network characteristics  external connectivity patterns  network growth tendencies  and other properties that we can hardly neglect while working on veracious internet representations in simulation environments',
 'AIMX\tin this paper  we introduce a radically new approach based on machine learning techniques to map all the ases in the internet into a natural as taxonomy',
 'OWNX\twe successfully classify  NUMBER   NUMBER  percent  of ases with expected accuracy of  NUMBER   NUMBER  percent']

In [32]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'###.*?###', '', text, flags=re.IGNORECASE) # Remove ### headings | ALT => r'### (abstract|introduction) ###'
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    words = word_tokenize(text.lower()) # Tokenization and lowercasing
    cleaned = [word for word in words if word not in stop_words] # Remove stopwords
    return ' '.join(cleaned).strip() # Return cleaned text

# === Clean data ===
labels = []
labeled_sentences = []
for line in labeled_corpus_raw:
    if '\t' in line:
        label, sentence = line.split('\t', 1)
        cleaned_sentence = preprocess_text(sentence)
        if cleaned_sentence:  # Only keep if non-empty
            labels.append(label.strip())
            labeled_sentences.append(cleaned_sentence)
# labeled_sentences = [preprocess_text(line.split('\t', 1)[1]) for line in labeled_corpus_raw if '\t' in line]


# === Clean unlabeled data ===
unlabeled_sentences = []
for raw_line in unlabeled_corpus_raw:
    processed = preprocess_text(raw_line)
    if processed:  # Only include non-empty results
        unlabeled_sentences.append(processed)
# unlabeled_sentences = [preprocess_text(line) for line in unlabeled_corpus_raw]

# Preview
labeled_sentences

['although internet level topology extensively studied past years little known details taxonomy',
 'node represent wide variety organizations e g large isp small private business university vastly different network characteristics external connectivity patterns network growth tendencies properties hardly neglect working veracious internet representations simulation environments',
 'paper introduce radically new approach based machine learning techniques map ases internet natural taxonomy',
 'successfully classify number number percent ases expected accuracy number number percent',
 'release community level topology dataset augmented number taxonomy information number set attributes used classify ases',
 'believe dataset serve invaluable addition understanding structure evolution internet',
 'rapid expansion internet last two decades produced large scale system thousands diverse independently managed networks collectively provide global connectivity across wide spectrum geopolitical env

In [33]:
# Combine both labeled and unlabeled sentences to build the vocabulary
combined_sentences = labeled_sentences + unlabeled_sentences

vectorizer = TfidfVectorizer()
vectorizer.fit(combined_sentences)

# Transform labeled and unlabeled sets separately
X_labeled = vectorizer.transform(labeled_sentences)
X_unlabeled = vectorizer.transform(unlabeled_sentences)

# View label distribution
label_counts = Counter(labels)
label_counts

Counter({'MISC': 65, 'OWNX': 59, 'CONT': 27, 'AIMX': 12, 'BASE': 7})

In [34]:
# Split using the updated TF-IDF matrix
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, labels, test_size=0.2, random_state=42, stratify=labels
)

# Initialize and train Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Evaluation labels (as per assignment spec)
label_order = ['AIMX', 'OWNX', 'CONT', 'BASE', 'MISC']

# Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=label_order, zero_division=0))

Accuracy: 0.6470588235294118

Classification Report:
               precision    recall  f1-score   support

        AIMX       0.00      0.00      0.00         2
        OWNX       0.52      0.92      0.67        12
        CONT       0.00      0.00      0.00         6
        BASE       0.00      0.00      0.00         1
        MISC       0.85      0.85      0.85        13

    accuracy                           0.65        34
   macro avg       0.27      0.35      0.30        34
weighted avg       0.51      0.65      0.56        34



In [35]:
log_reg_report = classification_report(
    y_test, 
    y_pred,
    labels = label_order,
    # zero_division=0  # Avoid warning when a class is missing in predictions
) 
print(log_reg_report)

              precision    recall  f1-score   support

        AIMX       0.00      0.00      0.00         2
        OWNX       0.52      0.92      0.67        12
        CONT       0.00      0.00      0.00         6
        BASE       0.00      0.00      0.00         1
        MISC       0.85      0.85      0.85        13

    accuracy                           0.65        34
   macro avg       0.27      0.35      0.30        34
weighted avg       0.51      0.65      0.56        34



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
# Initialize the SVM classifier
svm = SVC() # kernel='linear'

# Create the decision tree.
tree = DecisionTreeClassifier() # random_state=42

# Fit classifiers
svm.fit(X_train, y_train)
tree.fit(X_train, y_train)
# Predict
svm_pred = svm.predict(X_test)
tree_pred = tree.predict(X_test)

#  Evaluate SVM
print("SVM Classification Report:")
print(classification_report(y_test, svm_pred, labels=label_order, zero_division=0))

# Evaluate Decision Tree
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, tree_pred, labels=label_order, zero_division=0))

SVM Classification Report:
              precision    recall  f1-score   support

        AIMX       1.00      0.50      0.67         2
        OWNX       0.55      0.92      0.69        12
        CONT       0.00      0.00      0.00         6
        BASE       0.00      0.00      0.00         1
        MISC       0.85      0.85      0.85        13

    accuracy                           0.68        34
   macro avg       0.48      0.45      0.44        34
weighted avg       0.58      0.68      0.61        34


Decision Tree Classification Report:
              precision    recall  f1-score   support

        AIMX       0.67      1.00      0.80         2
        OWNX       0.82      0.75      0.78        12
        CONT       0.25      0.17      0.20         6
        BASE       0.33      1.00      0.50         1
        MISC       0.77      0.77      0.77        13

    accuracy                           0.68        34
   macro avg       0.57      0.74      0.61        34
weighted avg

In [37]:
# Predict labels for unlabeled sentences using Naive Bayes model
unlabeled_predictions = clf.predict(X_unlabeled)

# Print each sentence with predicted label
print("\n--- Predicted Labels for Unlabeled Sentences ---\n")
for label, sentence in zip(unlabeled_predictions, unlabeled_sentences):
    print(f"{label}\t{sentence.strip()}")


--- Predicted Labels for Unlabeled Sentences ---

OWNX	wholegenome transporter analyses conducted 141 organisms whose complete genome sequences available
OWNX	organism complete set membrane transport systems identified predicted functions classified protein families based transporter classification system
OWNX	organisms larger genome sizes generally possessed relatively greater number transport systems
MISC	prokaryotes unicellular eukaryotes significant factor increase transporter content genome size greater diversity transporter types
OWNX	contrast multicellular eukaryotes greater number paralogs specific transporter families important factor increase transporter content genome size
MISC	eukaryotic prokaryotic intracellular pathogens endosymbionts exhibited markedly limited transport capabilities
MISC	hierarchical clustering phylogenetic profiles transporter families derived presence absence certain transporter family showed clustering patterns organisms correlated evolutionary histo

### Summary <!-- Summarize your work and your findings in a few sentences. -->
- We _________________________________________________________________________