#### Team: Balogh Szilard, Bajan Ramona-Maria, Popa Sebastian
## Problem 2

In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer


folder_path = r"reuters21578"

data = []

for file_name in os.listdir(folder_path):
    if file_name.endswith(".sgm"):
        file_path = os.path.join(folder_path, file_name)
        
        with open(file_path, "r", encoding="latin1") as file:
            content = file.read()
            soup = BeautifulSoup(content, "html.parser")
            
            for reuters in soup.find_all("reuters"):
                topics = reuters.find("topics")
                body = reuters.find("body")
                
                if topics and body and len(topics.contents) == 1:
                    topic_text = topics.get_text(strip=True)
                    body_text = body.get_text(separator=" ", strip=True)
                    
                    data.append({"Topic": topic_text, "Body": body_text})

raw_df = pd.DataFrame(data)
print(f"Raw DataFrame created with {len(raw_df)} entries.")

raw_df.head()

Raw DataFrame created with 8654 entries.


Unnamed: 0,Topic,Body
0,cocoa,Showers continued throughout the week in\nthe ...
1,earn,Champion Products Inc said its\nboard of direc...
2,acq,Computer Terminal Systems Inc said\nit has com...
3,earn,"Shr 34 cts vs 1.19 dlrs\n Net 807,000 vs 2,..."
4,earn,Oper shr loss two cts vs profit seven cts\n ...


In [7]:
processed_bodies = []

for body in raw_df["Body"]:
    cleaned_text = re.sub(r"[^a-zA-Z\s]", "", body)
    cleaned_text = cleaned_text.lower()
    tokens = cleaned_text.split()
    processed_bodies.append(tokens)

raw_df["Body Tokens"] = processed_bodies

processed_df = raw_df.drop(columns=["Body"])

print(f"DataFrame preprocessed.")

processed_df.head()

DataFrame preprocessed.


Unnamed: 0,Topic,Body Tokens
0,cocoa,"[showers, continued, throughout, the, week, in..."
1,earn,"[champion, products, inc, said, its, board, of..."
2,acq,"[computer, terminal, systems, inc, said, it, h..."
3,earn,"[shr, cts, vs, dlrs, net, vs, assets, mln, vs,..."
4,earn,"[oper, shr, loss, two, cts, vs, profit, seven,..."


In [8]:
processed_df["Body Text"] = processed_df["Body Tokens"].apply(lambda tokens: " ".join(tokens))

count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(processed_df["Body Text"])

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(processed_df["Body Text"])

processed_df.head()

Unnamed: 0,Topic,Body Tokens,Body Text
0,cocoa,"[showers, continued, throughout, the, week, in...",showers continued throughout the week in the b...
1,earn,"[champion, products, inc, said, its, board, of...",champion products inc said its board of direct...
2,acq,"[computer, terminal, systems, inc, said, it, h...",computer terminal systems inc said it has comp...
3,earn,"[shr, cts, vs, dlrs, net, vs, assets, mln, vs,...",shr cts vs dlrs net vs assets mln vs mln depos...
4,earn,"[oper, shr, loss, two, cts, vs, profit, seven,...",oper shr loss two cts vs profit seven cts oper...


In [9]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(tokens):
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

processed_df["BOW Text"] = processed_df["Body Tokens"].apply(preprocess_text)

print(processed_df["BOW Text"])

bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(processed_df["BOW Text"])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BajanRamonaMaria\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


0       shower continu throughout week bahia cocoa zon...
1       champion product inc said board director appro...
2       comput termin system inc said complet sale sha...
3       shr ct vs dlr net vs asset mln vs mln deposit ...
4       oper shr loss two ct vs profit seven ct oper s...
                              ...                        
8649    soviet union agre suppli iran refin oil produc...
8650    chase corp ltd chcaw said make offer fullypaid...
8651    japanindiapakistangulfjapan ship confer said w...
8652    soviet union industri output grow slower pace ...
8653    six black miner kill two injur rock fall three...
Name: BOW Text, Length: 8654, dtype: object


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(processed_df["Topic"])

X_train_idx, X_test_idx, y_train, y_test = train_test_split(
    range(len(y)), y, test_size=0.2, random_state=42
)

X_train_count = X_count[X_train_idx]
X_test_count = X_count[X_test_idx]

X_train_tfidf = X_tfidf[X_train_idx]
X_test_tfidf = X_tfidf[X_test_idx]

X_train_bow = X_bow[X_train_idx]
X_test_bow = X_bow[X_test_idx]


In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

metrics = ["euclidean", "cosine", "manhattan",]
k_values = [3, 5, 7]

def train_knn(X_train, X_test, y_train, y_test, metric, k):
    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance')
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [12]:
for metric in metrics:
    for k in k_values:
        acc_count = train_knn(X_train_count, X_test_count, y_train, y_test, metric, k)
        acc_tfidf = train_knn(X_train_tfidf, X_test_tfidf, y_train, y_test, metric, k)
        acc_bow = train_knn(X_train_bow, X_test_bow, y_train, y_test, metric, k)
        
        print(f"Metric: {metric}, k: {k}")
        print(f"Raw Frequencies Accuracy: {acc_count:.4f}")
        print(f"TF-IDF Accuracy: {acc_tfidf:.4f}")
        print(f"BOW Accuracy: {acc_bow:.4f}")
        print("-" * 40)


Metric: euclidean, k: 3
Raw Frequencies Accuracy: 0.8070
TF-IDF Accuracy: 0.8891
BOW Accuracy: 0.8348
----------------------------------------
Metric: euclidean, k: 5
Raw Frequencies Accuracy: 0.8070
TF-IDF Accuracy: 0.9006
BOW Accuracy: 0.8313
----------------------------------------
Metric: euclidean, k: 7
Raw Frequencies Accuracy: 0.8070
TF-IDF Accuracy: 0.9006
BOW Accuracy: 0.8284
----------------------------------------
Metric: cosine, k: 3
Raw Frequencies Accuracy: 0.8562
TF-IDF Accuracy: 0.8897
BOW Accuracy: 0.9029
----------------------------------------
Metric: cosine, k: 5
Raw Frequencies Accuracy: 0.8585
TF-IDF Accuracy: 0.9029
BOW Accuracy: 0.9035
----------------------------------------
Metric: cosine, k: 7
Raw Frequencies Accuracy: 0.8533
TF-IDF Accuracy: 0.9041
BOW Accuracy: 0.9018
----------------------------------------


[WinError 2] The system cannot find the file specified
  File "d:\MASTER_AN_I_SEM_II\DM\TEMA1\venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\BajanRamonaMaria\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\BajanRamonaMaria\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                

Metric: manhattan, k: 3
Raw Frequencies Accuracy: 0.7614
TF-IDF Accuracy: 0.4749
BOW Accuracy: 0.7383
----------------------------------------
Metric: manhattan, k: 5
Raw Frequencies Accuracy: 0.7603
TF-IDF Accuracy: 0.4685
BOW Accuracy: 0.7210
----------------------------------------
Metric: manhattan, k: 7
Raw Frequencies Accuracy: 0.7481
TF-IDF Accuracy: 0.4633
BOW Accuracy: 0.6880
----------------------------------------


In [13]:
from sklearn.preprocessing import normalize

X_count = normalize(X_count, norm='l1', axis=1)
X_tfidf = normalize(X_tfidf, norm='l1', axis=1)
X_bow = normalize(X_bow, norm='l1', axis=1)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(processed_df["Topic"])

X_train_idx, X_test_idx, y_train, y_test = train_test_split(
    range(len(y)), y, test_size=0.2, random_state=42
)

X_train_count = X_count[X_train_idx]
X_test_count = X_count[X_test_idx]

X_train_tfidf = X_tfidf[X_train_idx]
X_test_tfidf = X_tfidf[X_test_idx]

X_train_bow = X_bow[X_train_idx]
X_test_bow = X_bow[X_test_idx]

metrics = ["manhattan",]
k_values = [3, 5, 7]

def train_knn(X_train, X_test, y_train, y_test, metric, k):
    knn = KNeighborsClassifier(n_neighbors=k, metric=metric, weights='distance')
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    return accuracy_score(y_test, y_pred)

for metric in metrics:
    for k in k_values:
        acc_count = train_knn(X_train_count, X_test_count, y_train, y_test, metric, k)
        acc_tfidf = train_knn(X_train_tfidf, X_test_tfidf, y_train, y_test, metric, k)
        acc_bow = train_knn(X_train_bow, X_test_bow, y_train, y_test, metric, k)
        
        print(f"Metric: {metric}, k: {k}")
        print(f"Raw Frequencies Accuracy: {acc_count:.4f}")
        print(f"TF-IDF Accuracy: {acc_tfidf:.4f}")
        print(f"BOW Accuracy: {acc_bow:.4f}")
        print("-" * 40)

Metric: manhattan, k: 3
Raw Frequencies Accuracy: 0.9041
TF-IDF Accuracy: 0.7163
BOW Accuracy: 0.9203
----------------------------------------
Metric: manhattan, k: 5
Raw Frequencies Accuracy: 0.9041
TF-IDF Accuracy: 0.7129
BOW Accuracy: 0.9191
----------------------------------------
Metric: manhattan, k: 7
Raw Frequencies Accuracy: 0.9076
TF-IDF Accuracy: 0.7152
BOW Accuracy: 0.9237
----------------------------------------


##### In this experiment we aimed to demonstrate the importance of distances and similarities in the context of the Reuters Text Categorization Collection, using different metrics (cosine, manhattan, euclidian), different vectorization strategies (raw word frequencies, TF-IDF and BOW), using different values for 'k' (3, 5, 7) in the KNeighborsClassifier algorithm. 

##### We observed that, overall, the raw word frequency vectorization approach gives the poorest results as it only counts the number of times each word appears in a document, without weighing the importance of a word. Common words like "the", "and" may dominate over less frequent but significant words, the overall accuracy rate being about 0.8. The BoW approach does better than the previously mentioned strategy, as it gets rid of the stopwords and uses a stemmer to reduce the words to their root, the overall accuracy rate being 0.85. Overall, the best results came with the TF-IDF vectorization strategy, which combines how often a word appears in a document with how unique the word is across all documents, which 4 times out of 9 came with an accuracy greater than 0.9. 

##### As for the metrics, we observed that cosine distance works best overall. Our biggest accuracy was for the cosine distance combined with the TF-IDF and k = 7, which is 0.9041. Similarly, for k = 5, the same metric-vectorization strategy combination gives a result of 0.9029. In our experiment, for the cosine metric, out of the 9 accuracies, 5 are above the 0.9 threshold, and all 9 of them are above the 0.85 threshold. Out of the remaining 18 accuracies, 16 are below the 0.9 threshold. The second best metric is the euclidian distance, for which all 9 accuracies are above 0.8 and 2 are above the 0.9 threshold. The manhattan distance suffers due to the "curse of dimensionality" and is generally not used in contextes where text data is being processed. However, after applying the L1 normalization, which is also called Manhattan normalization, the accuracies for manhattan distance improve significantly. On the other hand, the results of TF-IDF remain poorer than those of row word frequencies or BoW, which is due to the fact that after normalization each document is scaled, which interferes with the algorithm of TD-IDF which relies on rare word importance.

## Problem 3

In [14]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(processed_df["Topic"])

count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(processed_df["BOW Text"])

X_train, X_test, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy_original = accuracy_score(y_test, y_pred)
print(f"Accuracy on Original Data: {accuracy_original:.4f}")

Accuracy on Original Data: 0.9492


In [16]:
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train.toarray())
X_test_pca = pca.transform(X_test.toarray())

logreg_pca = LogisticRegression(max_iter=1000)
logreg_pca.fit(X_train_pca, y_train)
y_pred_pca = logreg_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"Accuracy on PCA-Reduced Data: {accuracy_pca:.4f}")

Accuracy on PCA-Reduced Data: 0.9076


In [17]:
tsne = TSNE(n_components=2, random_state=42)
X_train_tsne = tsne.fit_transform(X_train.toarray())
X_test_tsne = tsne.fit_transform(X_test.toarray())

logreg_tsne = LogisticRegression(max_iter=60000)
logreg_tsne.fit(X_train_tsne, y_train)
y_pred_tsne = logreg_tsne.predict(X_test_tsne)
accuracy_tsne = accuracy_score(y_test, y_pred_tsne)
print(f"Accuracy on t-SNE-Reduced Data: {accuracy_tsne:.4f}")


Accuracy on t-SNE-Reduced Data: 0.5425


##### In this experiment, we took the high dimensional dataset obtained using the BoW vectorization strategy, then created a classification using logistic regression. 
##### First, we classified the whole dataset, after which we reduced the data using PCA and TSNE respectively. As we expected, we obtained the highest accuracy with the non-reduced dataset, which was about 0.95. 
##### The second best accuracy came from the PCA approach. It makes sense that this approach cannot be as accurate as the first one, because reducing thousands of dimensions to a few hundred means that some important words are lost. The accuracy depends greatly on the number of principal components specified. 
##### For instance, setting this value to 100 got us an accuracy of about 0.90. If we set the number of components to a higher value, the accuracy increases, if we do the opposite it decreases. It is worth noting that the purpose of reduction techniques is to increase the processing time of the dataset (in our case the classification), and therefore, the value of n_components cannot be set to a value that is too great (for example, we tried setting it to 500, which resulted in an accuracy of about 0.94), because the computation time will take just as much as if we didn't reduce our dataset at all. As for TSNE, it performed very poorly. It is caused by the fact that reducing the data to only 2D, the classifier lost almost all usefult information. 
##### TSNE is designed for visualization, not classification. This explain the huge drop to 0.44 accuracy. It is also worth noting that this approach takes the most time, more than 1 minute and we needed to set the max_iter value to 60000. We tried lower values, but it resulted in an error saying that the model didn't manage to converge. 