## PROBLEM 1: Supervised Classification Libraries: Regression, Decision Tree

In [1]:
import os
import gzip
import string
import struct

import nltk
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from mnist_parser import MNISTDataParser

In [2]:
seed = 42

np.random.seed(seed=seed)

### 1. MNIST Dataset

In [3]:
data_parser = MNISTDataParser()
X_train, y_train, X_test, y_test = data_parser.parse_data(verbose=0, sample_data=False)

In [4]:
print(f"Shape of Training Data = {X_train.shape}")
print(f"Shape of Training Labels = {y_train.shape}")

print(f"Shape of Testing Data = {X_test.shape}")
print(f"Shape of Testing Labels = {y_test.shape}")

Shape of Training Data = (60000, 784)
Shape of Training Labels = (60000,)
Shape of Testing Data = (10000, 784)
Shape of Testing Labels = (10000,)


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### A. Logistic Regression

In [17]:
lr = LogisticRegression(random_state=seed)
lr.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
y_train_pred = lr.predict(X_train_scaled)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 94.34%


In [19]:
y_test_pred = lr.predict(X_test_scaled)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 92.46%


In [34]:
coefficients = np.abs(lr.coef_)

print(f"""As Logistic Regression is a binary classifier, and this problem
          statement is a multiclass classification. We are going to build
          {coefficients.shape[0]} models in total.""")

n = 30
for idx in range(coefficients.shape[0]):
    top_n_features = np.argsort(coefficients[idx])[-n:]
    
    print(top_n_features)

As Logistic Regression is a binary classifier, and this problem
          statement is a multiclass classification. We are going to build
          10 models in total.
[265 326 123 715 714 320 262 402 324 339 177 267 444 517 399 240 285 309
 368 329 323 367 145 629 406 377 408 461 434 718]
[436 233 548 688 304 636 711 638 200 374 137 716 628 120  89  69 350 662
 434 493 611 710 537 612 510 621 314 465 520 565]
[325 258 686 248 136 388 545 743 740 744 371 444 565 745 742 465 509 709
 611 318  97 321 741 347 739 425 219 343 397 368]
[640 186 529 164 456 516 104  96 217 572 657 249 289 344 445 303 135 221
 472  98 473 397 386 359 107 360 304 501 332 276]
[594 249 266 666 624 568 317 500 183 350 636 748 546 318 360 411 565 570
 246 322 595  93 442 357 320 544 321 554 444  97]
[652 357 227 359 204 709 438 411 370 444 654  68 697  69 255 470  67 455
 248 328 482 389 442 304 416 283 311 330 332 360]
[691 202 265 546 688 325 151 186 486 216 570 242 355 270 313 572 268 409
 487 459 220 511 543 

#### B. Decision Tree Classifier

In [35]:
clf = DecisionTreeClassifier(random_state=seed)

In [36]:
clf.fit(X_train_scaled, y_train)

In [37]:
y_train_pred = clf.predict(X_train_scaled)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 100.00%


In [38]:
y_test_pred = clf.predict(X_test_scaled)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 87.55%


In [42]:
top_n_features = np.argsort(clf.feature_importances_)[-n:]
top_n_features

array([353, 300, 514, 297, 267, 296, 348, 658, 381, 101, 271, 155, 153,
       402, 655, 354, 290, 486, 484,  98, 156, 234, 405, 346, 211, 430,
       350, 568, 435, 489])

### 2. 20 News Group

In [43]:
BASE_DIR = os.path.abspath('..')
DATA_DIR = os.path.join(BASE_DIR, "data", "20news-bydate")

train_data = fetch_20newsgroups(
    data_home=DATA_DIR,
    subset="train"
)

In [44]:
X_train = np.array(train_data.data)
y_train = train_data.target

In [45]:
def clean_data(X: np.array) -> np.array:
    X = np.vectorize(lambda x: x.lower())(X)
    # X = np.vectorize(lambda x: " ".join(x.split("\n\n")[1:]))(X)
    X = np.vectorize(lambda x: x.translate(str.maketrans("", "", string.punctuation)))(X)

    english_dict = set(nltk.corpus.words.words())
    stop_words = nltk.corpus.stopwords.words("english")
    X = np.vectorize(lambda x: " ".join([word for word in nltk.wordpunct_tokenize(x) if word not in stop_words and word in english_dict]))(X)

    return X

X_train = clean_data(X_train)

In [46]:
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)
vectorizer.fit(X_train)

X_train_vectorized = vectorizer.transform(X_train)
print(f"Shape of X_train_vectorized = {X_train_vectorized.shape}")

del X_train

Shape of X_train_vectorized = (11314, 10034)


In [47]:
lr = LogisticRegression(random_state=seed)
lr.fit(X_train_vectorized, y_train)

In [48]:
y_train_pred = lr.predict(X_train_vectorized)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 93.42%


In [49]:
coefficients = np.abs(lr.coef_)

print(f"""As Logistic Regression is a binary classifier, and this problem
          statement is a multiclass classification. We are going to build
          {coefficients.shape[0]} models in total.""")

n = 30
for idx in range(coefficients.shape[0]):
    top_n_features = np.argsort(coefficients[idx])[-n:]
    
    print(top_n_features)

As Logistic Regression is a binary classifier, and this problem
          statement is a multiclass classification. We are going to build
          20 models in total.
[7703  487 7672 1006 6002 7721 4943 3161 4623 5143  716  516 6961 6386
 5709 5740 7305 7515 1855  590 6601  980 5711 7304 8818 4861  268 7746
  589 3893]
[6386 4415 1651 2553 5663 8880 5847 3855 5152 4197 9646 8570 8360 1604
 5264 3453 9627 6842 6230  262 1647 9649 8698 8338  375 3631 6611 9028
 4410 3960]
[6825 2553 6275 5339 9879 4172 6151 4638 2491 5042 1270 9627 1399 5366
 6806 4693 6787 5748 6842 9545 7640 1291   53 9536 3580 1554 2811 9876
 2753 3453]
[9043  699 1855 4092 6126 6634 1854 5663 1465 1850 8326 1013 8756 9822
 5512 4172 3545 2626  910  975 8944 2753 6631 5692 1951 1291 1198 2808
 3790 4372]
[3281 9646 4092 6785 6806 7782 4949 2626  975 8944 1225   48 1214 2512
 6771 5512 1228 1568 8326 7070 8157 7793 4759 9515 2808 5692 2838 6985
  437 5320]
[4746 3578 2315 8764 6842 3156 9434 4949 6806  158 3261 7061 9

In [50]:
clf = DecisionTreeClassifier(random_state=seed)

In [51]:
clf.fit(X_train_vectorized, y_train)

In [52]:
y_train_pred = lr.predict(X_train_vectorized)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 93.42%


In [53]:
top_n_features = np.argsort(clf.feature_importances_)[-n:]
top_n_features

array([5173, 3008, 1291, 8944, 8110, 1951, 2753, 4948, 6124, 2614, 2679,
       3926, 5320, 7746, 8853, 3960,  516, 9879, 2720,  437, 9434,  764,
       4032, 8287, 3893, 4232, 1288, 7677,  891, 1563])

## 3. Spambase Dataset

In [55]:
BASE_DIR = os.path.abspath('..')
DATA_DIR = os.path.join(BASE_DIR, "data", "spambase")

DATA_URL = os.path.join(DATA_DIR, "spambase.data")
COLS_URL = os.path.join(DATA_DIR, "spambase.names")

In [56]:
columns = []

with open(COLS_URL, "r") as file:
    for line in file:
        if line[0] in {'w', 'c'}:
            columns.append(line.split(':')[0])

columns.append("is_spam")

In [57]:
df = pd.read_csv(DATA_URL, header=None)
df.columns = columns

In [58]:
df.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [59]:
df.shape

(4601, 58)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

In [61]:
X = df.drop(columns=["is_spam"]).values
y = df["is_spam"].values

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    shuffle=True, random_state=seed)

In [63]:
print(f"X_train shape = {X_train.shape}")
print(f"X_test shape = {X_test.shape}")
print(f"y_train shape = {y_train.shape}")
print(f"y_test shape = {y_test.shape}")

X_train shape = (3680, 57)
X_test shape = (921, 57)
y_train shape = (3680,)
y_test shape = (921,)


In [64]:
lr = LogisticRegression(random_state=seed)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
y_train_pred = lr.predict(X_train)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 92.17%


In [66]:
y_test_pred = lr.predict(X_test)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 92.62%


In [67]:
coefficients = np.abs(lr.coef_)

top_n_features = np.argsort(coefficients)[-n:]
print(top_n_features)

[[56 55 11 18 54  0 46 13 50 12 10 35  1 37 39  2 31 33 53 21  9 27 49 20
   8 42  3 14 47 30 40 17 48  5 38 51 19 29 23 34 36 28 43 52 32  7 16 41
  44  4 22  6 25 15 45 24 26]]


In [68]:
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(X_train, y_train)

In [69]:
y_train_pred = clf.predict(X_train)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 99.95%


In [70]:
y_test_pred = clf.predict(X_test)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 91.86%


In [71]:
top_n_features = np.argsort(clf.feature_importances_)[-n:]
top_n_features

array([47, 49, 38, 12,  5, 10, 48, 44,  7,  9, 35, 17,  8, 20, 34, 23, 11,
       16,  4, 18, 45, 56, 26, 15, 55, 24, 54, 51,  6, 52])