<a href="https://colab.research.google.com/github/kelvin3720/CU_CMT316_Coursework_1/blob/main/part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import operator
import os
from typing import List

import nltk
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC

In [None]:
# The dataset is expected as a `bbc` folder in the current working directory.
# If it does not exist yet, upload bbc.zip (which contains the bbc folder)
# and then run: !unzip bbc.zip
# List the working directory to confirm that the bbc folder is present.
!ls

bbc  bbc.zip  sample_data


In [None]:
def show_marco_average_metrics(y_true: List[int], Y_pred: List[int]) -> None:
    """
    Print the macro-averaged precision, recall and F1 score.

    Args:
        y_true: Ground-truth class labels.
        Y_pred: Predicted class labels, aligned element-wise with ``y_true``.
            The capitalised name is kept so that existing keyword callers
            keep working.

    Returns:
        None. The three scores are written to stdout.
    """
    # Score the predictions that were passed in. The previous body read a
    # notebook-global ``y_pred`` and silently ignored this argument.
    precision = precision_score(y_true, Y_pred, average='macro')
    recall = recall_score(y_true, Y_pred, average='macro')
    f1 = f1_score(y_true, Y_pred, average='macro')

    print("macro-averaged precision: ", precision)
    print("macro-averaged recall: ", recall)
    print("macro-averaged F1: ", f1)

In [None]:
# Please put the bbc folder at the same location as this part_2.ipynb file
folders = ["business", "entertainment", "politics", "sport", "tech"]
cwd = os.getcwd()
path = os.path.join(cwd, "bbc")
# Unprocessed data: each entry is [text, category index]
# 0: business; 1: entertainment; 2: politics; 3: sport; 4: tech
raw_data = []

# Read every article into raw_data, labelled with its folder's position
# in `folders`.
for index, folder in enumerate(folders):
    folder_path = os.path.join(path, folder)
    # os.listdir() returns names in arbitrary order; sorting them makes the
    # order of raw_data identical on every run and machine.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # A few characters cannot be decoded as UTF-8,
        # so errors='ignore' is used to drop them
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            raw_data.append([file.read(), index])

In [None]:
## Data processing

# Code from live class sessions, 2_FeatureEngineeringSelection_Sklearn.ipynb
# Single WordNet lemmatizer instance, reused by get_list_tokens for every token.
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_list_tokens(string):
    """
    Tokenize ``string`` into a flat list of lemmatized, lower-cased tokens.

    The text is first split into sentences, each sentence is split into word
    tokens, and every token is lemmatized and then lower-cased (in that
    order). Tokens from all sentences are returned in reading order.
    """
    return [
        lemmatizer.lemmatize(word).lower()
        for sentence in nltk.tokenize.sent_tokenize(string)
        for word in nltk.tokenize.word_tokenize(sentence)
    ]

In [None]:
# Fetch the NLTK resources used in this notebook.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# English stopwords plus punctuation tokens that should never become features.
stopwords = set(nltk.corpus.stopwords.words('english'))
# "''" is the closing counterpart of the "``" quote token. It was missing from
# this list and ended up as the most frequent entry of the vocabulary.
stopwords.update({".", ",", "--", "``", "''"})

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Count how often each non-stopword token occurs across the whole corpus.
dict_word_frequency = {}

for text, _label in raw_data:
    for token in get_list_tokens(text):
        if token not in stopwords:
            dict_word_frequency[token] = dict_word_frequency.get(token, 0) + 1

# Keep the 1000 most frequent tokens, most frequent first.
sorted_list = sorted(
    dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True
)[:1000]

# Preview the 25 most frequent entries.
for i, (word, frequency) in enumerate(sorted_list[:25], start=1):
    print(f"{i}. {word} - {frequency}")

# The vocabulary is the ordered list of the selected words.
vocabulary = [word for word, _ in sorted_list]

1. '' - 9296
2. 's - 8895
3. said - 7253
4. wa - 6088
5. ha - 4971
6. - - 3197
7. mr - 2994
8. year - 2824
9. would - 2629
10. ) - 2229
11. ( - 2227
12. also - 2156
13. people - 2045
14. % - 1968
15. new - 1966
16. one - 1806
17. us - 1673
18. : - 1667
19. could - 1546
20. game - 1401
21. last - 1380
22. time - 1361
23. first - 1283
24. say - 1265
25. n't - 1258


In [None]:
def get_vector_text(list_vocab,string):
    """
    Build a bag-of-words count vector for ``string``.

    Args:
        list_vocab: Ordered vocabulary; position ``i`` of the result
            corresponds to ``list_vocab[i]``.
        string: Raw text, tokenized with ``get_list_tokens``.

    Returns:
        1-D float numpy array of length ``len(list_vocab)`` holding the
        number of times each vocabulary word occurs among the tokens.
    """
    # Tally every token once up front instead of rescanning the token list
    # (``in`` + ``.count``) for each vocabulary word.
    token_counts = {}
    for token in get_list_tokens(string):
        token_counts[token] = token_counts.get(token, 0) + 1

    vector_text = np.zeros(len(list_vocab))
    for i, word in enumerate(list_vocab):
        vector_text[i] = token_counts.get(word, 0)
    return vector_text

In [None]:
## With only word frequency as feature
# Feature 1: Word frequency (one count vector per document)
x_all = [get_vector_text(vocabulary, text) for text, _label in raw_data]
# Category index of each document, in the same order as x_all
y_all = [label for _text, label in raw_data]

In [None]:
# Hold out 40% of the documents for testing. A fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.4, random_state=42
)

In [None]:
# Convert the four split lists to numpy arrays in one pass.
x_train_sentanalysis, y_train_sentanalysis, x_test, y_test = map(
    np.asarray, (x_train, y_train, x_test, y_test)
)

In [None]:
# Feature selection
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 500 columns with the highest chi-squared score, fitted on the
# training split only.
fs_sentanalysis = SelectKBest(chi2, k=500)
fs_sentanalysis.fit(x_train_sentanalysis, y_train_sentanalysis)

# Apply the same column selection to both splits.
x_train_sentanalysis_new = fs_sentanalysis.transform(x_train_sentanalysis)
x_test_new = fs_sentanalysis.transform(x_test)

print(f"Size original training matrix: {x_train_sentanalysis.shape}")
print(f"Size new training matrix: {x_train_sentanalysis_new.shape}")

Size original training matrix: (1335, 1000)
Size new training matrix: (1335, 500)


In [None]:
# Linear-kernel SVM trained on the selected features.
# SVC is imported explicitly: `import sklearn` alone does not guarantee that
# the `sklearn.svm` submodule is loaded. gamma is not passed because it only
# applies to the 'rbf', 'poly' and 'sigmoid' kernels.
model = SVC(kernel="linear")
model.fit(x_train_sentanalysis_new, y_train_sentanalysis)

In [None]:
# Predict labels for the held-out split and report plain accuracy.
y_pred = model.predict(x_test_new)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9561797752808989


In [None]:
# 10-fold cross-validation; the fixed random_state makes the shuffled folds
# repeatable across runs.
# NOTE(review): this scores the SVC on every column of x_all, i.e. without the
# SelectKBest step used for the hold-out evaluation, so the two accuracies do
# not measure exactly the same pipeline.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cross_val_results = cross_val_score(model, x_all, y_all, cv=kf)

print(f"K-fold Results: {cross_val_results}")
print(f"Mean Accuracy: {np.mean(cross_val_results)}")

K-fold Results: [0.96412556 0.95964126 0.98206278 0.96860987 0.96412556 0.95945946
 0.95945946 0.94144144 0.95945946 0.97297297]
Mean Accuracy: 0.9631357815214319


In [None]:
# Macro-averaged precision / recall / F1 for the current hold-out predictions.
show_marco_average_metrics(y_test, y_pred)

macro-averaged precision:  0.9584351412851945
macro-averaged recall:  0.9543280013503226
macro-averaged F1:  0.9561604182685943


In [None]:
## Added Word count of the text file as second feature
x_all, y_all = [], []

for text, label in raw_data:
    # Feature 1: Word frequency
    bag_of_words = get_vector_text(vocabulary, text)
    # Feature 2: Word count of the text file (number of tokens)
    token_total = len(get_list_tokens(text))
    # The word count becomes the last column of the feature vector.
    x_all.append(np.append(bag_of_words, token_total))
    y_all.append(label)

# Hold out 40% of the documents for testing. A fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.4, random_state=42
)

# Convert the split lists to numpy arrays for column indexing.
x_train_sentanalysis = np.asarray(x_train)
y_train_sentanalysis = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

# Keep the 500 columns with the highest chi-squared score, fitted on the
# training split only.
fs_sentanalysis = SelectKBest(
    chi2, k=500
).fit(x_train_sentanalysis, y_train_sentanalysis)

# Force keep the manually added features
selected_indices = fs_sentanalysis.get_support(indices=True)
# The word count is appended after the word-frequency columns, so it is the
# last column. Deriving its index from the matrix width avoids a hard-coded
# 1000 that is only right when the vocabulary has exactly 1000 words.
word_count_index = x_train_sentanalysis.shape[1] - 1
if word_count_index not in selected_indices:
    print("Adding the word count feature")
    selected_indices = np.concatenate([selected_indices, [word_count_index]])
else:
    print("Word count feature is already selected by SelectKBest()")

# Apply the same column selection to both splits.
x_train_sentanalysis_new = x_train_sentanalysis[:, selected_indices]
x_test_new = x_test[:, selected_indices]

print("Size original training matrix: " + str(x_train_sentanalysis.shape))
print("Size new training matrix: " + str(x_train_sentanalysis_new.shape))

Word count feature is already selected by SelectKBest()
Size original training matrix: (1335, 1001)
Size new training matrix: (1335, 500)


In [None]:
# Linear-kernel SVM trained on the selected features.
# SVC is imported explicitly: `import sklearn` alone does not guarantee that
# the `sklearn.svm` submodule is loaded. gamma is not passed because it only
# applies to the 'rbf', 'poly' and 'sigmoid' kernels.
model = SVC(kernel="linear")
model.fit(x_train_sentanalysis_new, y_train_sentanalysis)

In [None]:
# Predict labels for the held-out split and report plain accuracy.
y_pred = model.predict(x_test_new)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9415730337078652


In [None]:
# 10-fold cross-validation; the fixed random_state makes the shuffled folds
# repeatable across runs.
# NOTE(review): this scores the SVC on every column of x_all, i.e. without the
# feature-selection step used for the hold-out evaluation, so the two
# accuracies do not measure exactly the same pipeline.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cross_val_results = cross_val_score(model, x_all, y_all, cv=kf)

print(f"K-fold Results: {cross_val_results}")
print(f"Mean Accuracy: {np.mean(cross_val_results)}")

K-fold Results: [0.96412556 0.95515695 0.97757848 0.92825112 0.97757848 0.95945946
 0.94144144 0.96396396 0.95945946 0.93693694]
Mean Accuracy: 0.95639518442209


In [None]:
# Macro-averaged precision / recall / F1 for the current hold-out predictions.
show_marco_average_metrics(y_test, y_pred)

macro-averaged precision:  0.9424743379013758
macro-averaged recall:  0.9398601452078157
macro-averaged F1:  0.9407663972383599


In [None]:
## Added Named Entity recognition as the third feature

import spacy

# The small English pipeline must be installed before it can be loaded.
# If en_core_web_sm is missing, run:
#     python -m spacy download en_core_web_sm
# (the bare "en" shortcut is deprecated as of spaCy v3)

# English NLP model
nlp = spacy.load("en_core_web_sm")

# Entity labels the pipeline's NER component can produce.
ner_labels = nlp.get_pipe("ner").labels
print(ner_labels)  # Possible labels

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


In [None]:
def count_entity(text: str) -> List[int]:
    """
    Count how often selected named-entity labels occur in ``text``.

    The text is run through the ``nlp`` pipeline and the entities carrying
    one of these labels are counted:
        GPE (locations, like city and country),
        MONEY,
        ORG,
        PRODUCT,
        WORK_OF_ART

    Returns:
        A list of five counts, in the label order above (index 0 to 4).
        Entities with any other label are ignored.
    """
    useful_labels = ('GPE', 'MONEY', 'ORG', 'PRODUCT', 'WORK_OF_ART')
    # Labels of every entity found in the text, in document order.
    found_labels = [ent.label_ for ent in nlp(text).ents]

    return [found_labels.count(label) for label in useful_labels]

In [None]:
x_all, y_all = [], []

for text, label in raw_data:
    # Feature 1: Word frequency
    bag_of_words = get_vector_text(vocabulary, text)
    # Feature 2: Word count of the text file
    token_total = len(get_list_tokens(text))
    # Feature 3: Named Entity recognition label count
    entity_counts = count_entity(text)
    # Column layout: word frequencies, then word count, then entity counts.
    x_all.append(np.concatenate((bag_of_words, [token_total], entity_counts)))
    y_all.append(label)

In [None]:
# Hold out 40% of the documents for testing. A fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.4, random_state=42
)

# Convert the split lists to numpy arrays for column indexing.
x_train_sentanalysis = np.asarray(x_train)
y_train_sentanalysis = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

# Keep the 500 columns with the highest chi-squared score, fitted on the
# training split only.
fs_sentanalysis = SelectKBest(
    chi2, k=500
).fit(x_train_sentanalysis, y_train_sentanalysis)

# Force keep the manually added features
selected_indices = fs_sentanalysis.get_support(indices=True)

# The hand-crafted features are the trailing columns: 1 word count followed by
# the 5 entity-label counts. Deriving their indices from the matrix width
# avoids a hard-coded 1000..1005 that is only right for a 1000-word vocabulary.
n_manual_features = 6
n_features = x_train_sentanalysis.shape[1]
for i in range(n_features - n_manual_features, n_features):
    if i not in selected_indices:
        print(f"Adding the feature with index {i}")
        selected_indices = np.concatenate([selected_indices, [i]])
    else:
        print(f"Feature index {i} is already selected by SelectKBest()")

# Apply the same column selection to both splits.
x_train_sentanalysis_new = x_train_sentanalysis[:, selected_indices]
x_test_new = x_test[:, selected_indices]

print("Size original training matrix: " + str(x_train_sentanalysis.shape))
print("Size new training matrix: " + str(x_train_sentanalysis_new.shape))

Feature index 1000 is already selected by SelectKBest()
Feature index 1001 is already selected by SelectKBest()
Feature index 1002 is already selected by SelectKBest()
Feature index 1003 is already selected by SelectKBest()
Feature index 1004 is already selected by SelectKBest()
Feature index 1005 is already selected by SelectKBest()
Size original training matrix: (1335, 1006)
Size new training matrix: (1335, 500)


In [None]:
# Linear-kernel SVM trained on the selected features.
# SVC is imported explicitly: `import sklearn` alone does not guarantee that
# the `sklearn.svm` submodule is loaded. gamma is not passed because it only
# applies to the 'rbf', 'poly' and 'sigmoid' kernels.
model = SVC(kernel="linear")
model.fit(x_train_sentanalysis_new, y_train_sentanalysis)

In [None]:
# Predict labels for the held-out split and report plain accuracy.
y_pred = model.predict(x_test_new)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9516853932584269


In [None]:
# 10-fold cross-validation; the fixed random_state makes the shuffled folds
# repeatable across runs.
# NOTE(review): this scores the SVC on every column of x_all, i.e. without the
# feature-selection step used for the hold-out evaluation, so the two
# accuracies do not measure exactly the same pipeline.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cross_val_results = cross_val_score(model, x_all, y_all, cv=kf)

print(f"K-fold Results: {cross_val_results}")
print(f"Mean Accuracy: {np.mean(cross_val_results)}")

K-fold Results: [0.94618834 0.95067265 0.96412556 0.94618834 0.97309417 0.96846847
 0.96846847 0.95495495 0.93693694 0.96846847]
Mean Accuracy: 0.9577566355593261


In [None]:
# Macro-averaged precision / recall / F1 for the current hold-out predictions.
show_marco_average_metrics(y_test, y_pred)

macro-averaged precision:  0.9523152028956984
macro-averaged recall:  0.9506877360310991
macro-averaged F1:  0.9513660760459995
