In [1]:
import numpy as np
import nltk
import sklearn
import os
import operator

In [2]:
# NLTK resources this notebook relies on:
#   averaged_perceptron_tagger, universal_tagset - POS tagging in the verb feature
#   punkt                                        - nltk.tokenize.word_tokenize
#   stopwords                                    - nltk.corpus.stopwords
#   wordnet                                      - nltk.stem.WordNetLemmatizer
# NOTE(review): resource names can differ between NLTK releases - confirm against the installed version
for resource in ('averaged_perceptron_tagger', 'universal_tagset',
                 'punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Iwan_Munro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Iwan_Munro\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

# Verb Feature

In [3]:
from collections import Counter
def count_verbs(file):
    """Return how many tokens of an article are tagged as verbs.

    Parameters
    ----------
    file : str
        Text of one article (the text itself is tokenised, not a path).

    Returns
    -------
    int
        Number of tokens given the universal 'VERB' tag; 0 when there are none.
    """
    tokens = nltk.tokenize.word_tokenize(file)
    # tag every token with its coarse (universal tagset) part of speech
    tagged = nltk.pos_tag(tokens, tagset='universal')
    # tally only the tokens whose tag is VERB
    return sum(1 for _, tag in tagged if tag == 'VERB')

# Pre-Processing

In [None]:
# Y Labels
# 0 - Sport
# 1 - Business
# 2 - Entertainment
# 3 - Politics
# 4 - Technology

# list for all the articles, their verb count and label
articles_verbs_label = []

# one directory per topic; a topic's position in this list is its numeric label
subjects = ['sport', 'business', 'entertainment', 'politics', 'tech']
for label, subject in enumerate(subjects):
    # os.path.join keeps the path valid on any operating system
    subject_dir = os.path.join("datasets_coursework1", "bbc", subject)
    # os.listdir returns names in arbitrary order; sorting makes the article order
    # (and so the seeded splits made from it) the same on every run and platform
    for file_name in sorted(os.listdir(subject_dir)):
        # make sure only text files are included
        if not file_name.endswith(".txt"):
            continue
        # the context manager closes the handle as soon as the article has been read
        # NOTE(review): no encoding is passed, so the platform default is used - confirm it suits the dataset
        with open(os.path.join(subject_dir, file_name), "r") as f:
            article = f.read()
        # each entry is [article text, verb count, label]
        articles_verbs_label.append([article, count_verbs(article), label])

# Bi-gram Feature

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# bi-gram count vectoriser with English stop words removed and at most 10,000 features.
# stop_words must be the string 'english' to select the built-in stop list; a set such as
# {'english'} is treated as a custom list whose only stop word is the token "english"
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english', max_features=10000)
# fit() returns the fitted vectoriser itself, so n_gram_vectorised is the estimator whose
# transform() is called per article later on (it is not a matrix of counts)
n_gram_vectorised = vectorizer.fit([element[0] for element in articles_verbs_label])

# Vectorised Word Count Feature

In [None]:
# single WordNet lemmatizer instance, reused for every article
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_list_tokens(article):
    """Tokenise an article and return its tokens lemmatised and lower-cased.

    Parameters
    ----------
    article : str
        Text to tokenise.

    Returns
    -------
    list of str
        One entry per token, in the order the tokeniser produced them.
    """
    # NOTE(review): each token is lemmatised in its original casing and only then
    # lower-cased - confirm this order is intended
    return [lemmatizer.lemmatize(tok).lower()
            for tok in nltk.tokenize.word_tokenize(article)]

In [None]:
# dictionary for word frequencies
dict_word_frequency = {}
# start with english stopwords
stopwords=nltk.corpus.stopwords.words('english')
# add some custom stopwords (punctuation tokens) into the list
stopwords.extend([".",",","'","''","``","%","-","(",")",":"])
# set copy used for the membership test below, so the list is not scanned once per token
stopword_set = set(stopwords)

# take each article, tokenise it and add its non-stopword tokens to the frequency dictionary
for entry in articles_verbs_label:
    for word in get_list_tokens(entry[0]):
        if word in stopword_set:
            continue
        dict_word_frequency[word] = dict_word_frequency.get(word, 0) + 1

# (word, frequency) pairs for the top 1000 words, most frequent first
sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:1000]

# Create a vocabulary based on the sorted frequency list
vocabulary = [word for word, frequency in sorted_list]

In [None]:
def get_vector_text(list_vocab,string):
    """Return the count of each vocabulary word in a text.

    Parameters
    ----------
    list_vocab : list of str
        Vocabulary words; position i of the result belongs to list_vocab[i].
    string : str
        Text to vectorise; it is tokenised with get_list_tokens.

    Returns
    -------
    numpy.ndarray
        1-D float array of length len(list_vocab) holding, for every vocabulary
        word, how many times it occurs among the tokens (0 when absent).
    """
    # count every token once up front, instead of rescanning the whole token list
    # (membership test plus .count) for each vocabulary word
    token_counts = {}
    for token in get_list_tokens(string):
        token_counts[token] = token_counts.get(token, 0) + 1

    # create vector to house number of each word in article
    vector_text = np.zeros(len(list_vocab))
    for i, word in enumerate(list_vocab):
        vector_text[i] = token_counts.get(word, 0)

    return vector_text

# Instantiate X and Y variables for training

In [None]:
# feature rows (X) and their labels (Y), kept aligned by position
X=[]
Y=[]

# one row per article: bi-gram counts first, then the vocabulary word counts, then the
# verb count as the last feature; the article's label goes into Y at the same position
for entry in articles_verbs_label:
    article_text = entry[0]
    word_counts = get_vector_text(vocabulary, article_text)
    bigram_counts = n_gram_vectorised.transform([article_text]).toarray()[0]
    # entry[-2] is the verb count, entry[-1] the label
    word_counts_and_verbs = np.append(word_counts, entry[-2])
    X.append(np.concatenate((bigram_counts, word_counts_and_verbs)))
    Y.append(entry[-1])

In [None]:
# perform chi2 tests on the features and keep the best 2500
from sklearn.feature_selection import SelectKBest, chi2
# NOTE(review): the selector is fitted on every row of X/Y, i.e. before the
# train/dev/test split made further down - confirm this is intended
selector = SelectKBest(chi2, k=2500)
X_new = selector.fit_transform(X, Y)

In [None]:
# used in testing but did not make accuracy any better
# from sklearn import cluster
# agglo = cluster.FeatureAgglomeration(n_clusters=1000)
# agglo.fit(X_new)
# X_reduced = agglo.transform(X_new)
# X_reduced.shape

In [None]:
# split X and Y into training, development and testing sets
from sklearn.model_selection import train_test_split
# first split: 70% for training, 30% held out
X_train, X_holdout, Y_train, Y_holdout = train_test_split(
    X_new, Y, test_size=0.3, random_state=42)
# second split: the held-out part is halved into development and testing sets
X_dev, X_test, Y_dev, Y_test = train_test_split(
    X_holdout, Y_holdout, test_size=0.5, random_state=32)

## TRAINING

In [None]:
# train a linear SVM on the training set
# SVC is imported explicitly: a bare `import sklearn` does not guarantee that the
# sklearn.svm submodule is loaded, so `sklearn.svm.SVC` may fail on a fresh kernel
from sklearn.svm import SVC
# NOTE(review): gamma is presumably unused with kernel="linear" - confirm before changing the kernel
svm_clf_storytype=SVC(kernel="linear",gamma='auto')
svm_clf_storytype.fit(X_train,Y_train)

## DEVELOPMENT

In [None]:
# predict labels for the development set with the trained SVM;
# Y_dev_pred is compared against Y_dev in the next cell
Y_dev_pred = svm_clf_storytype.predict(X_dev)

In [None]:
# compare predicted dev values to known dev values for accuracy score
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred) in that order
accuracy_dev = accuracy_score(Y_dev, Y_dev_pred)
# this cell scores the development set, so the message names it (it previously said "test set")
print('The accuracy in the development set is: '+str(round(accuracy_dev,3)))

## TESTING

In [None]:
# predict labels for the testing set with the trained SVM;
# Y_pred is compared against Y_test in the evaluation cells below
Y_pred = svm_clf_storytype.predict(X_test)

In [None]:
# accuracy on the testing set: share of predicted labels that match the known labels
from sklearn.metrics import accuracy_score
accuracy_test = accuracy_score(Y_test, Y_pred)
rounded_accuracy = round(accuracy_test, 3)
print('The accuracy in the test set is: ' + str(rounded_accuracy))

In [None]:
# get further evaluation stats for model
# classification_report expects (y_true, y_pred); with the arguments swapped, precision and
# recall are exchanged and the support column counts predictions instead of true labels
print(sklearn.metrics.classification_report(Y_test, Y_pred))

In [None]:
# extra evaluation stat to see where mislabelled values were being labelled as
# confusion_matrix expects (y_true, y_pred): rows are the true labels and columns the
# predicted labels; swapping the arguments yields the transposed matrix
print(sklearn.metrics.confusion_matrix(Y_test, Y_pred))