In [1]:
import numpy as np
import nltk
import sklearn
import os
import operator

In [2]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Iwan_Munro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Iwan_Munro\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
from collections import Counter
def count_verbs(file):
    text = nltk.tokenize.word_tokenize(file)
    tagged_toks = nltk.pos_tag(text, tagset='universal')
    all_counts = Counter(tag for token, tag in tagged_toks)

    return all_counts['VERB']

In [4]:
# Y Labels
# 0 - Sport
# 1 - Business
# 2 - Entertainment
# 3 - Politics
# 4 - Technology

articles_verbs_label = []

subjects = ['sport', 'business', 'entertainment', 'politics', 'tech']
for i in subjects:
    for file in os.listdir("datasets_coursework1\\bbc\\" + i):
        if file.endswith(".txt"):
            file_path = "datasets_coursework1\\bbc\\" + i + "\\" + file
            f = open(file_path, "r")
            file = f.read()
            verbs = count_verbs(file)
            arr = [file, verbs]
            if i == 'sport':
                arr.append(0)
                articles_verbs_label.append(arr)
            elif i == 'business':
                arr.append(1)
                articles_verbs_label.append(arr)
            elif i == 'entertainment':
                arr.append(2)
                articles_verbs_label.append(arr)
            elif i == 'politics':
                arr.append(3)
                articles_verbs_label.append(arr)
            elif i == 'tech':
                arr.append(4)
                articles_verbs_label.append(arr)
        else: continue

In [5]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_list_tokens(article):
    list_tokens=[]
    list_tokens_sentence=nltk.tokenize.word_tokenize(article)
    for token in list_tokens_sentence:
        list_tokens.append(lemmatizer.lemmatize(token).lower())

    return list_tokens

In [6]:
def dict_creation(ind_story, dict_words):
    stopwords=nltk.corpus.stopwords.words('english')
    stopwords.extend([".",",","'","''","``","%","-","(",")",":"])
    
    sentence_tokens=get_list_tokens(ind_story)
    for word in sentence_tokens:
        if word in stopwords: continue
        if word not in dict_words: dict_words[word]=1
        else: dict_words[word]+=1

    return dict_words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2), stop_words={'english'}, max_features=10000)
n_gram_vectorised = vectorizer.fit([element[0] for element in articles_verbs_label])

In [8]:
dict_word_frequency = {}
for entry in articles_verbs_label:
    dict_word_frequency = dict_creation(entry[0], dict_word_frequency)

# Now we create a sorted frequency list with the top 1000 words, using the function "sorted"
sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:1000]
  
# Create a vocabulary based on the sorted frequency list 
vocabulary=[]
for word,frequency in sorted_list:
  vocabulary.append(word)

In [9]:
def get_vector_text(list_vocab,string):
    vector_text=np.zeros(len(list_vocab))
    list_tokens_string=get_list_tokens(string)

    for i, word in enumerate(list_vocab):
        if word in list_tokens_string:
            vector_text[i]=list_tokens_string.count(word)

    return vector_text

In [10]:
X=[]
Y=[]

for entry in articles_verbs_label:
    sub_story=get_vector_text(vocabulary,entry[0])
    var = np.concatenate((n_gram_vectorised.transform([entry[0]]).toarray()[0], np.append(sub_story, entry[-2])))
    X.append(var)
    Y.append(entry[-1])

In [95]:
from sklearn.feature_selection import SelectKBest, chi2
X_new = SelectKBest(chi2, k=2500).fit_transform(X,Y)

In [96]:
from sklearn import cluster
agglo = cluster.FeatureAgglomeration(n_clusters=500)
agglo.fit(X_new)
X_reduced = agglo.transform(X_new)
X_reduced.shape

(2225, 500)

In [97]:
from sklearn.model_selection import train_test_split
X_train, X1_test, Y_train, Y1_test = train_test_split(X_reduced, Y, test_size=0.3, random_state=42)
X_dev, X_test, Y_dev, Y_test = train_test_split(X1_test, Y1_test, test_size=0.5, random_state=32)

In [98]:
svm_clf_storytype=sklearn.svm.SVC(kernel="linear",gamma='auto')
svm_clf_storytype.fit(X_train,Y_train)

SVC(gamma='auto', kernel='linear')

In [99]:
Y_dev_pred = svm_clf_storytype.predict(X_dev)

In [100]:
from sklearn.metrics import accuracy_score
accuracy_test = accuracy_score(Y_dev_pred,Y_dev)
print('The accuracy in the test set is: '+str(round(accuracy_test*100,3)))

The accuracy in the test set is: 95.21


In [37]:
Y_pred = svm_clf_storytype.predict(X_test)

In [38]:
from sklearn.metrics import accuracy_score
accuracy_test = accuracy_score(Y_pred,Y_test)
print('The accuracy in the test set is: '+str(round(accuracy_test*100,3)))

The accuracy in the test set is: 94.012


In [39]:
print(sklearn.metrics.classification_report(Y_pred, Y_test))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97        87
           1       0.93      0.92      0.92        87
           2       0.90      0.93      0.91        46
           3       0.88      0.90      0.89        49
           4       0.97      0.98      0.98        65

    accuracy                           0.94       334
   macro avg       0.93      0.94      0.94       334
weighted avg       0.94      0.94      0.94       334



In [40]:
print(sklearn.metrics.confusion_matrix(Y_pred, Y_test))

[[83  2  0  2  0]
 [ 0 80  3  3  1]
 [ 1  0 43  1  1]
 [ 0  3  2 44  0]
 [ 0  1  0  0 64]]
