In [1]:
import numpy as np
import nltk
import sklearn
import os
import operator

In [2]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Iwan_Munro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Iwan_Munro\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
from collections import Counter
def count_verbs(file):
    text = nltk.tokenize.word_tokenize(file)
    tagged_toks = nltk.pos_tag(text, tagset='universal')
    all_counts = Counter(tag for token, tag in tagged_toks)

    return all_counts['VERB']

In [4]:
df_sport = []
df_bus = []
df_enter = []
df_pol = []
df_tech = []

subjects = ['sport', 'business', 'entertainment', 'politics', 'tech']
for i in subjects:
    for file in os.listdir("datasets_coursework1\\bbc\\" + i):
        if file.endswith(".txt"):
            file_path = "datasets_coursework1\\bbc\\" + i + "\\" + file
            f = open(file_path, "r")
            file = f.read()
            verbs = count_verbs(file)
            list_of_sents = file.split("\n\n")
            list_of_sents.append(verbs)
            if i == 'sport':
                df_sport.append(list_of_sents)
            elif i == 'business':
                df_bus.append(list_of_sents)
            elif i == 'entertainment':
                df_enter.append(list_of_sents)
            elif i == 'politics':
                df_pol.append(list_of_sents)
            elif i == 'tech':
                df_tech.append(list_of_sents)
        else: continue

In [5]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_list_tokens(sentence_split):
    list_tokens=[]
    for sentence in sentence_split:
        temp_tokens=[]
        list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)

        for token in list_tokens_sentence:
            list_tokens.append(lemmatizer.lemmatize(token).lower())

    return list_tokens

In [6]:
def dict_creation(ind_story, dict_words):
    stopwords=nltk.corpus.stopwords.words('english')
    stopwords.extend([".",",","'","''","``","%","-","(",")",":"])
    
    sentence_tokens=get_list_tokens(ind_story)
    for word in sentence_tokens:
        if word in stopwords: continue
        if word not in dict_words: dict_words[word]=1
        else: dict_words[word]+=1

    return dict_words

In [7]:
subjects = ['sport', 'business', 'entertainment', 'politics', 'tech']
df_all = []
for i in subjects:
    for file in os.listdir("datasets_coursework1\\bbc\\" + i):
        if file.endswith(".txt"):
            file_path = "datasets_coursework1\\bbc\\" + i + "\\" + file
            f = open(file_path, "r")
            file = f.read()
            df_all.append(file)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2), stop_words={'english'}, max_features=10000)
n_gram_vectorised = vectorizer.fit(df_all)

In [9]:
dict_word_frequency = {}
for story in df_sport:
    dict_word_frequency = dict_creation(story[:-1], dict_word_frequency)

for story in df_bus:
    dict_word_frequency = dict_creation(story[:-1], dict_word_frequency)
        
for story in df_enter:
    dict_word_frequency = dict_creation(story[:-1], dict_word_frequency)
        
for story in df_pol:
    dict_word_frequency = dict_creation(story[:-1], dict_word_frequency)
        
for story in df_tech:
    dict_word_frequency = dict_creation(story[:-1], dict_word_frequency)

# Now we create a sorted frequency list with the top 1000 words, using the function "sorted". Let's see the 15 most frequent words
sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:1000]
  
# Finally, we create our vocabulary based on the sorted frequency list 
vocabulary=[]
for word,frequency in sorted_list:
  vocabulary.append(word)

In [10]:
def get_vector_text(list_vocab,string):
    vector_text=np.zeros(len(list_vocab))
    list_tokens_string=get_list_tokens(string)

    for i, word in enumerate(list_vocab):
        if word in list_tokens_string:
            vector_text[i]=list_tokens_string.count(word)

    return vector_text

In [24]:
X=[]
Y=[]

# Y Labels
# 0 - Sport
# 1 - Business
# 2 - Entertainment
# 3 - Politics
# 4 - Technology

for entry in df_sport:
    sub_story=get_vector_text(vocabulary,entry[:-1])
    var = [sub_story, entry[-1], n_gram_vectorised.transform(entry[:-1])]
    X.append(var)
    Y.append(0)

for entry in df_bus:
    sub_story=get_vector_text(vocabulary,entry[:-1])
    var = [sub_story, entry[-1], n_gram_vectorised.transform(entry[:-1])]
    X.append(var)
    Y.append(1)

for entry in df_enter:
    sub_story=get_vector_text(vocabulary,entry[:-1])
    var = [sub_story, entry[-1], n_gram_vectorised.transform(entry[:-1])]
    X.append(var)
    Y.append(2)

for entry in df_pol:
    sub_story=get_vector_text(vocabulary,entry[:-1])
    var = [sub_story, entry[-1], n_gram_vectorised.transform(entry[:-1])]
    X.append(var)
    Y.append(3)

for entry in df_tech:
    sub_story=get_vector_text(vocabulary,entry[:-1])
    var = [sub_story, entry[-1], n_gram_vectorised.transform(entry[:-1])]
    X.append(var)
    Y.append(4)

In [25]:
# from sklearn.feature_selection import SelectKBest, chi2
# X_new = SelectKBest(chi2, k=500).fit_transform(X,Y)

In [26]:
# from sklearn import cluster
# agglo = cluster.FeatureAgglomeration(n_clusters=32)
# agglo.fit(X)
# X_reduced = agglo.transform(X)
# X_reduced.shape

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_reduced, Y, test_size=0.2)

In [28]:
svm_clf_storytype=sklearn.svm.SVC(kernel="linear",gamma='auto')
svm_clf_storytype.fit(X_train,Y_train)

SVC(gamma='auto', kernel='linear')

In [29]:
Y_pred = svm_clf_storytype.predict(X_test)

In [30]:
from sklearn.metrics import accuracy_score
accuracy_test = accuracy_score(Y_pred,Y_test)
print('The accuracy in the test set is: '+str(round(accuracy_test*100,3)))

The accuracy in the test set is: 93.034


In [31]:
print(sklearn.metrics.classification_report(Y_pred, Y_test))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94        96
           1       0.94      0.92      0.93       106
           2       0.91      0.96      0.93        73
           3       0.93      0.92      0.93        89
           4       0.90      0.95      0.92        81

    accuracy                           0.93       445
   macro avg       0.93      0.93      0.93       445
weighted avg       0.93      0.93      0.93       445



In [32]:
print(sklearn.metrics.confusion_matrix(Y_pred, Y_test))

[[87  1  3  1  4]
 [ 0 98  2  3  3]
 [ 2  1 70  0  0]
 [ 1  3  1 82  2]
 [ 0  1  1  2 77]]
