<a href="https://colab.research.google.com/github/hassankhan0296/BBCArticleClassification/blob/main/CMT316_Q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# mount google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import libraries

import pandas as pd
import numpy as np
import nltk
import sklearn
import operator
import requests

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
#moving data from folders to single dataframe

from os import listdir
from os.path import isfile, join
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

#change path value if datasets folder is different
path = "/content/drive/MyDrive/CMT316/bbc"

data = []
dictionary = {}

i = 0
for folder in sorted(listdir(path)):
  
    for file in sorted(listdir(path+'/'+folder)):
        contents = Path(path+'/'+folder+'/'+file).read_text("unicode_escape")
        data.append((folder, i, file, contents))
        dictionary[i] = {"folder":folder,"file":file,"contents":contents}
       
        
        i+=1

df = pd.DataFrame.from_dict(dictionary, "index")



In [None]:
df

Unnamed: 0,folder,file,contents
0,business,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...
1,business,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...
2,business,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...
3,business,004.txt,High fuel prices hit BA's profits\n\nBritish A...
4,business,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...
...,...,...,...
2230,tech,397.txt,BT program to beat dialler scams\n\nBT is intr...
2231,tech,398.txt,Spam e-mails tempt net shoppers\n\nComputer us...
2232,tech,399.txt,Be careful how you code\n\nA new European dire...
2233,tech,400.txt,US cyber security chief resigns\n\nThe man mak...


In [None]:
# check articles count in different categories
df.groupby(['folder']).folder.count()

folder
business         510
entertainment    396
politics         417
sport            511
tech             401
Name: folder, dtype: int64

In [None]:
# Drop duplicate data
df.drop_duplicates(subset=['folder', 'contents'], inplace=True)


# check articles count in different categories again
df.groupby(['folder']).folder.count()

folder
business         503
entertainment    369
politics         403
sport            505
tech             347
Name: folder, dtype: int64

In [None]:
# Drop empty data
df.dropna(subset = ["contents"], inplace=True)

# check articles count in different categories
df.groupby(['folder']).folder.count()

folder
business         503
entertainment    369
politics         403
sport            505
tech             347
Name: folder, dtype: int64

In [None]:
# Tokenize
df['text_clean'] = df['contents'].apply(nltk.word_tokenize)
print('Tokenization complete.')

df



Tokenization complete.


Unnamed: 0,folder,file,contents,text_clean
0,business,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...,"[Ad, sales, boost, Time, Warner, profit, Quart..."
1,business,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...,"[Dollar, gains, on, Greenspan, speech, The, do..."
2,business,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...,"[Yukos, unit, buyer, faces, loan, claim, The, ..."
3,business,004.txt,High fuel prices hit BA's profits\n\nBritish A...,"[High, fuel, prices, hit, BA, 's, profits, Bri..."
4,business,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...,"[Pernod, takeover, talk, lifts, Domecq, Shares..."
...,...,...,...,...
2229,tech,396.txt,New consoles promise big problems\n\nMaking ga...,"[New, consoles, promise, big, problems, Making..."
2230,tech,397.txt,BT program to beat dialler scams\n\nBT is intr...,"[BT, program, to, beat, dialler, scams, BT, is..."
2232,tech,399.txt,Be careful how you code\n\nA new European dire...,"[Be, careful, how, you, code, A, new, European..."
2233,tech,400.txt,US cyber security chief resigns\n\nThe man mak...,"[US, cyber, security, chief, resigns, The, man..."


In [None]:
#import regex
import re

# Remove stopwords
stop_words=set(nltk.corpus.stopwords.words("english"))
df['text_clean'] = df['text_clean'].apply(lambda x: [item for item in x if item not in stop_words])
print('Stopwords removed.')

# Remove numbers, punctuation and special characters (only keep words)
regex = '[a-z]+'
df['text_clean'] = df['text_clean'].apply(lambda x: [item for item in x if re.match(regex, item)])
print('Numbers, punctuation and special characters removed.')

# Lemmatization
lem = nltk.stem.wordnet.WordNetLemmatizer()
df['text_clean'] = df['text_clean'].apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])
print('Lemmatization  completed.\n')

Stopwords removed.
Numbers, punctuation and special characters removed.
Lemmatization  completed.



In [None]:
df

Unnamed: 0,folder,file,contents,text_clean
0,business,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...,"[sales, boost, profit, profit, media, giant, j..."
1,business,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...,"[gain, speech, dollar, hit, highest, level, eu..."
2,business,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...,"[unit, buyer, face, loan, claim, owners, embat..."
3,business,004.txt,High fuel prices hit BA's profits\n\nBritish A...,"[fuel, price, hit, profit, blame, high, fuel, ..."
4,business,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...,"[takeover, talk, lift, drink, food, firm, rise..."
...,...,...,...,...
2229,tech,396.txt,New consoles promise big problems\n\nMaking ga...,"[console, promise, big, problems, game, future..."
2230,tech,397.txt,BT program to beat dialler scams\n\nBT is intr...,"[program, beat, dialler, scam, introduce, two,..."
2232,tech,399.txt,Be careful how you code\n\nA new European dire...,"[careful, code, new, directive, could, put, so..."
2233,tech,400.txt,US cyber security chief resigns\n\nThe man mak...,"[cyber, security, chief, resign, man, make, su..."


In [None]:
# Encode label categories to numbers
enc = LabelEncoder()
df['folder'] = enc.fit_transform(df['folder'])
labels = list(enc.classes_)

df

Unnamed: 0,folder,file,contents,text_clean
0,0,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...,"[sales, boost, profit, profit, media, giant, j..."
1,0,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...,"[gain, speech, dollar, hit, highest, level, eu..."
2,0,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...,"[unit, buyer, face, loan, claim, owners, embat..."
3,0,004.txt,High fuel prices hit BA's profits\n\nBritish A...,"[fuel, price, hit, profit, blame, high, fuel, ..."
4,0,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...,"[takeover, talk, lift, drink, food, firm, rise..."
...,...,...,...,...
2229,4,396.txt,New consoles promise big problems\n\nMaking ga...,"[console, promise, big, problems, game, future..."
2230,4,397.txt,BT program to beat dialler scams\n\nBT is intr...,"[program, beat, dialler, scam, introduce, two,..."
2232,4,399.txt,Be careful how you code\n\nA new European dire...,"[careful, code, new, directive, could, put, so..."
2233,4,400.txt,US cyber security chief resigns\n\nThe man mak...,"[cyber, security, chief, resign, man, make, su..."


In [None]:
# convet array of words in text_clean column to string
df['text_clean_string'] = [','.join(map(str, l)) for l in df['text_clean']]

# split dataset to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(df['text_clean_string'], 
                                                    df['folder'], test_size=0.2, 
                                                    shuffle=True)

X_train.count(),X_test.count()

(1701, 426)

In [None]:
# TFIDF vectorizer

# Vectorize training and testing data
def Vectorize(vec, X_train, X_test):    
    
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)
    
    print('Vectorization complete.\n')
    
    return X_train_vec, X_test_vec

In [None]:
# call vectorize method
X_train_vec, X_test_vec = Vectorize(TfidfVectorizer(max_features=500, min_df=0.01, max_df=0.1), X_train, X_test)

# check no of features in training set
print(X_train_vec.shape)


model = LogisticRegression()
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % 
      (accuracy_score(y_test, y_pred), precision_score(y_test,y_pred, average='macro'), 
      recall_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='macro')))


Vectorization complete.

(1701, 500)
Accuracy: 0.948 	Precision: 0.946 	Recall: 0.948 		F1: 0.947

