# Overview

# Importing Necessary Libraries

In [45]:
import pandas as pd
import os
import spacy
import re

# !python -m spacy download en_core_web_md

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans

import numpy as np

# Loading Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset_path = '/content/drive/MyDrive/bbc'
dataset = []

Firstly, I would combine the various text files into a list, and then a dataframe, to make processing easier

In [None]:
for category in os.listdir(dataset_path):       #loops through the items in root dataset folder
    category_path = os.path.join(dataset_path, category)       #constructs the path for each item
    if os.path.isdir(category_path):       #checks what items are directories
        for filename in os.listdir(category_path):       #loops through directories
            file_path = os.path.join(category_path, filename)       #constructs the path for each filee
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read().strip()
                dataset.append({'text':text, 'category':category})

In [None]:
text_df = pd.DataFrame(dataset)      # converting the resulting list to a dataframe

In [None]:
text_df

Unnamed: 0,text,category
0,Media gadgets get moving\n\nPocket-sized devic...,tech
1,Can Yahoo dominate next decade?\n\nYahoo has r...,tech
2,Robots learn 'robotiquette' rules\n\nRobots ar...,tech
3,PC photo printers challenge pros\n\nHome print...,tech
4,Games win for Blu-ray DVD format\n\nThe next-g...,tech
...,...,...
2220,Iraqi voters turn to economic issues\n\nBeyond...,business
2221,Bank holds interest rate at 4.75%\n\nThe Bank ...,business
2222,Jobs growth still slow in the US\n\nThe US cre...,business
2223,Renault boss hails 'great year'\n\nStrong sale...,business


# Preprocessing Text Data

In [None]:
nlp = spacy.load('en_core_web_sm')

#Using the stopword list from http://mlg.ucd.ie/files/datasets/stopwords.txt to preprocess the dataset

custom_stopwords = []
with open('/content/drive/MyDrive/bbc/stopwords.txt', 'r', encoding='utf-8') as file:
    for line in file:
        custom_stopwords.append(line.strip())

print(custom_stopwords)

['a', 'about', 'above', 'according', 'across', 'actually', 'adj', 'after', 'afterwards', 'again', 'all', 'almost', 'along', 'already', 'also', 'although', 'always', 'among', 'amongst', 'an', 'am', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anywhere', 'are', 'aren', "aren't", 'around', 'as', 'at', 'be', 'became', 'because', 'become', 'becomes', 'been', 'beforehand', 'begin', 'being', 'below', 'beside', 'besides', 'between', 'both', 'but', 'by', 'can', 'cannot', "can't", 'caption', 'co', 'come', 'could', 'couldn', "couldn't", 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'don', "don't", 'down', 'during', 'each', 'early', 'eg', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'etc', 'even', 'ever', 'every', 'everywhere', 'except', 'few', 'for', 'found', 'from', 'further', 'had', 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'him', 'his', 'how', 'however', 'ie', 'i

In [None]:
#creating a function for preprocessing - lowercasing, lemmatization, stopword removal

def preprocess(text):

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Processing with Spacy
    doc = nlp(text)

    # Lemmatization and removing stopwords using thee custom list
    tokens = [token.lemma_.lower() for token in doc if token.lemma_.lower() not in custom_stopwords]
    return ' '.join(tokens)

text_df['preprocessed_text'] = text_df['text'].apply(preprocess)

Unnamed: 0,text,category,preprocessed_text
0,Media gadgets get moving\n\nPocket-sized devic...,tech,medium gadget get move \n\n pocketsize device ...
1,Can Yahoo dominate next decade?\n\nYahoo has r...,tech,can yahoo dominate next decade \n\n yahoo have...
2,Robots learn 'robotiquette' rules\n\nRobots ar...,tech,robot learn robotiquette rule \n\n robot be le...
3,PC photo printers challenge pros\n\nHome print...,tech,pc photo printer challenge pro \n\n home print...
4,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win for bluray dvd format \n\n the nextge...


In [None]:
# Remove newline characters
text_df['preprocessed_text'] = text_df['preprocessed_text'].str.replace('\n', '')
text_df.head()

Unnamed: 0,text,category,preprocessed_text
0,Media gadgets get moving\n\nPocket-sized devic...,tech,medium gadget get move pocketsize device that...
1,Can Yahoo dominate next decade?\n\nYahoo has r...,tech,can yahoo dominate next decade yahoo have rea...
2,Robots learn 'robotiquette' rules\n\nRobots ar...,tech,robot learn robotiquette rule robot be learn ...
3,PC photo printers challenge pros\n\nHome print...,tech,pc photo printer challenge pro home print pic...
4,Games win for Blu-ray DVD format\n\nThe next-g...,tech,game win for bluray dvd format the nextgenera...


# Data Vizualization