# **Imports**

In [586]:
# importing packages
import pandas as pd
import numpy as np
import glob
import os
from google.colab import files
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support as score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# **Reading Data**

### ***Reading the folders***


In [587]:
print('Reading folder')
print('_____________')

Reading folder
_____________


In [588]:
# importing files from github
!git clone https://github.com/kundanmail55/bbc-classification

fatal: destination path 'bbc-classification' already exists and is not an empty directory.


In [589]:
main_folder = "./bbc-classification/bbc"
folders = ["business","entertainment","politics","sport","tech"]

In [590]:
os.listdir(main_folder)

['README.TXT', 'politics', 'tech', 'business', 'entertainment', 'sport']

In [591]:
# removing the README file and read all other
folderslist = [f for f in os.listdir(main_folder) if not f.startswith('README')]
folderslist

['politics', 'tech', 'business', 'entertainment', 'sport']

## ***Reading the files and adding it to dataframe with type***

In [592]:
print('Read files')
print('_____________')

Read files
_____________


In [615]:
news = []
n_type = []

In [616]:
# function to read files and processing its type
for folder in folders:
    folder_path = main_folder + '/' + folder + '/'
    print(folder_path)
    files = os.listdir(folder_path)
    for text_file in files:
        file_path = folder_path + "/" + text_file
        with open(file_path, errors='replace') as f:
            data = f.readlines()
        data = ' '.join(data)
        news.append(data)
        n_type.append(folder)

./bbc-classification/bbc/business/
./bbc-classification/bbc/entertainment/
./bbc-classification/bbc/politics/
./bbc-classification/bbc/sport/
./bbc-classification/bbc/tech/


In [595]:
len(news)

2225

In [617]:
# Add in dataframe and save to a excel sheet
news_frame = pd.DataFrame(columns=('type', 'news'))
news_frame['type'] = n_type
news_frame['news'] = news

# **Preprocessing**

In [597]:
print('Preprocessing')
print('_____________')

Preprocessing
_____________


In [618]:
news_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    2225 non-null   object
 1   news    2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [619]:
# Drop duplicate data
news_frame.drop_duplicates(subset=['type', 'news'], inplace=True)
news_frame.head()

Unnamed: 0,type,news
0,business,Winemaker rejects Foster's offer\n \n Australi...
1,business,Cuba winds back economic clock\n \n Fidel Cast...
2,business,Beer giant swallows Russian firm\n \n Brewing ...
3,business,Brussels raps mobile call charges\n \n The Eur...
4,business,Lloyd's of London head chides FSA\n \n The hea...


In [620]:
news_frame['type'].value_counts()

sport            505
business         503
politics         403
entertainment    369
tech             347
Name: type, dtype: int64

In [621]:
# Associate Category names with numerical index and save it in new column category_id
news_frame['type_id'] = news_frame['type'].factorize()[0]
news_frame.head()

Unnamed: 0,type,news,type_id
0,business,Winemaker rejects Foster's offer\n \n Australi...,0
1,business,Cuba winds back economic clock\n \n Fidel Cast...,0
2,business,Beer giant swallows Russian firm\n \n Brewing ...,0
3,business,Brussels raps mobile call charges\n \n The Eur...,0
4,business,Lloyd's of London head chides FSA\n \n The hea...,0


In [622]:
# Creating a type_id to each news type
type_dataframe = news_frame[['type_id', 'type']].drop_duplicates().sort_values('type_id')
type_dataframe

Unnamed: 0,type_id,type
0,0,business
510,1,entertainment
896,2,politics
1313,3,sport
1824,4,tech


In [623]:
type_to_id = dict(type_dataframe.values)
id_to_type = dict(type_dataframe[['type_id', 'type']].values)
id_to_type

{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}

# **Data Cleaning**

In [604]:
print('Data Cleaning')
print('_____________')

Data Cleaning
_____________


In [624]:
# Function to remove whitespaces and other characters except alphabets and converting text to smallcase
def clean_text(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    text = ' '.join(text.split())
    text = text.lower()
    
    return text

In [625]:
news_frame['processed'] = news_frame['news'].apply(clean_text)

In [626]:
# function to remove stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return [w for w in text.split() if not w in stop_words]
    #  ' '.join(no_stopword_text)
  
news_frame['processed'] = news_frame['processed'].apply(lambda x: remove_stopwords(x))

In [627]:
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
news_frame['processed'] = news_frame['processed'].apply(lambda x: " ".join([lemmatizer.lemmatize(item) for item in x]))
print('Lemmatization complete.')

Lemmatization complete.


In [611]:
print('Data cleaned.')

Data cleaned.


In [628]:
news_frame.head()

Unnamed: 0,type,news,type_id,processed
0,business,Winemaker rejects Foster's offer\n \n Australi...,0,winemaker reject foster offer australian winem...
1,business,Cuba winds back economic clock\n \n Fidel Cast...,0,cuba wind back economic clock fidel castro dec...
2,business,Beer giant swallows Russian firm\n \n Brewing ...,0,beer giant swallow russian firm brewing giant ...
3,business,Brussels raps mobile call charges\n \n The Eur...,0,brussels rap mobile call charge european commi...
4,business,Lloyd's of London head chides FSA\n \n The hea...,0,lloyd london head chides fsa head lloyd london...


# **Feature Engineering**

### *Adding features*

In [629]:
# Character count
news_frame['char_count'] = news_frame["processed"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))

In [630]:
# Word count
news_frame['word_count'] = news_frame["processed"].apply(lambda x: len(str(x).split(" ")))

In [631]:
news_frame.head()

Unnamed: 0,type,news,type_id,processed,char_count,word_count
0,business,Winemaker rejects Foster's offer\n \n Australi...,0,winemaker reject foster offer australian winem...,1183,184
1,business,Cuba winds back economic clock\n \n Fidel Cast...,0,cuba wind back economic clock fidel castro dec...,2225,366
2,business,Beer giant swallows Russian firm\n \n Brewing ...,0,beer giant swallow russian firm brewing giant ...,707,119
3,business,Brussels raps mobile call charges\n \n The Eur...,0,brussels rap mobile call charge european commi...,1444,215
4,business,Lloyd's of London head chides FSA\n \n The hea...,0,lloyd london head chides fsa head lloyd london...,979,157


### *Selecting Features*

In [632]:
features = ['processed', 'char_count', 'word_count']
X = news_frame[features]
Y = news_frame['type_id']

In [633]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=55)

### *Adding Custom Transformer*

In [634]:
class TextSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

### *Pipelines*

In [635]:
# Processing string/text using TextSelector pipeline
preprocess_text = Pipeline([
    ('key', TextSelector(key="processed")),
    ('tfidf', TfidfVectorizer())
])

In [636]:
# Processing integer using NumberSelector
word = Pipeline([
    ('key', NumberSelector(key="word_count")),
    ('min_max', MinMaxScaler())
])

char = Pipeline([
    ('key', NumberSelector(key="char_count")),
    ('min_max', MinMaxScaler())
])

In [637]:
# Adding all feature to one using Feature Union
feature_union = FeatureUnion([
    ('preprocess_text', preprocess_text),
    ('word_len', word),
    ('char_len', char)
])

In [638]:
# Passing the features, Kbest and classifier in one pipeline

final_pipeline = Pipeline([
  ('features', feature_union),
  ('select', SelectKBest(score_func=chi2, k=1000)),
  ('classifier', RandomForestClassifier())
])

### *Training the model*

In [639]:
final_pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('preprocess_text',
                                                 Pipeline(steps=[('key',
                                                                  TextSelector(key='processed')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer())])),
                                                ('word_len',
                                                 Pipeline(steps=[('key',
                                                                  NumberSelector(key='word_count')),
                                                                 ('min_max',
                                                                  MinMaxScaler())])),
                                                ('char_len',
                                                 Pipeline(steps=[('key',
                                    

### *Prediction*

In [640]:
pred = final_pipeline.predict(X_test)

### *Performance*

In [585]:
accuracy = accuracy_score(y_test, pred)
print('Accuracy:', accuracy)
precision, recall, f1score, support = score(y_test, pred, average='macro')
print('Precision:', precision)
print('Recall:', recall)
print('F1 scrore:', f1score)

Accuracy: 0.9672364672364673
Precision: 0.9674223781441388
Recall: 0.9647066541171101
F1 scrore: 0.96594629107143
