In [None]:
#!pip install docx2txt

In [1]:
import pandas as pd
import numpy as np
import re
from scipy import interp
import os
import string
import docx2txt
from docx import Document
# Visualization packages
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
from docx import Document
# NLP
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
from joblib import dump, load
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer models
# (word_tokenize), the English stop-word list, the WordNet corpus
# (WordNetLemmatizer) and the perceptron POS tagger (pos_tag).
# The tagger was previously never downloaded, so pos_tag() raised LookupError
# on any machine where it had not been installed by hand.
# NOTE(review): newer NLTK releases appear to look these up as 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# ML Model packages
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import clone
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import KFold, cross_validate, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jegad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jegad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jegad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#!pip install python-docx
from docx import Document

In [3]:
def read_files_into_dataframe(file_directory, file_category, news_path='news_dataf.csv'):
    """Append the text of every .docx file in a directory to the news CSV.

    The CSV at ``news_path`` is loaded if it exists (otherwise an empty
    ``Class``/``Text`` frame is started), each ``.docx`` file in
    ``file_directory`` is read, and documents whose text is not already
    present are added with ``file_category`` as their class. The result is
    written back to ``news_path``.

    Parameters
    ----------
    file_directory : str
        Folder to scan for ``.docx`` files.
    file_category : str
        Label stored in the ``Class`` column for every new document.
    news_path : str, optional
        CSV file to read from and write to. Defaults to ``'news_dataf.csv'``,
        the path that was previously hard-coded.

    Returns
    -------
    None
        The function only reports progress on stdout and rewrites the CSV.
    """
    if os.path.exists(news_path):
        news_dataf = pd.read_csv(news_path, index_col=0)
    else:
        news_dataf = pd.DataFrame(columns=["Class", "Text"])

    # A set makes the duplicate check constant-time instead of a scan per file.
    seen_texts = set(news_dataf['Text'])
    new_rows = []

    # Sorted so the row order does not depend on the filesystem's listing order.
    for name_of_file in sorted(os.listdir(file_directory)):
        # Skip Word lock files ("~$...") and anything that is not a .docx.
        if name_of_file.startswith("~$") or not name_of_file.endswith(".docx"):
            continue

        path_of_file = os.path.join(file_directory, name_of_file)
        try:
            with open(path_of_file, "rb") as fileObject:
                doc = Document(fileObject)
                doc_data = " ".join(para.text for para in doc.paragraphs)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error in reading {path_of_file}: {e}")
            continue

        # An empty text would come back from the CSV as NaN on the next load.
        if not doc_data.strip():
            print(f"Skipped empty file: {name_of_file}")
            continue

        if doc_data not in seen_texts:
            seen_texts.add(doc_data)
            new_rows.append({"Class": file_category, "Text": doc_data})

        print(f"Processed file: {name_of_file}")

    # Build the additions once instead of concatenating inside the loop.
    if new_rows:
        additions = pd.DataFrame(new_rows, columns=["Class", "Text"])
        if news_dataf.empty:
            news_dataf = additions
        else:
            news_dataf = pd.concat([news_dataf, additions], ignore_index=True)

    news_dataf.to_csv(news_path, escapechar='\\')
    
# Source folders holding the .docx articles, one folder per news category
business_dir = r'C:\Users\jegad\BBC_BusinessA'
science_dir = r'C:\Users\jegad\BBC_ScienceA'
sports_dir= r'C:\Users\jegad\BBC_SportsA'

# Category labels, in the same order the folders are ingested below
news_categories = ['business', 'sports', 'science']

# Ingest every folder under its matching label: business, then sports, then science
for category_dir, category_label in zip((business_dir, sports_dir, science_dir), news_categories):
    read_files_into_dataframe(category_dir, category_label)

# Reload the accumulated CSV so the rest of the notebook works from what is on disk
news_path = 'news_dataf.csv'
news_dataf = pd.read_csv(news_path, index_col=0)

# Show the articles and the frame's dimensions
print(news_dataf)
print(news_dataf.shape)

Processed file: BBC_Business1.docx
Processed file: BBC_Business10.docx
Processed file: BBC_Business11.docx
Processed file: BBC_Business12.docx
Processed file: BBC_Business13.docx
Processed file: BBC_Business14.docx
Processed file: BBC_Business15.docx
Processed file: BBC_Business2.docx
Processed file: BBC_Business3.docx
Processed file: BBC_Business4.docx
Processed file: BBC_Business5.docx
Processed file: BBC_Business6.docx
Processed file: BBC_Business7.docx
Processed file: BBC_Business8.docx
Processed file: BBC_Business9.docx
Processed file: BBC_Sports1.docx
Processed file: BBC_Sports10.docx
Processed file: BBC_Sports11.docx
Processed file: BBC_Sports12.docx
Processed file: BBC_Sports13.docx
Processed file: BBC_Sports14.docx
Processed file: BBC_Sports15.docx
Processed file: BBC_Sports2.docx
Processed file: BBC_Sports3.docx
Processed file: BBC_Sports4.docx
Processed file: BBC_Sports5.docx
Processed file: BBC_Sports6.docx
Processed file: BBC_Sports7.docx
Processed file: BBC_Sports8.docx
P

In [4]:
# Count the missing entries in each column of the loaded articles
news_dataf.isnull().sum()


Class    0
Text     0
dtype: int64

In [5]:
#Preprocessing of Data
def data_preprocess(df):
    """Add a cleaned, lemmatized copy of ``df['Text']`` as ``df['Text2']``.

    Each text has its line breaks turned into spaces, is lower-cased, has
    ASCII punctuation removed, is tokenized, has English stop words dropped
    and has every remaining token lemmatized with a part of speech derived
    from ``pos_tag``. The kept tokens are joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Text`` column. It is modified in place: the ``Text2``
        column is created or overwritten.

    Returns
    -------
    None
    """
    # Series.replace('\n', ' ') only swaps cells that are exactly '\n';
    # .str.replace is what substitutes line breaks inside each text.
    cleaned = (
        df['Text']
        .fillna('')  # a missing text would otherwise reach the tokenizer as a float
        .astype(str)
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', ' ', regex=False)
        .str.lower()
        .str.translate(str.maketrans('', '', string.punctuation))
    )

    # A set gives constant-time stop-word lookups for every token.
    stp_wrds = set(stopwords.words("english"))
    stem_lemmatizer = WordNetLemmatizer()
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    p_hash_tag = {"N": wordnet.NOUN, "R": wordnet.ADV, "V": wordnet.VERB, "J": wordnet.ADJ}

    def postag(each):
        """Return the WordNet part of speech for one token (noun if unmapped)."""
        p_tag = pos_tag([each])[0][1][0].upper()
        return p_hash_tag.get(p_tag, wordnet.NOUN)

    def lematize(text):
        """Tokenize ``text``, drop stop words and lemmatize what is left."""
        tkns = nltk.word_tokenize(text)
        return " ".join(
            stem_lemmatizer.lemmatize(each, postag(each))
            for each in tkns
            if each not in stp_wrds
        )

    df['Text2'] = cleaned.apply(lematize)
    
# Build the cleaned 'Text2' column in place, then show the whole frame
data_preprocess(news_dataf)
print(news_dataf)

# Show the second article before and after cleaning
print(news_dataf['Text'].iloc[1])
print("\n After Data Preprocessing")
print(news_dataf['Text2'].iloc[1])

def preprocess_of_text(txt):
    """Clean and lemmatize a single raw text string.

    Line breaks become spaces, the text is lower-cased, ASCII punctuation is
    stripped, English stop words are dropped and each remaining token is
    lemmatized using a part of speech derived from ``pos_tag``.

    Parameters
    ----------
    txt : str
        Raw text to clean.

    Returns
    -------
    str
        The kept, lemmatized tokens joined by single spaces.
    """
    # Normalise whitespace, case and punctuation before tokenizing.
    normalised = txt.replace('\n', ' ').replace('\r', ' ').lower()
    normalised = normalised.translate(str.maketrans('', '', string.punctuation))

    english_stopwords = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    wordnet_pos_by_initial = {"N": wordnet.NOUN, "R": wordnet.ADV, "V": wordnet.VERB, "J": wordnet.ADJ}

    def wordnet_pos(token):
        """Return the WordNet part of speech for one token (noun if unmapped)."""
        initial = pos_tag([token])[0][1][0].upper()
        return wordnet_pos_by_initial.get(initial, wordnet.NOUN)

    kept = []
    for token in nltk.word_tokenize(normalised):
        if token in english_stopwords:
            continue
        kept.append(lemmatizer.lemmatize(token, wordnet_pos(token)))
    return " ".join(kept)

       Class                                               Text  \
0   business  Mattel is doubling down on its plans to expand...   
1   business  Wildfires have swept across the Greek island o...   
2   business  Chipmaking giant Taiwan Semiconductor (TSMC) h...   
3     sports  The Lionesses have been "itching to begin" the...   
4     sports  Mark Wood struck crucial late blows in England...   
5     sports  Manchester United have rejected a £20m bid fro...   
6    science  The Gulf Stream system of warm ocean currents ...   
7    science  The heatwaves battering Europe and the US in J...   
8    science  It's taken just shy of 20 years but Sir Richar...   
9   business  Virgin Money will close 39 of its UK banks as ...   
10  business  Banking boss Dame Alison Rose has apologised t...   
11  business  The European Central Bank (ECB) has raised int...   
12   science  False claims suggesting that the BBC has been ...   
13   science  The record-breaking UK heat experienced in 202..

In [6]:
# Hold out 20% of the cleaned articles for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    news_dataf['Text2'],
    news_dataf['Class'],
    random_state=9,
    test_size=0.2,
)

# Labels that ended up in each partition
print(y_train, y_test, sep="\n")

# Sizes of the four resulting pieces
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


2     business
0     business
3       sports
4       sports
9     business
1     business
11    business
8      science
6      science
5       sports
12     science
14     science
Name: Class, dtype: object
13     science
7      science
10    business
Name: Class, dtype: object
(12,) (3,) (12,) (3,)


In [7]:
# TF-IDF features over unigrams and bigrams, with English stop words removed.
# min_df=3 and max_df=1.0 are passed as the document-frequency bounds, and the
# vocabulary is capped at 10000 features.
vect = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=3,
    max_df=1.0,
    max_features=10000,
)

In [8]:
#Cross validation

def fit_model(model, model_name):
    """Cross-validate ``model`` behind the shared TF-IDF vectorizer.

    A two-step pipeline (``vect`` followed by ``model``, with the model step
    named ``model_name``) is scored on the module-level ``X_train`` and
    ``y_train`` using a shuffled 3-fold split seeded with 9.

    Parameters
    ----------
    model : estimator
        Classifier placed after the vectorizer.
    model_name : str
        Name given to the classifier step in the pipeline.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores, for accuracy
        and the weighted F1, precision and recall scorers.
    """
    folds = KFold(n_splits=3, shuffle=True, random_state=9)
    scorers = ('accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted')
    candidate = Pipeline(steps=[('vectorize', vect), (model_name, model)])
    return cross_validate(
        candidate,
        X_train,
        y_train,
        cv=folds,
        scoring=scorers,
        return_train_score=True,
    )
# Split the cleaned articles into training and test sets.
# NOTE(review): this repeats the split from the previous cell with
# test_size=0.25 instead of 0.2 and overwrites those variables — confirm which
# proportion is intended and keep a single split.
X_train, X_test, y_train, y_test = train_test_split(news_dataf['Text2'],
                                                    news_dataf['Class'],
                                                    test_size=0.25,
                                                    random_state=9)

# Cross-validate the three candidates. The forest is seeded (it was not before)
# so its scores are the same on every run, matching the seed used elsewhere.
randforest = fit_model(RandomForestClassifier(random_state=9), 'RF')
ridge = fit_model(RidgeClassifier(), 'Ridge')
bayes = fit_model(MultinomialNB(), 'NB')

# One row per fold, one column per recorded score/timing
rf = pd.DataFrame.from_dict(randforest)
rc = pd.DataFrame.from_dict(ridge)
bc = pd.DataFrame.from_dict(bayes)

l1 = [bc, rc, rf]
l2 = ["NB", "Ridge", "RF"]

# A scalar is broadcast to every fold row, so this no longer assumes exactly
# three folds the way the previous three-element list did.
for each, tag in zip(l1, l2):
    each['model'] = tag

joined_output = pd.concat([bc, rc, rf])

# Held-out scores that are summarised below
relevant_measures = ['test_accuracy', 'test_precision_weighted', 'test_recall_weighted', 'test_f1_weighted']

rand_forest_metrics = joined_output.loc[joined_output.model == 'RF'][relevant_measures]
nb_metrics = joined_output.loc[joined_output.model == 'NB'][relevant_measures]
r_metrics = joined_output.loc[joined_output.model == 'Ridge'][relevant_measures]

metrics_ = [rand_forest_metrics, nb_metrics, r_metrics]
names_ = ['Random Forest', 'Naive Bayes', 'Ridge Classifier']

# Mean of each held-out score across the folds, per model
for scores, namess in zip(metrics_, names_):
    print(f'{namess} Mean Metrics:')
    print(scores.mean())
    print('  ')

# Join training and test datasets so the final model can be fitted on every article
X = pd.concat([X_train,
               X_test])
y = pd.concat([y_train,
               y_test])


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Mean Metrics:
test_accuracy              0.722222
test_precision_weighted    0.611111
test_recall_weighted       0.722222
test_f1_weighted           0.643519
dtype: float64
  
Naive Bayes Mean Metrics:
test_accuracy              0.444444
test_precision_weighted    0.259259
test_recall_weighted       0.444444
test_f1_weighted           0.305556
dtype: float64
  
Ridge Classifier Mean Metrics:
test_accuracy              0.805556
test_precision_weighted    0.791667
test_recall_weighted       0.805556
test_f1_weighted           0.768519
dtype: float64
  


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
#Create and Fit a Text Classification Model
def create_and_fit(classf, x, y):
    """Fit a vectorize-then-classify pipeline and return it.

    The pipeline's first step is the module-level ``vect`` instance (passed in
    directly, not copied here) and its second step, named ``'model'``, is
    ``classf``.

    Parameters
    ----------
    classf : estimator
        Classifier used as the final pipeline step.
    x : iterable of str
        Texts to fit on.
    y : iterable
        Class label for each text.

    Returns
    -------
    Pipeline
        The value returned by ``Pipeline.fit(x, y)``.
    """
    steps = [('vectorize', vect), ('model', classf)]
    return Pipeline(steps).fit(x, y)
# Create model: fit a Multinomial Naive Bayes pipeline on X and y.
# NOTE(review): in the recorded cross-validation output Naive Bayes had the
# lowest mean test accuracy (0.44, against 0.81 for Ridge) — confirm that it is
# the intended final model.
CLASSIFIER = create_and_fit(MultinomialNB(), X, y)

In [10]:
def predict_user_input(classifier):
    """Prompt for an article on stdin and print its predicted class.

    The entered text is stripped of surrounding whitespace, cleaned with
    ``preprocess_of_text`` and passed to ``classifier.predict``. Blank input
    is reported and nothing is predicted, matching how the GUI ``predict``
    callback treats an empty box.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``predict`` on a list of strings.

    Returns
    -------
    None
        The outcome is printed rather than returned.
    """
    usr_input = input("Enter a news article text: ").strip()

    # Nothing to classify: say so instead of predicting on an empty string.
    if not usr_input:
        print("No text entered; nothing to classify.")
        return

    preprocessd_input = preprocess_of_text(usr_input)

    # Use classifier to query and find the class
    predict_class = classifier.predict([preprocessd_input])[0]

    print(f"Predicted class: {predict_class}")

# Prompt for one article on stdin and print the class CLASSIFIER predicts for it
predict_user_input(CLASSIFIER)

Enter a news article text: Aeolus was regarded as something of a revolution. Its ultraviolet laser tracked the movement of air in all locations, at every altitude, across the world.
Predicted class: science


In [None]:
#!pip install tk

In [11]:
import tkinter as tk
from tkinter import scrolledtext
from tkinter import END

# Function to make a prediction
def predict():
    """Classify the text in the input box and show the outcome in the result box.

    Reads the module-level ``usr_input_text`` widget. When it holds only
    whitespace a reminder is shown; otherwise the text is cleaned with
    ``preprocess_of_text``, classified with ``CLASSIFIER`` and the predicted
    class is shown. The message always replaces the previous content of
    ``result_text``.
    """
    def show(message):
        """Replace the content of the result box with ``message``."""
        # The box is kept disabled, so enable it just for the update.
        result_text.config(state=tk.NORMAL)
        result_text.delete(1.0, END)
        result_text.insert(tk.INSERT, message)
        result_text.config(state=tk.DISABLED)

    article = usr_input_text.get(1.0, END).strip()
    if article:
        label = CLASSIFIER.predict([preprocess_of_text(article)])[0]
        show(f"Predicted class: {label}")
    else:
        show("Enter the news article to find the class:")

# Main application window
root = tk.Tk()
root.title("Text Classifier GUI")

# Prompt label followed by the multi-line box the article is typed into
usr_input_label = tk.Label(master=root, text="Enter a news article text:")
usr_input_text = scrolledtext.ScrolledText(master=root, wrap=tk.WORD, height=5, width=40)
usr_input_label.pack()
usr_input_text.pack()

# Button that runs predict() when clicked
predict_button = tk.Button(master=root, command=predict, text="Predict")
predict_button.pack()

# Output box for the prediction; created disabled, predict() enables it only while writing
result_text = scrolledtext.ScrolledText(master=root, state=tk.DISABLED, wrap=tk.WORD, height=3, width=40)
result_text.pack()

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()