In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer

# Load dataset
df = pd.read_csv('/content/BBC News Train.csv')
print(df)

# Drop unnecessary columns
df.drop('ArticleId', axis=1, inplace=True)

# Clean text by removing punctuation and converting to lowercase
df['Text'] = df['Text'].str.replace('[^\w\s]', '').str.lower()

# Tokenize text by splitting it into words
df['Text'] = df['Text'].apply(nltk.word_tokenize)

# Remove stop words
stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(lambda x: [word for word in x if word not in stop_words])

# Perform stemming or lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
df['Text'] = df['Text'].apply(lambda x: [stemmer.stem(word) for word in x])

# Implement TF-IDF weighting scheme
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Text'].apply(' '.join))
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Add category column back to dataframe
df_tfidf['Category'] = df['Category']
print(df['Category'])

# Save cleaned and weighted dataset to CSV file
df_tfidf.to_csv('cleaned_weighted_dataset.csv', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


      ArticleId                                               Text  \
0          1833  worldcom ex-boss launches defence lawyers defe...   
1           154  german business confidence slides german busin...   
2          1101  bbc poll indicates economic gloom citizens in ...   
3          1976  lifestyle  governs mobile choice  faster  bett...   
4           917  enron bosses in $168m payout eighteen former e...   
...         ...                                                ...   
1485        857  double eviction from big brother model caprice...   
1486        325  dj double act revamp chart show dj duo jk and ...   
1487       1590  weak dollar hits reuters revenues at media gro...   
1488       1587  apple ipod family expands market apple has exp...   
1489        538  santy worm makes unwelcome visit thousands of ...   

           Category  
0          business  
1          business  
2          business  
3              tech  
4          business  
...             ...  
1485 

  df['Text'] = df['Text'].str.replace('[^\w\s]', '').str.lower()


0            business
1            business
2            business
3                tech
4            business
            ...      
1485    entertainment
1486    entertainment
1487         business
1488             tech
1489             tech
Name: Category, Length: 1490, dtype: object


In [None]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Load dataset
df = pd.read_csv('/content/BBC News Train.csv')

# Remove 'ArticleId' column
df.drop('ArticleId', axis=1, inplace=True)

# Define stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Define function to preprocess text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize text into words
    words = word_tokenize(text)
    
    # Remove stopwords and stem words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    # Join words back into text
    text = ' '.join(words)
    
    return text

# Apply preprocessing function to 'Text' column
df['Text'] = df['Text'].apply(preprocess_text)

# Save preprocessed dataset to CSV file
df.to_csv('preprocessed_dataset.csv', index=False)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed dataset
df = pd.read_csv('preprocessed_dataset.csv')

# Define TF-ICF vectorizer
tfidf = TfidfVectorizer(norm=None, use_idf=True, smooth_idf=False, sublinear_tf=False)

# Fit vectorizer to corpus
tfidf.fit(df['Text'])

# Compute TF-ICF weights for each document
tficf_weights = tfidf.transform(df['Text'])

# Convert sparse matrix to dense matrix
tficf_weights = tficf_weights.toarray()

# Add TF-ICF weights to dataframe
for i, feature in enumerate(tfidf.get_feature_names_out()):
    df[f'TF-ICF_{feature}'] = tficf_weights[:, i]

# Save dataframe to CSV file
df.to_csv('tficf_weighted_dataset.csv', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}'] = tficf_weights[:, i]
  df[f'TF-ICF_{feature}

In [None]:
from sklearn.model_selection import train_test_split

# Load TF-ICF weighted dataset
df = pd.read_csv('tficf_weighted_dataset.csv')

# Split dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Save training and testing sets to CSV files
train_df.to_csv('train_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)

In [None]:
train_documents = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')


In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

def compute_icf(documents):
    # Count the number of documents containing each term
    N = len(documents)
    doc_freq = {}
    for doc in documents:
        terms = set(doc)
        for term in terms:
            if term in doc_freq:
                doc_freq[term] += 1
            else:
                doc_freq[term] = 1
    
    # Compute the ICF value for each term
    icf_values = {}
    for term in doc_freq:
        icf_values[term] = np.log(N / doc_freq[term])
    print(icf_values)
    return icf_values

# Compute the ICF values for the training set
train_documents = train_df["Text"].tolist()
icf_values = compute_icf(train_documents)

# Define a function to compute the TF-ICF weights for each term in each document
def compute_tf_icf(document, icf_values):
    # Compute the term frequency (TF) for each term in the document
    term_freq = {}
    for term in document:
        if term in term_freq:
            term_freq[term] += 1
        else:
            term_freq[term] = 1
    
    # Compute the TF-ICF weight for each term in the document
    tf_icf = {}
    for term in term_freq:
        if term in icf_values:
            tf_icf[term] = term_freq[term] * icf_values[term]
        else:
            tf_icf[term] = 0.0
    print(tf_icf)
    return tf_icf

# Compute the TF-ICF weights for the training set
train_df["TF-ICF"] = train_df["Text"].apply(lambda x: compute_tf_icf(x, icf_values))

{'l': 0.0, 'm': 0.0, 'n': 0.0, 'i': 0.0, 'h': 0.0, 'r': 0.0, 'u': 0.0, 'f': 0.0, 'e': 0.0, 'a': 0.0, '3': 0.5409158871184905, '1': 0.18568290449665945, 'd': 0.0, ' ': 0.0, 'o': 0.0, 'j': 0.17563256864315796, 'c': 0.0, 'x': 0.12304011852398737, 's': 0.0, 'k': 0.0025199509345131947, 'b': 0.0, '9': 0.7083630277971985, 'w': 0.0, 'g': 0.0, 't': 0.0, 'y': 0.0033613477027049274, 'p': 0.0, 'v': 0.0025199509345131947, '4': 0.5510535554029458, '5': 0.4527044619829234, '0': 0.1716405473736206, 'q': 0.49308679942860906, '£': 1.2427461902518968, '6': 0.7554510638961002, '7': 0.7771125606772794, '2': 0.20918935217200083, '8': 0.7359986379692849, 'z': 0.7501082194856046}
{'f': 0.0, 'u': 0.0, 'm': 0.0, 'e': 0.0, ' ': 0.0, 'r': 0.0, 'o': 0.0, 'b': 0.0, 'i': 0.0, 'n': 0.0, 's': 0.0, 'l': 0.0, 'a': 0.0, 't': 0.0, 'c': 0.0, 'g': 0.0, 'd': 0.0, 'h': 0.0, 'v': 0.030239411214158338, 'w': 0.0, 'y': 0.04033617243245913, '1': 0.3713658089933189, '9': 0.7083630277971985, '3': 0.5409158871184905, 'x': 0.492160474

In [None]:
def compute_tf_icf_cat(category_term_counts, train_df, category):
    # Compute the TF-ICF values for each term in the category
    num_documents = len(train_df)
    category_docs = train_df[train_df["Category"] == category]
    num_category_docs = len(category_docs)
    tf_icf = {}
    
    for term in category_term_counts:
        # Compute term frequency (TF)
        term_freq = category_term_counts[term] / sum(category_term_counts.values())

        # Compute inverse category frequency (ICF)
        num_category_with_term = sum(1 for doc in category_docs["Text"] if term in doc)
        num_other_categories_with_term = sum(1 for _, doc in train_df[train_df["Category"] != category]["Text"].items() if term in doc)
        icf = math.log((num_documents - num_other_categories_with_term) / (num_category_with_term + 1))

        # Compute TF-ICF
        tf_icf[term] = term_freq * icf
        
    return tf_icf

In [None]:
def train_naive_bayes(train_df):
    # Compute the prior probability of each category
    num_documents = len(train_df)
    category_counts = train_df["Category"].value_counts()
    print(category_counts)
    prior_probs = {}
    for category in category_counts.index:
        prior_probs[category] = category_counts[category] / num_documents
    
    # Compute the TF-ICF weights for each term in each category
    category_term_weights = {}
    for category in category_counts.index:
        category_docs = train_df[train_df["Category"] == category]
        category_term_counts = Counter()
        for text in category_docs["Text"]:
            tokens = preprocess_text(text)
            category_term_counts.update(tokens)
        category_term_weights[category] = compute_tf_icf_cat(category_term_counts, train_df, category)
    
    return prior_probs, category_term_weights

In [None]:
def test_naive_bayes(test_df, prior_probs, category_term_weights):
    # Classify each document in the testing set
    predictions = []
    for text in test_df["Text"]:
        tokens = preprocess_text(text)
        category_scores = {}
        for category in prior_probs.keys():
            category_score = math.log(prior_probs[category])
            for token in tokens:
                if token in category_term_weights[category]:
                    category_score += category_term_weights[category][token]
            category_scores[category] = category_score
        prediction = max(category_scores, key=category_scores.get)
        predictions.append(prediction)
    
    # Calculate the accuracy, precision, recall, and F1 score
    actual_categories = test_df["Category"].tolist()
    accuracy = accuracy_score(actual_categories, predictions)
    precision, recall, f1_score, _ = precision_recall_fscore_support(actual_categories, predictions, average="weighted")
    
    return accuracy, precision, recall, f1_score

In [None]:
# Train the Naive Bayes classifier on the training set
prior_probs, category_term_weights = train_naive_bayes(pd.read_csv('train_dataset.csv'))

# Test the Naive Bayes classifier on the testing set
accuracy, precision, recall, f1_score = test_naive_bayes(test_df, prior_probs, category_term_weights)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

sport            245
business         228
entertainment    194
tech             188
politics         188
Name: Category, dtype: int64
Accuracy: 0.22595078299776286
Precision: 0.05105375633730212
Recall: 0.22595078299776286
F1 Score: 0.08328842730939433


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def train_imp_naive_bayes(train_df):
    # Remove unnecessary columns
    train_df = train_df[['Text', 'Category']]

    # Preprocess text
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    train_df['Text'] = train_df['Text'].str.lower().str.replace('[^\w\s]', '')
    train_df['Text'] = train_df['Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x) if word not in stop_words]))

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(train_df['Text'], train_df['Category'], test_size=0.3, random_state=42)

    # Vectorize with TF-IDF
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Compute ICF
    n_docs = X_train.shape[0]
    icf = np.log(n_docs / (1 + np.sum(X_train > 0, axis=0)))

    # Multiply TF with ICF to get TF-ICF weights
    X_train = X_train.multiply(icf)
    X_test = X_test.multiply(icf)

    # Train Naive Bayes classifier
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Evaluate accuracy on testing set
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return clf, vectorizer, icf, accuracy

In [None]:
clf, vector, icf, acc = train_imp_naive_bayes(pd.read_csv('train_dataset.csv'))
print("Accuracy:", acc)

  train_df['Text'] = train_df['Text'].str.lower().str.replace('[^\w\s]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Text'] = train_df['Text'].str.lower().str.replace('[^\w\s]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Text'] = train_df['Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x) if word not in stop_words]))


Accuracy: 0.9616613418530351


In [None]:
from sklearn.model_selection import train_test_split

# Load TF-ICF weighted dataset
df = pd.read_csv('preprocessed_dataset.csv')

# Split dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save training and testing sets to CSV files
train_df.to_csv('train_dataset1.csv', index=False)
test_df.to_csv('test_dataset1.csv', index=False)

In [None]:
train_documents = pd.read_csv('train_dataset1.csv')

In [None]:
# Train the Naive Bayes classifier on the training set
prior_probs, category_term_weights = train_naive_bayes(pd.read_csv('train_dataset1.csv'))

# Test the Naive Bayes classifier on the testing set
accuracy, precision, recall, f1_score = test_naive_bayes(pd.read_csv('test_dataset1.csv'), prior_probs, category_term_weights)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
clf, vector, icf, acc = train_imp_naive_bayes(pd.read_csv('train_dataset1.csv'))
print("Accuracy:", acc)

sport            283
business         261
entertainment    227
politics         218
tech             203
Name: Category, dtype: int64


  _warn_prf(average, modifier, msg_start, len(result))
  train_df['Text'] = train_df['Text'].str.lower().str.replace('[^\w\s]', '')


Accuracy: 0.21140939597315436
Precision: 0.04469393270573398
Recall: 0.21140939597315436
F1 Score: 0.0737883210321813
Accuracy: 0.9636871508379888


In [None]:
from sklearn.model_selection import train_test_split

# Load TF-ICF weighted dataset
df = pd.read_csv('tficf_weighted_dataset.csv')

# Split dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)

# Save training and testing sets to CSV files
train_df.to_csv('train_dataset2.csv', index=False)
test_df.to_csv('test_dataset2.csv', index=False)

In [None]:
prior_probs, category_term_weights = train_naive_bayes(pd.read_csv('train_dataset2.csv'))

# Test the Naive Bayes classifier on the testing set
accuracy, precision, recall, f1_score = test_naive_bayes(pd.read_csv('test_dataset2.csv'), prior_probs, category_term_weights)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
clf, vector, icf, acc = train_imp_naive_bayes(pd.read_csv('train_dataset2.csv'))
print("Accuracy:", acc)

In [None]:
train_documents = pd.read_csv('train_dataset3.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Load TF-ICF weighted dataset
df = pd.read_csv('tficf_weighted_dataset.csv')

# Split dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.5, random_state=42)

# Save training and testing sets to CSV files
train_df.to_csv('train_dataset3.csv', index=False)
test_df.to_csv('test_dataset3.csv', index=False)

In [None]:
prior_probs, category_term_weights = train_naive_bayes(pd.read_csv('train_dataset3.csv'))

# Test the Naive Bayes classifier on the testing set
accuracy, precision, recall, f1_score = test_naive_bayes(pd.read_csv('test_dataset3.csv'), prior_probs, category_term_weights)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
clf, vector, icf, acc = train_imp_naive_bayes(pd.read_csv('train_dataset2.csv'))
print("Accuracy:", acc)