# **Import Libraries**

In [83]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from hiclass import LocalClassifierPerParentNode
from sklearn.pipeline import Pipeline
import time
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Loading DataSet**

In [92]:
# Load the dataset into a DataFrame
data = pd.read_csv("data_hierarchial_classif.csv")

df = data[['Title', 'Text', 'Cat1', 'Cat2', 'Cat3']]

df.isna().sum() #5 NA values in 'Title' column

df['Title']=df['Title'].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title']=df['Title'].fillna("")


# **Preprocessing feature columns**

In [93]:
# Define data pre-processing function

def preprocess_text(text):
    # Convert text to lowercase and remove special characters
    lower_case = text.lower().strip().replace('\n', ' ').replace('\r', ' ')
    alphabetic = re.sub(r'[^a-zA-Z\']', ' ', lower_case)
    without_link = re.sub(r'http\S+', '', alphabetic)

    # Tokenize text
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(without_link)

    # Remove stopwords
    stop = set(stopwords.words('english')) - set(["my", "haven't", "aren't", "can", "no", "why", "through", "herself", "she", "he", "himself", "you", "you're", "myself", "not", "here", "some", "do", "does", "did", "will", "don't", "doesn't", "didn't", "won't", "should", "should've", "couldn't", "mightn't", "mustn't", "shouldn't", "hadn't", "wasn't", "wouldn't"])
    filtered_tokens = [word for word in tokens if word not in stop]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # Join tokens back into text
    processed_text = " ".join(lemmatized_text)

    return processed_text

# Apply data pre-processing to 'Text' and 'Title' columns
df['Text_Updated'] = df['Text'].apply(preprocess_text)
df['Title_Updated'] = df['Title'].apply(preprocess_text)
df['Title_Text_Combined'] = df['Title_Updated'] + ' ' + df['Text_Updated']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Text_Updated'] = df['Text'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title_Updated'] = df['Title'].apply(preprocess_text)


# **Feature selection by evaluating model performance**

In [112]:
# Define the base classifier
base_classifier = LinearSVC(class_weight='balanced', random_state=42)

lcppn = LocalClassifierPerParentNode(
    local_classifier=base_classifier,
    verbose=0,
    n_jobs=1,
)

# Define the pipeline
pipeline = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', lcppn),
])

# Initialize variables to store the best feature and its corresponding F1 score
best_feature = None
best_f1_score = -1

# Iterate over each feature column
for feature_column in ["Text_Updated", "Title_Text_Combined", "Title_Updated"]:
    # Reset index
    df.reset_index(drop=True, inplace=True)
    X = df[feature_column].to_numpy()
    y = data[["Cat1", "Cat2", "Cat3"]].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the pipeline
    beginning = time.time()
    pipeline.fit(X_train, y_train)
    end = time.time()

    # Compute F1 score
    predictions = pipeline.predict(X_test)
    f1_hierarchical = f1(y_test, predictions)

    # Print F1 score
    print(f"F1 score for {feature_column}: {f1_hierarchical}")

    # Check if the current feature has a better F1 score
    if f1_hierarchical > best_f1_score:
        best_f1_score = f1_hierarchical
        best_feature = feature_column

# Print the best feature and its corresponding F1 score
print(f"Best feature: {best_feature}")
print(f"Best F1 score: {best_f1_score}")

F1 score for Text_Updated: 0.5925
F1 score for Title_Text_Combined: 0.8148333333333333
F1 score for Title_Updated: 0.8459999999999999
Best feature: Title_Updated
Best F1 score: 0.8459999999999999


# **Predictions using best feature**

In [113]:
# Save prediction dataframe for the best feature
X_best = df[best_feature].to_numpy()
y_best = df[["Cat1", "Cat2", "Cat3"]].to_numpy()
X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X_best, y_best, test_size=0.2, random_state=42)
pipeline.fit(X_train_best, y_train_best)
predictions_best = pipeline.predict(X_test_best)
predicted_df = pd.DataFrame(predictions_best, columns=["Predicted_Cat1", "Predicted_Cat2", "Predicted_Cat3"])

y_test_best_pandas = pd.DataFrame(y_test_best, columns=["Cat1", "Cat2", "Cat3"])

final_df = pd.concat([y_test_best_pandas, predicted_df], axis=1)

# Save the final DataFrame
final_df.to_csv("predicted_results_hiclass_svc.csv", index=False)

# **Category wise model evaluation**

In [109]:
# Assuming y_test and predictions are numpy arrays
y_test_cat1 = y_test_best[:, 0]  # Extract Cat1 labels
y_test_cat2 = y_test_best[:, 1]  # Extract Cat2 labels
y_test_cat3 = y_test_best[:, 2]  # Extract Cat3 labels

predictions_cat1 = predictions_best[:, 0]  # Extract Cat1 predictions
predictions_cat2 = predictions_best[:, 1]  # Extract Cat2 predictions
predictions_cat3 = predictions_best[:, 2]  # Extract Cat3 predictions

# Calculate precision, recall, accuracy, and F1-score for each category with macro and weighted averaging
def calculate_metrics(y_true, y_pred):

    accuracy = accuracy_score(y_true, y_pred)
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    precision_weighted = precision_score(y_true, y_pred, average='weighted')
    recall_weighted = recall_score(y_true, y_pred, average='weighted')


    return {
        'accuracy': accuracy,
        'f1_weighted': f1_weighted,
        'precision_weighted': precision_weighted,
        'recall_weighted': recall_weighted,
    }

# Calculate metrics for each category
metrics_cat1 = calculate_metrics(y_test_cat1, predictions_cat1)
metrics_cat2 = calculate_metrics(y_test_cat2, predictions_cat2)
metrics_cat3 = calculate_metrics(y_test_cat3, predictions_cat3)

# Create dataframes for each category
df_cat1 = pd.DataFrame(metrics_cat1, index=[0])
df_cat2 = pd.DataFrame(metrics_cat2, index=[0])
df_cat3 = pd.DataFrame(metrics_cat3, index=[0])

# Print the dataframes
print("Category 1 Metrics:")
display(df_cat1)
print()

print("Category 2 Metrics:")
display(df_cat2)
print()

print("Category 3 Metrics:")
display(df_cat3)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Category 1 Metrics:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,f1_weighted,precision_weighted,recall_weighted
0,0.923,0.922969,0.923114,0.923



Category 2 Metrics:


Unnamed: 0,accuracy,f1_weighted,precision_weighted,recall_weighted
0,0.835,0.835421,0.843195,0.835



Category 3 Metrics:


Unnamed: 0,accuracy,f1_weighted,precision_weighted,recall_weighted
0,0.775,0.768932,0.78177,0.775
