In [6]:
import pandas as pd
import os
from sklearn.preprocessing import MultiLabelBinarizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch every NLTK resource the notebook relies on so it runs on a fresh
# environment:
#   - 'stopwords' : English stop-word list used by remove_stopwords
#   - 'punkt' / 'punkt_tab' : tokenizer models required by word_tokenize
#     (older NLTK releases look for 'punkt', newer ones for 'punkt_tab';
#     requesting an id the installed version does not know only logs an error)
#   - 'wordnet' : lexical database used by WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jurgen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jurgen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def remove_tags(text):
  """Strip HTML/XML-style tags from ``text``.

  Anything between a ``<`` and the next ``>`` (including across line breaks)
  is removed; the text between tags is kept as-is.

  Args:
    text: The string to clean.

  Returns:
    ``text`` with every ``<...>`` span deleted.
  """
  # The previous pattern was the empty regex, which matches only empty
  # strings and therefore left the input untouched.
  remove = re.compile(r'<[^>]+>')
  return re.sub(remove, '', text)

def special_char(text):
  """Replace every non-alphanumeric character in ``text`` with a space.

  Alphanumeric characters (as judged by ``str.isalnum``) are kept unchanged,
  so the result always has the same length as the input.
  """
  return ''.join(ch if ch.isalnum() else ' ' for ch in text)

def convert_lower(text):
   """Return a lower-cased copy of ``text``."""
   lowered = text.lower()
   return lowered


def remove_stopwords(text):
  """Tokenize ``text`` and drop English stop words.

  Returns:
    A list of the tokens produced by ``word_tokenize`` that are not in
    NLTK's English stop-word list, in their original order.
  """
  english_stopwords = set(stopwords.words('english'))
  kept_tokens = []
  for token in word_tokenize(text):
    if token not in english_stopwords:
      kept_tokens.append(token)
  return kept_tokens

def lemmatize_word(text):
  """Lemmatize each item of ``text`` and join the results with single spaces.

  ``text`` is iterated item by item; in this notebook it is the token list
  returned by ``remove_stopwords``.

  Returns:
    One space-separated string of lemmatized items.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = map(lemmatizer.lemmatize, text)
  return " ".join(lemmas)



In [47]:
# Every sub-directory of ./articles is one category and every file inside it
# is one article. os.listdir returns entries in arbitrary order, so sort for a
# reproducible row order, and keep only directories so stray files
# (e.g. .DS_Store) do not break the per-category listing.
categories = sorted(
    entry for entry in os.listdir('./articles')
    if os.path.isdir(os.path.join('./articles', entry))
)
all_features = []

for category in categories:
    category_dir = os.path.join('./articles', category)
    files = sorted(os.listdir(category_dir))
    for file in files:
        file_path = os.path.join(category_dir, file)
        # Skip nested directories such as .ipynb_checkpoints.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            article = f.read()
        all_features.append((category, file, article))

# Create DataFrame from collected features: one row per (category, file) pair.
dataset = pd.DataFrame(all_features, columns=['Category', 'File', 'Text'])

# Remove empty articles; whitespace-only files count as empty too.
dataset = dataset[dataset.Text.str.strip().str.len() > 0].reset_index(drop=True)

In [48]:
# Preview the first rows of the loaded articles (Category, File, Text).
dataset.head()

Unnamed: 0,Category,File,Text
0,adulting-101,5_smart_ways_to_stretch_your_dollar_with_GrabF...,The app’s new features serve up both convenien...
1,adulting-101,Adulting_101_People_around_me_are_job_hopping_...,Adulthood is an invigorating stage of life as ...
2,adulting-101,Airbnb_bans_security_cameras_inside_guest_home...,SAN FRANCISCO — Airbnb on Monday (March 11) sa...
3,adulting-101,As_it_happened_Pritam_Singh_pleads_not_guilty_...,SINGAPORE: Leader of the Opposition Pritam Sin...
4,adulting-101,As_Swiftonomics_sweeps_through_Singapore_small...,SINGAPORE — Swiftonomics has swept through Sin...


In [49]:
# Run the cleaning stages over the Text column in order. Each stage receives
# the previous stage's output: remove_stopwords turns every string into a
# token list and lemmatize_word joins it back into a single string.
for clean_step in (remove_tags, special_char, convert_lower,
                   remove_stopwords, lemmatize_word):
    dataset['Text'] = dataset['Text'].apply(clean_step)

In [50]:
# Preview the first rows again to inspect the Text column after cleaning.
dataset.head()

Unnamed: 0,Category,File,Text
0,adulting-101,5_smart_ways_to_stretch_your_dollar_with_GrabF...,app new feature serve convenience value whethe...
1,adulting-101,Adulting_101_People_around_me_are_job_hopping_...,adulthood invigorating stage life young people...
2,adulting-101,Airbnb_bans_security_cameras_inside_guest_home...,san francisco airbnb monday march 11 said bann...
3,adulting-101,As_it_happened_Pritam_Singh_pleads_not_guilty_...,singapore leader opposition pritam singh charg...
4,adulting-101,As_Swiftonomics_sweeps_through_Singapore_small...,singapore swiftonomics swept singapore america...


In [69]:
# Pull the columns out as arrays so they can be attached to the label frame
# positionally (the label frame gets a fresh 0..n-1 index).
title = dataset['File'].values
features = dataset['Text'].values
# One single-element label list per row: the category folder name up to its
# first underscore.
labels = dataset['Category'].str.split('_').apply(lambda x: [x[0]])

# Step 3: Encode Labels (one 0/1 indicator column per distinct label)
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(labels)

# Step 4: Create Multi-label Dataset
multi_label_df = pd.DataFrame(binary_labels, columns=mlb.classes_)
multi_label_df['Text'] = features
multi_label_df['File'] = title

columns = ["File", 'Text'] + list(mlb.classes_)
multi_label_df = multi_label_df[columns]

# Collapse rows sharing a file name into one row per file. Label indicators
# are combined with max, so a file is flagged for every category it appeared
# in. Text is taken from the first occurrence rather than via max, which
# would pick the lexicographically greatest string among the duplicates.
aggregations = {'Text': 'first'}
aggregations.update({label: 'max' for label in list(mlb.classes_)})
multi_label_df = multi_label_df.groupby("File").agg(aggregations)

multi_label_df

Unnamed: 0_level_0,Text,adulting-101,big-read,commentary,gen-y-speaks,gen-z-speaks,singapore,voices,world
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5_smart_ways_to_stretch_your_dollar_with_GrabFood.txt,app new feature serve convenience value whethe...,1,1,1,1,1,1,1,1
Adulting_101_People_around_me_are_job_hopping_Am_I_missing_out_by_not_doing_the_same.txt,adulthood invigorating stage life young people...,1,1,1,1,1,1,1,1
Airbnb_bans_security_cameras_inside_guest_homes.txt,san francisco airbnb monday march 11 said bann...,1,1,1,1,1,1,1,1
As_Swiftonomics_sweeps_through_Singapore_small_businesses_say_concerts_a_boon_and_a_bane_for_them.txt,singapore swiftonomics swept singapore america...,1,1,1,1,1,1,1,1
As_it_happened_Pritam_Singh_pleads_not_guilty_to_charges_of_lying_to_parliament_committee_over_Raeesah_Khan_s_case.txt,singapore leader opposition pritam singh charg...,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...
Why_you_should_wear_earplugs_at_concerts_and_other_useful_medical_advice_from_a_TikTok_famous_GP_from_S_pore.txt,article may contain affiliate link buy link ma...,1,0,1,0,0,0,0,0
Woman_arrested_for_Taylor_Swift_concert_ticket_scams_on_Carousell.txt,singapore 29 year old woman arrested monday ma...,1,1,1,1,0,0,0,0
Woman_charged_with_cheating_Taylor_Swift_fan_of_S_350_for_Singapore_concert_tickets.txt,singapore 29 year old woman tuesday mar 12 cha...,0,0,0,0,0,1,1,0
Your_Say_3_reasons_why_S_pore_is_justified_in_restricting_genetic_screening_of_IVF_embryos.txt,refer article baby step toughest period life s...,1,1,1,1,1,1,1,1


In [78]:
# File is the index after the groupby; turn it back into a regular column so
# it is written out, then save without the positional index.
export_df = multi_label_df.reset_index()
export_df.to_csv('multi_label_dataset.csv', index=False)

Unnamed: 0,File,Text,adulting-101,big-read,commentary,gen-y-speaks,gen-z-speaks,singapore,voices,world
0,5_smart_ways_to_stretch_your_dollar_with_GrabF...,app new feature serve convenience value whethe...,1,1,1,1,1,1,1,1
1,Adulting_101_People_around_me_are_job_hopping_...,adulthood invigorating stage life young people...,1,1,1,1,1,1,1,1
2,Airbnb_bans_security_cameras_inside_guest_home...,san francisco airbnb monday march 11 said bann...,1,1,1,1,1,1,1,1
3,As_Swiftonomics_sweeps_through_Singapore_small...,singapore swiftonomics swept singapore america...,1,1,1,1,1,1,1,1
4,As_it_happened_Pritam_Singh_pleads_not_guilty_...,singapore leader opposition pritam singh charg...,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
105,Why_you_should_wear_earplugs_at_concerts_and_o...,article may contain affiliate link buy link ma...,1,0,1,0,0,0,0,0
106,Woman_arrested_for_Taylor_Swift_concert_ticket...,singapore 29 year old woman arrested monday ma...,1,1,1,1,0,0,0,0
107,Woman_charged_with_cheating_Taylor_Swift_fan_o...,singapore 29 year old woman tuesday mar 12 cha...,0,0,0,0,0,1,1,0
108,Your_Say_3_reasons_why_S_pore_is_justified_in_...,refer article baby step toughest period life s...,1,1,1,1,1,1,1,1
