In [1]:
import pandas as pd
from func import text_cleansing as tc
from func import tf_idf as tfidf

# Data Collection

In [2]:
# Load the scraped detik.com article listing into a DataFrame and preview it
raw_csv_path = 'data/2023_0531-0315_1000_False_detikcom.csv'
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,title,category,publish_date,article_url,content
0,"Pasar Sawit Dihambat, RI & Malaysia Langsung S...",detikFinance,31 Mei 2023 23:55 WIB,https://finance.detik.com/industri/d-6749907/p...,
1,Nggak Nyangka! Negara Tetangga Ini Pesaing RI ...,detikFinance,31 Mei 2023 23:06 WIB,https://finance.detik.com/industri/d-6749887/n...,
2,Cuan Ratusan Juta Rupiah dari Bisnis Makanan Beku,detikFinance,31 Mei 2023 23:00 WIB,https://finance.detik.com/solusiukm/d-6749882/...,
3,"Genjot Kendaraan Listrik, RI Jaring Produsen M...",detikFinance,31 Mei 2023 22:21 WIB,https://finance.detik.com/industri/d-6749860/g...,
4,"Insentif buat Bus Listrik Ada, Cek di Sini Pen...",detikFinance,31 Mei 2023 21:17 WIB,https://finance.detik.com/industri/d-6749780/i...,
...,...,...,...,...,...
6995,"Siap-siap UTBK 2023, Intip Tata Tertib UTBK & ...",detikEdu,01 Mei 2023 09:00 WIB,https://www.detik.com/edu/seleksi-masuk-pt/d-6...,
6996,Beasiswa ke Jepang MEXT Scholarship 2024 Jenja...,detikEdu,01 Mei 2023 08:00 WIB,https://www.detik.com/edu/beasiswa/d-6696920/b...,
6997,Kurang Waktu Bermain Mandiri Tingkatkan Ganggu...,detikEdu,01 Mei 2023 07:00 WIB,https://www.detik.com/edu/detikpedia/d-6695590...,
6998,IPB Masih Buka Program S1 Beasiswa Utusan Daer...,detikEdu,01 Mei 2023 06:00 WIB,https://www.detik.com/edu/beasiswa/d-6696919/i...,


# Data Preprocessing

## Data Cleaning

In [3]:
# Count rows that are exact duplicates of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [4]:
# Count missing values per column (isna is the canonical name of isnull)
df.isna().sum()

title              0
category           0
publish_date       0
article_url        0
content         7000
dtype: int64

In [5]:
# Drop the features that are not needed downstream:
# publish_date, article_url and content
unused_columns = ['publish_date', 'article_url', 'content']
df = df.drop(columns=unused_columns)
df

Unnamed: 0,title,category
0,"Pasar Sawit Dihambat, RI & Malaysia Langsung S...",detikFinance
1,Nggak Nyangka! Negara Tetangga Ini Pesaing RI ...,detikFinance
2,Cuan Ratusan Juta Rupiah dari Bisnis Makanan Beku,detikFinance
3,"Genjot Kendaraan Listrik, RI Jaring Produsen M...",detikFinance
4,"Insentif buat Bus Listrik Ada, Cek di Sini Pen...",detikFinance
...,...,...
6995,"Siap-siap UTBK 2023, Intip Tata Tertib UTBK & ...",detikEdu
6996,Beasiswa ke Jepang MEXT Scholarship 2024 Jenja...,detikEdu
6997,Kurang Waktu Bermain Mandiri Tingkatkan Ganggu...,detikEdu
6998,IPB Masih Buka Program S1 Beasiswa Utusan Daer...,detikEdu


In [6]:
# Count how many rows belong to each category value
df['category'].value_counts()

detikFinance    1000
detikSport      1000
detikHealth     1000
detikFood       1000
detikOto        1000
detikTravel     1000
detikEdu        1000
Name: category, dtype: int64

In [7]:
# Collect every value of the title column into a plain Python list
title_list = df['title'].tolist()
title_list

['Pasar Sawit Dihambat, RI & Malaysia Langsung Sambangi Uni Eropa',
 'Nggak Nyangka! Negara Tetangga Ini Pesaing RI Produksi Kendaraan Listrik',
 'Cuan Ratusan Juta Rupiah dari Bisnis Makanan Beku',
 'Genjot Kendaraan Listrik, RI Jaring Produsen Mobil dari China',
 'Insentif buat Bus Listrik Ada, Cek di Sini Penjelasannya',
 'Anak Buah Luhut Beberkan Tantangan Garap Kendaraan Listrik di RI',
 'Saham GOTO Melesat hingga ARA, Ada Apa Nih?',
 'Ini Solusi Genjot Pasokan Listrik di Batam',
 'Trenggono Tegaskan Ekspor Pasir Laut Bukan Jual Negara',
 'Terungkap! Biang Kerok Harga Telur yang Makin Mahal',
 'Menteri Kelautan Ungkap Alasan Ekspor Pasir Laut Dibuka Lagi',
 'The Body Shop Buka-bukaan Soal PHK 146 Karyawan',
 'Naik MRT Jakarta Bisa Sambil Belanja, Barang Diambil di Stasiun',
 'Pasir Laut RI Boleh Diekspor ke Singapura hingga Jepang, Asal...',
 'Harga Telur Mahal, Pemerintah Was-was Inflasi Tinggi',
 '3 Perusahaan Eropa Mau Bikin Pabrik Baterai Mobil Listrik di RI',
 'Ada Rumah Tua 

In [8]:
# Text cleaning: run the titles through each step of the project's
# text_cleansing module, feeding the output of one step into the next.
# NOTE(review): stop-word removal runs before case folding; confirm that
# tc.stopword_removal matches capitalised words as well.
cleaning_steps = (
    tc.tokenizing,
    tc.stopword_removal,
    tc.case_folding,
    tc.stemming,
    tc.clean_doc,
)
for apply_step in cleaning_steps:
    title_list = apply_step(title_list)
title_list

['pasar sawit hambat ri malaysia sambang uni eropa',
 'nggak nyangka negara tetangga saing ri produksi kendara listrik',
 'cuan ratus juta rupiah bisnis makan beku',
 'genjot kendara listrik ri jaring produsen mobil china',
 'insentif bus listrik cek jelas',
 'anak buah luhut kan tantang garap kendara listrik ri',
 'saham goto lesat ara nih',
 'solusi genjot pasok listrik batam',
 'trenggono tegas ekspor pasir laut jual negara',
 'ungkap biang kerok harga telur mahal',
 'menteri laut alas ekspor pasir laut buka',
 'the body shop buka buka phk karyawan',
 'mrt jakarta belanja barang ambil stasiun',
 'pasir laut ri ekspor singapura jepang',
 'harga telur mahal perintah was was inflasi tinggi',
 'usaha eropa bikin pabrik baterai mobil listrik ri',
 'rumah tua dihimpit apartemen mewah kelola wajib akses',
 'trenggono buka buka alas izin ekspor pasir laut buka',
 'ungkap sebab pesawat garuda masalah putar manado',
 'eddy soib maksimal kur bazar umkm bri',
 'nasib impor krl bekas kemenperin 

## Data Transformation

In [9]:
# Rename the values of the category feature from the raw detik channel
# names to short English labels
cat_dict = {'detikFinance': 'finance',
            'detikSport': 'sport',
            'detikHealth': 'health',
            'detikFood': 'food',
            'detikOto': 'automotive',
            'detikTravel': 'travel',
            'detikEdu': 'education'}

# replace() leaves values that are not keys of cat_dict untouched, so running
# this cell a second time is a no-op. map() would instead turn the
# already-renamed labels (and any unexpected category) into NaN.
df['category'] = df['category'].replace(cat_dict)
df

Unnamed: 0,title,category
0,"Pasar Sawit Dihambat, RI & Malaysia Langsung S...",finance
1,Nggak Nyangka! Negara Tetangga Ini Pesaing RI ...,finance
2,Cuan Ratusan Juta Rupiah dari Bisnis Makanan Beku,finance
3,"Genjot Kendaraan Listrik, RI Jaring Produsen M...",finance
4,"Insentif buat Bus Listrik Ada, Cek di Sini Pen...",finance
...,...,...
6995,"Siap-siap UTBK 2023, Intip Tata Tertib UTBK & ...",education
6996,Beasiswa ke Jepang MEXT Scholarship 2024 Jenja...,education
6997,Kurang Waktu Bermain Mandiri Tingkatkan Ganggu...,education
6998,IPB Masih Buka Program S1 Beasiswa Utusan Daer...,education


# Feature Engineering

## TF-IDF

In [10]:
# Build the word list from the cleaned titles, then compute TF against it
word_list = tfidf.get_list_word(title_list)
tf = tfidf.get_tf(title_list, word_list)

In [11]:
# Put the TF result into a DataFrame for inspection
# (the displayed frame has one row per term and one column per document, D0..Dn)
df_tf = tfidf.get_df_tf_wqt(tf)

In [12]:
# Preview the term-frequency table
df_tf

Unnamed: 0,D0,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D6990,D6991,D6992,D6993,D6994,D6995,D6996,D6997,D6998,D6999
aa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaji,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aare,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aba,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zonk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zs,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zulhas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Compute the IDF value of every word from the TF result.
# NOTE(review): the word list is rebuilt here with the same call used for tf;
# presumably it could be computed once and reused - confirm get_list_word is deterministic.
idf = tfidf.get_idf(tf, tfidf.get_list_word(title_list))

In [14]:
# Put the IDF values into a DataFrame for inspection
# (the displayed frame has one row per term and a single IDF column)
df_idf = tfidf.get_df_idf(idf)

In [15]:
# Preview the IDF table
df_idf

Unnamed: 0,IDF
aa,3.845
aaji,3.845
aal,3.544
aare,3.845
aba,3.845
...,...
zonk,2.766
zoo,3.845
zs,3.845
zulhas,2.942


In [16]:
# Combine the TF and IDF results into per-term weights
# (presumably TF x IDF - confirm in func.tf_idf.get_wqt)
wqt = tfidf.get_wqt(tf, idf)

In [17]:
# Transpose the weight table so each row is a document and each column a term,
# then replace the row labels with a fresh 0..n-1 index.
# reset_index(drop=True) discards the old labels directly instead of inserting
# them as a column and dropping it by name afterwards, so a vocabulary term
# that happens to be called 'index' can no longer clash with that column.
df_wqt = tfidf.get_df_tf_wqt(wqt).T.reset_index(drop=True)

In [18]:
# Preview the document-by-term weight matrix
df_wqt

Unnamed: 0,aa,aaji,aal,aare,aba,abad,abah,abai,abal,abang,...,zohri,zombie,zona,zonasi,zone,zonk,zoo,zs,zulhas,zuppa
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Place the title and category columns side by side with the term-weight
# features; rows are aligned on the index
label_columns = df[['title', 'category']]
df_clean = pd.concat([label_columns, df_wqt], axis=1)

In [20]:
# Preview the final dataset (title, category, then one weight column per term)
df_clean

Unnamed: 0,title,category,aa,aaji,aal,aare,aba,abad,abah,abai,...,zohri,zombie,zona,zonasi,zone,zonk,zoo,zs,zulhas,zuppa
0,"Pasar Sawit Dihambat, RI & Malaysia Langsung S...",finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Nggak Nyangka! Negara Tetangga Ini Pesaing RI ...,finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cuan Ratusan Juta Rupiah dari Bisnis Makanan Beku,finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Genjot Kendaraan Listrik, RI Jaring Produsen M...",finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Insentif buat Bus Listrik Ada, Cek di Sini Pen...",finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,"Siap-siap UTBK 2023, Intip Tata Tertib UTBK & ...",education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6996,Beasiswa ke Jepang MEXT Scholarship 2024 Jenja...,education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6997,Kurang Waktu Bermain Mandiri Tingkatkan Ganggu...,education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,IPB Masih Buka Program S1 Beasiswa Utusan Daer...,education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save df_clean to CSV

In [21]:
# Write the final dataset to disk without the row index
clean_csv_path = 'data/2023_0531-0315_1000_False_detikcom_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)