In [1]:
import pandas as pd
import re
import numpy as np
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Test Klasifikasi Judul Teks

In [2]:
df_politik = pd.read_csv(r'../detik_politik.csv').sample(frac=0.371, replace=False, random_state=42)
print(df_politik.shape)

df_pemerintahan = pd.read_csv(r'../detik_pemerintahan.csv').sample(frac=0.371, replace=False, random_state=42)
print(df_pemerintahan.shape)

df_non_PP = pd.read_csv(r'../detik_kesehatan.csv')
print(df_non_PP.shape)

(2948, 5)
(2960, 5)
(6135, 5)


In [3]:
df_politik = df_politik.drop_duplicates(subset=['Title'])
df_pemerintahan = df_pemerintahan.drop_duplicates(subset=['Title'])
df_non_PP = df_non_PP.drop_duplicates(subset=['Title'])

In [4]:
print(df_politik.shape)
print(df_pemerintahan.shape)
print(df_non_PP.shape)

(2766, 5)
(2682, 5)
(5447, 5)


In [10]:
df_politik['Klasifikasi Berita'] = 'Politik & Pemerintahan'
df_pemerintahan['Klasifikasi Berita'] = 'Politik & Pemerintahan'
df_non_PP['Klasifikasi Berita'] = 'Non PP'

In [11]:
df_titlecombine = pd.concat([df_politik, df_pemerintahan, df_non_PP], ignore_index=True)
df_titlecombine['Title'] = df_titlecombine['Title'].astype(str)
df_titlecombine.head()

Unnamed: 0,Title,Link,Time,Narasi,Klasifikasi Berita
0,Cak Imin Pastikan PKB dan PKS Tak Kedepankan P...,https://news.detik.com/detiktv/d-6927477/cak-i...,"Selasa, 12 Sep 2023 18:27 WIB",['Ketua Umum PKB Muhaimin Iskandar (Cak Imin) ...,Politik & Pemerintahan
1,"Masuk Tahun Politik, Pembangunan MRT Jakarta D...",https://finance.detik.com/infrastruktur/d-6749...,"Rabu, 31 Mei 2023 17:09 WIB",['Direktur Utama PT MRT Jakarta (Perseroda) Tu...,Politik & Pemerintahan
2,Satire Emil Dardak Ketika Dituduh Jadi Joki Gi...,https://www.detik.com/jatim/berita/d-7165313/s...,"Senin, 29 Jan 2024 08:00 WIB",['Emil Dardak sebagai salah satu juru bicara C...,Politik & Pemerintahan
3,"Wanti-wanti JK, Pemerintahan Jokowi Bisa Jatuh...",https://news.detik.com/detiktv/d-7039140/wanti...,"Rabu, 15 Nov 2023 21:09 WIB","['Wakil Presiden RI ke-10, Jusuf Kalla menyoro...",Politik & Pemerintahan
4,Elite TKN: Poster 'Kabinet Indonesia Emas' Pra...,https://news.detik.com/pemilu/d-7202178/elite-...,"Selasa, 20 Feb 2024 11:59 WIB","[""Sekretaris Tim Kampanye Nasional (TKN) Prabo...",Politik & Pemerintahan


In [12]:
df_titlecombine.shape

(10895, 5)

In [13]:
def clean_text(text):
    # Mengubah teks menjadi lowercase
    text = text.lower()

    # Menghapus simbol-simbol, tanda baca, dan angka
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenisasi pada judul berita
    words = word_tokenize(text)

    # Menghapus stopword
    stop_words_nltk = stopwords.words('indonesian')
    
    filter_words = [word for word in words if word not in stop_words_nltk]

    # Proses stemming
    stemmer = StemmerFactory().create_stemmer()
    stemmed_words = [stemmer.stem(word) for word in filter_words]

    cleaned_text = ' '.join(stemmed_words)

    return cleaned_text

In [14]:
df_titlecombine['Klasifikasi Berita'].value_counts()

Politik & Pemerintahan    5448
Non PP                    5447
Name: Klasifikasi Berita, dtype: int64

In [15]:
df_titlecombine = df_titlecombine[['Title', 'Klasifikasi Berita']]

In [16]:
df_titlecombine.head()

Unnamed: 0,Title,Klasifikasi Berita
0,Cak Imin Pastikan PKB dan PKS Tak Kedepankan P...,Politik & Pemerintahan
1,"Masuk Tahun Politik, Pembangunan MRT Jakarta D...",Politik & Pemerintahan
2,Satire Emil Dardak Ketika Dituduh Jadi Joki Gi...,Politik & Pemerintahan
3,"Wanti-wanti JK, Pemerintahan Jokowi Bisa Jatuh...",Politik & Pemerintahan
4,Elite TKN: Poster 'Kabinet Indonesia Emas' Pra...,Politik & Pemerintahan


In [17]:
# Membersihkan teks pada kolom 'Title'
df_titlecombine['Title'] = df_titlecombine['Title'].apply(clean_text)
df_titlecombine.head(10)

Unnamed: 0,Title,Klasifikasi Berita
0,cak imin pasti pkb pks depan politik identitas,Politik & Pemerintahan
1,masuk politik bangun mrt jakarta jamin jalan,Politik & Pemerintahan
2,satire emil dardak tuduh joki gibran debat caw...,Politik & Pemerintahan
3,wantiwanti jk perintah jokowi jatuh krisis pol...,Politik & Pemerintahan
4,elite tkn poster kabinet indonesia emas prabow...,Politik & Pemerintahan
5,ajak raffinagita lampung zulhasputri raya poli...,Politik & Pemerintahan
6,tkn prabowo proporsional debat serang personal,Politik & Pemerintahan
7,ksad tindak cepat prajurit tni langgar netrali...,Politik & Pemerintahan
8,nasdem tepis denny rumor anies sangka jegal,Politik & Pemerintahan
9,demokrat pamer lukis sby no justice no peace m...,Politik & Pemerintahan


In [21]:
df_titlecombine.to_csv("end_resultforpolitik.csv", index=False)

In [18]:
def label_encoding(dataframe, column_name):
    # Define the categories for both conditions
    categories_condition_1 = {'Non PP': 0, 'Politik & Pemerintahan': 1}
    categories_condition_2 = {'Non Kesehatan': 0, 'Kesehatan': 1}
    
    # Determine the unique values in the column to decide which categories to use
    unique_values = set(dataframe[column_name].unique())
    
    # Check the specific condition to select the appropriate categories
    if unique_values.intersection(categories_condition_1.keys()):
        categories = categories_condition_1
    else:
        categories = categories_condition_2
    
    # Apply the label encoding using the selected categories
    dataframe[column_name] = dataframe[column_name].map(categories)
    
    # Initialize and fit the label encoder
    label_encoder = LabelEncoder()
    dataframe[column_name] = label_encoder.fit_transform(dataframe[column_name])
    
    return dataframe


In [19]:
df_titlecombine = label_encoding(df_titlecombine, 'Klasifikasi Berita')

In [20]:
df_titlecombine['Klasifikasi Berita'].value_counts()

1    5448
0    5447
Name: Klasifikasi Berita, dtype: int64

In [34]:
def vectorizer_data_ML(feature_var, target_var, train_output_path, test_output_path):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(feature_var, target_var, test_size=0.2, random_state=42)
    
    # Print sizes of train and test sets
    print(f"Train set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    
    # Initialize the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(max_features=1000)
    
    # Fit and transform the train set, transform the test set
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    # Convert the results to dense arrays
    X_train_tfidf_dense = X_train_tfidf.toarray()
    X_test_tfidf_dense = X_test_tfidf.toarray()
    
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    
    # Function to create a sorted DataFrame of TF-IDF scores
    def create_sorted_tfidf_df(tfidf_dense, feature_names):
        tfidf_scores = np.sum(tfidf_dense, axis=0)
        df_tfidf = pd.DataFrame({'kata': feature_names, 'skor': tfidf_scores})
        df_tfidf_sorted = df_tfidf.sort_values(by='skor', ascending=False).reset_index(drop=True)
        return df_tfidf_sorted
    
    # Create sorted DataFrames for train and test sets
    df_train_tfidf_sorted = create_sorted_tfidf_df(X_train_tfidf_dense, feature_names)
    df_test_tfidf_sorted = create_sorted_tfidf_df(X_test_tfidf_dense, feature_names)
    
    # Save DataFrames to CSV files
    df_train_tfidf_sorted.to_csv(train_output_path, index=False)
    df_test_tfidf_sorted.to_csv(test_output_path, index=False)
    
    # Print a confirmation message
    print(f"TF-IDF results saved to {train_output_path} and {test_output_path}")

    return X_train_tfidf, X_test_tfidf, X_train, X_test, y_train, y_test, vectorizer

In [35]:
X_train_tfidf, X_test_tfidf, X_train, X_test, y_train, y_test, vectorizer = vectorizer_data_ML(df_titlecombine['Title'],
                                        df_titlecombine['Klasifikasi Berita'], "train_PP_tfidf.csv", "test_PP_tfidf.csv")

Train set size: 8716
Test set size: 2179
TF-IDF results saved to train_PP_tfidf.csv and test_PP_tfidf.csv


### Train Model dengan data yang ada

In [72]:
def modelling_process(X_train_tfidf, y_train, name_model):
    # Define the models and their parameter grids
    models = {
        'Logistic Regression': {
            'model': LogisticRegression(),
            'params': {
                'C': [0.1, 1, 10]
            }
        },
        'Support Vector Machine': {
            'model': SVC(kernel='linear'),
            'params': {
                'C': [0.1, 1, 10]
            }
        }
    }

    best_model = None
    best_score = 0

    for name, config in models.items():
        print(f"Evaluating {name}...")
        grid_search = GridSearchCV(config['model'], config['params'], cv=5, scoring='accuracy')
        grid_search.fit(X_train_tfidf, y_train)
        
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        
        print(f"Best parameters for {name}: {best_params}")
        print(f"{name} Accuracy: {accuracy}")
        print(classification_report(y_test, y_pred))
        print('\n')

        if accuracy > best_score:
            best_score = accuracy
            best_model = best_model

    print(f"Best Model: {best_model}")
    print(f"Best Accuracy: {best_score}")

    # Save the best model to a file
    pickle.dump(best_model, open(f'{name_model}.pkl', 'wb'))
    print(f"Best model saved as '{name_model}.pkl'")

In [37]:
modelling_process(X_train_tfidf, y_train, 'model_classification_politik')

Evaluating Logistic Regression...
Best parameters for Logistic Regression: {'C': 10}
Logistic Regression Accuracy: 0.9807251032583754
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1057
           1       0.98      0.98      0.98      1122

    accuracy                           0.98      2179
   macro avg       0.98      0.98      0.98      2179
weighted avg       0.98      0.98      0.98      2179



Evaluating Support Vector Machine...
Best parameters for Support Vector Machine: {'C': 1}
Support Vector Machine Accuracy: 0.9788893988067922
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1057
           1       0.98      0.98      0.98      1122

    accuracy                           0.98      2179
   macro avg       0.98      0.98      0.98      2179
weighted avg       0.98      0.98      0.98      2179



Best Model: SVC(C=1, kernel='linear')
Best Accuracy: 0.98072510325

In [73]:
def save_model(vectorizer, name_vectorizer):
    pickle.dump(vectorizer, open(f"{name_vectorizer}.pkl", "wb"))

In [39]:
save_model(vectorizer, 'vectorizer_politik')

### Cleaning Test Data

In [8]:
# df_testdata = pd.read_csv(r'../tbhoax_newdata.csv')

# df_titletest = df_testdata[['title']]
# df_titletest['title'] = df_titletest['title'].str.replace(r'\[.*?\] ', '', regex=True)

# df_titletest['title'] = df_titletest['title'].astype(str)
# df_titletest['title'] = df_titletest['title'].apply(clean_text)
# df_titletest.to_csv('endresult_judul.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].str.replace(r'\[.*?\] ', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].apply(clean_te

In [71]:
def load_model(filename_vectorizer, filename_model):
    loaded_vectorizer = pickle.load(open(filename_vectorizer, "rb"))
    loaded_model = pickle.load(open(filename_model, "rb"))

    return loaded_vectorizer, loaded_model

def result_test_data(name_file_vectorizer: str, name_file_model: str, name_label_0: str, name_label_1: str, name_column_predict: str,
                     name_file_csv: str):
    df_testdata = pd.read_csv(r'../tbhoax_newdata.csv')

    df_titletest = df_testdata[['title']]
    df_titletest['title'] = df_titletest['title'].str.replace(r'\[.*?\] ', '', regex=True)

    df_titletest['title'] = df_titletest['title'].astype(str)
    df_titletest['title'] = df_titletest['title'].apply(clean_text)

    loaded_vectorizer, loaded_model = load_model(name_file_vectorizer, name_file_model)

    transformedtest_data = loaded_vectorizer.transform(df_titletest['title'])

    test_predictions = loaded_model.predict(transformedtest_data)
    print(test_predictions)

    label_map_reverse = {1: name_label_1, 0: name_label_0}
    predictions_labels = [label_map_reverse[pred] for pred in test_predictions]

    df_testdata[name_column_predict] = predictions_labels

    print(df_testdata[name_column_predict].value_counts())
    df_testdata = df_testdata[df_testdata[name_column_predict] == name_label_1]
    print(df_testdata.shape)

    return df_testdata.to_csv(name_file_csv, index=False)


In [41]:
result_test_data('vectorizer_politik.pkl', 'model_classification_politik.pkl', 'Non PP', 'Politik & Pemerintahan', 'Predict_PP', 'filteredPP.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].str.replace(r'\[.*?\] ', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].apply(clean_te

[0 0 1 ... 0 1 1]
Politik & Pemerintahan    9851
Non PP                    2868
Name: Predict_PP, dtype: int64
(9851, 10)


#### Sama, tapi untuk topik kesehatan

In [54]:
df_detik_kesehatan = pd.read_csv(r'../detik_kesehatan.csv')
df_detik_kesehatan = df_detik_kesehatan.drop(columns=['Link', 'Time', 'Narasi'])
df_detik_kesehatan.head()

Unnamed: 0,Title,Klasifikasi Berita
0,7 Manfaat Brem Madiun Bagi Kesehatan Tubuh,Non-Hoax
1,Tata Cara dan Syarat Pindah Faskes BPJS Kesehatan,Non-Hoax
2,"7 Manfaat Ikan Patin untuk Kesehatan, Salah Sa...",Non-Hoax
3,Berapa Kali BPJS Kesehatan Dapat Digunakan dal...,Non-Hoax
4,3 Warga yang Pakai Air dari Toren Berisi Mayat...,Non-Hoax


In [55]:
df_detik_kesehatan['Klasifikasi Berita'] = 'Kesehatan'
df_detik_kesehatan.head()

Unnamed: 0,Title,Klasifikasi Berita
0,7 Manfaat Brem Madiun Bagi Kesehatan Tubuh,Kesehatan
1,Tata Cara dan Syarat Pindah Faskes BPJS Kesehatan,Kesehatan
2,"7 Manfaat Ikan Patin untuk Kesehatan, Salah Sa...",Kesehatan
3,Berapa Kali BPJS Kesehatan Dapat Digunakan dal...,Kesehatan
4,3 Warga yang Pakai Air dari Toren Berisi Mayat...,Kesehatan


In [56]:
df_detik_kesehatan.shape

(6135, 2)

In [57]:
df_detik_kesehatan = df_detik_kesehatan.drop_duplicates()

In [58]:
df_detik_kesehatan.shape

(5447, 2)

In [59]:
df_politik_non_kesehatan = df_politik
print(df_politik_non_kesehatan.shape)

df_pemerintahan_non_kesehatan = df_pemerintahan
print(df_pemerintahan_non_kesehatan.shape)

(2766, 5)
(2682, 5)


In [60]:
df_nonkesehatan = pd.concat([df_pemerintahan_non_kesehatan, df_politik_non_kesehatan])
df_nonkesehatan['Klasifikasi Berita'] = 'Non Kesehatan'

In [61]:
df_nonkesehatan = df_nonkesehatan.drop(columns=['Link', 'Time', 'Narasi'])

In [62]:
df_nonkesehatan.head()

Unnamed: 0,Title,Klasifikasi Berita
2570,"Di Sidang MK, Menko PMK Jelaskan Bansos Tak Di...",Non Kesehatan
3139,Sandiaga Bahagia AHY Jadi Menteri di Kabinet J...,Non Kesehatan
7139,"Rakyat Puas, PDI Perjuangan Terpilih Teratas",Non Kesehatan
3787,"Sempat Diprediksi Turun, Okupansi Hotel di Med...",Non Kesehatan
7788,"Putra Mahkota Saudi-Presiden Iran Teleponan, B...",Non Kesehatan


In [63]:
df_combine = pd.concat([df_detik_kesehatan, df_nonkesehatan], ignore_index=True)
df_combine['Title'] = df_combine['Title'].astype(str)
df_combine.head()

Unnamed: 0,Title,Klasifikasi Berita
0,7 Manfaat Brem Madiun Bagi Kesehatan Tubuh,Kesehatan
1,Tata Cara dan Syarat Pindah Faskes BPJS Kesehatan,Kesehatan
2,"7 Manfaat Ikan Patin untuk Kesehatan, Salah Sa...",Kesehatan
3,Berapa Kali BPJS Kesehatan Dapat Digunakan dal...,Kesehatan
4,3 Warga yang Pakai Air dari Toren Berisi Mayat...,Kesehatan


In [64]:
df_combine['Klasifikasi Berita'].value_counts()

Non Kesehatan    5448
Kesehatan        5447
Name: Klasifikasi Berita, dtype: int64

In [65]:
df_combine = label_encoding(df_combine, 'Klasifikasi Berita')
df_combine.head()

Unnamed: 0,Title,Klasifikasi Berita
0,7 Manfaat Brem Madiun Bagi Kesehatan Tubuh,1
1,Tata Cara dan Syarat Pindah Faskes BPJS Kesehatan,1
2,"7 Manfaat Ikan Patin untuk Kesehatan, Salah Sa...",1
3,Berapa Kali BPJS Kesehatan Dapat Digunakan dal...,1
4,3 Warga yang Pakai Air dari Toren Berisi Mayat...,1


In [66]:
df_combine['Klasifikasi Berita'].value_counts()

0    5448
1    5447
Name: Klasifikasi Berita, dtype: int64

In [67]:
df_combine['Title'] = df_combine['Title'].astype(str)
df_combine['Title'] = df_combine['Title'].apply(clean_text)

In [74]:
X_train_tfidf, X_test_tfidf, X_train, X_test, y_train, y_test, vectorizer = vectorizer_data_ML(df_combine['Title'], df_combine['Klasifikasi Berita'], "train_Kesehatan_tfidf.csv", "test_Kesehatan_tfidf.csv")

Train set size: 8716
Test set size: 2179
TF-IDF results saved to train_Kesehatan_tfidf.csv and test_Kesehatan_tfidf.csv


In [75]:
modelling_process(X_train_tfidf, y_train, 'model_classification_kesehatan')

Evaluating Logistic Regression...
Best parameters for Logistic Regression: {'C': 1}
Logistic Regression Accuracy: 0.9784304726938963
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1057
           1       0.98      0.98      0.98      1122

    accuracy                           0.98      2179
   macro avg       0.98      0.98      0.98      2179
weighted avg       0.98      0.98      0.98      2179



Evaluating Support Vector Machine...
Best parameters for Support Vector Machine: {'C': 1}
Support Vector Machine Accuracy: 0.9775126204681046
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1057
           1       0.98      0.98      0.98      1122

    accuracy                           0.98      2179
   macro avg       0.98      0.98      0.98      2179
weighted avg       0.98      0.98      0.98      2179



Best Model: SVC(C=1, kernel='linear')
Best Accuracy: 0.978430472693

In [76]:
save_model(vectorizer, 'vectorizer_kesehatan')

In [77]:
result_test_data('vectorizer_kesehatan.pkl', 'model_classification_kesehatan.pkl', 'Non Kesehatan', 'Kesehatan', 'Predict_Kesehatan', 'filteredKesehatan.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].str.replace(r'\[.*?\] ', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titletest['title'] = df_titletest['title'].apply(clean_te

[1 1 1 ... 1 0 0]
Non Kesehatan    9887
Kesehatan        2832
Name: Predict_Kesehatan, dtype: int64
(2832, 10)
