In [4]:
import unicodedata
import re
import string
import nltk
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score


nltk.download('punkt')


data_fake = pd.read_csv('Datasets/fake.csv')
data_true = pd.read_csv('Datasets/real.csv')

# Merge data
data_merge = pd.concat([data_fake, data_true], axis=0).sample(frac=1).reset_index(drop=True)

# print(data_merge)

      label                                            article
0         0  Nagdagdag si Klay Thompson ng 29 puntos para s...
1         1  Naging usap-usapan sa social media nitong mga ...
2         0  Sa kabila nang maraming detalye hinggil sa kas...
3         0  Sa ika-9 na taon ng film festival, katuwang ip...
4         1  HINDI NGAYON HADLANG ANG HINDI PAG BALITA SA M...
...     ...                                                ...
3201      1  Naglabas ng pahayag ang opisina ni Presidentia...
3202      0  Ito ang reaksiyon ni "First Take" host Skip Ba...
3203      0  Maaaring nagkataon lamang ito, ngunit ang hala...
3204      1  Usap-usapan ngayon sa social media ang umano'y...
3205      0  Ang proyekto ay dinaluhan ng mga opisyal at mi...

[3206 rows x 2 columns]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.preprocess(text) for text in X]

    def preprocess(self, text):
        # print(textes)
        text = str(text)
        text = text.lower()
        normalized = unicodedata.normalize("NFD", text)
        text = "".join(c for c in normalized if unicodedata.category(c) != "Mn")
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r"\\W", " ", text)
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'<.*?>+', '', text)
        text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\w*\d\w*', '', text)
        return text


# Split data into features (X) and target variable (y)
X = data_merge['article']
y = data_merge['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

stopwords = pd.read_json('Datasets/stopwords-tl.json')
custom_stop_words = stopwords[0].values.tolist()


mnb = MultinomialNB()
lr = LogisticRegression()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
svm = SVC(probability=True)

base_models = [mnb, lr, rf, knn, svm]
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']


# Define the XGBClassifier as meta learner
xgb_model = xgb.XGBClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Define the steps in your pipeline
steps = [
    ('preprocess', TextPreprocessor()),
    ('vectorizer',  TfidfVectorizer(stop_words=custom_stop_words)),
    ('gridsearch', GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1))

]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)


# Now you can inspect the preprocessed data
# print("Preprocessed data:")
# print(X_train_preprocessed[0])

# Evaluate the pipeline on the test data
# accuracy = pipeline.score(X_test, y_test)
# print(f"Accuracy: {accuracy}")

In [7]:
from joblib import dump

In [8]:
dump(pipeline.named_steps['gridsearch'].best_estimator_, 'Models/best_estimator.joblib')

# additional_step = [
#     ('stacking', )
# ]

# pipeline.steps.append(('stacking', additional_step))

# pipeline.named_steps['gridsearch'].best_params_
pipeline.named_steps['gridsearch'].best_estimator_

In [9]:

steps = [
    ('preprocess', TextPreprocessor()),
    ('vectorizer',  TfidfVectorizer(stop_words=custom_stop_words)),
    ('stacking', StackingClassifier(estimators=list(zip(base_names, base_models)), final_estimator=pipeline.named_steps['gridsearch'].best_estimator_ ,cv=5))
    ]
estack = Pipeline(steps)

# Fit the pipeline on the training data
estack.fit(X_train, y_train)

In [45]:
# base_estimator_data_rf = pipeline.named_steps['stacking'].estimators_[0].named_steps['rf'].predict(X_train)
# base_estimator_data_svm = pipeline.named_steps['stacking'].estimators_[1].named_steps['svm'].predict(X_train)

# for step_name, step_object in pipeline.named_steps.items():
#     print(f"Step: {step_name}")
#     print(step_object)



In [15]:
from joblib import dump

stack_model_filename = 'pipeline_xgb_boost.joblib'


dump(pipeline, '/content/drive/MyDrive/Colab Notebooks/model/pipeline_xgb_boost.joblib')

['/content/drive/MyDrive/Colab Notebooks/model/pipeline_xgb_boost.joblib']

In [16]:
dump( pipeline.named_steps['gridsearch'].best_estimator_ , '/content/drive/MyDrive/Colab Notebooks/model/gridsearch_best_estimator.joblib')

['/content/drive/MyDrive/Colab Notebooks/model/gridsearch_best_estimator.joblib']

In [32]:
# Assuming 'pipeline' is your existing pipeline
print([func for func in dir(TextPreprocessor) if callable(getattr(TextPreprocessor, func)) and not func.startswith("__")])
# for step_name, step_object in pipeline.named_steps.items():
#     print(f"Step: {step_name}")
#     print("Methods and attributes:")
#     print([func for func in dir(step_object) if callable(getattr(step_object, func)) and not func.startswith("__")])
#     print("\n")\


['_check_feature_names', '_check_n_features', '_get_param_names', '_get_tags', '_more_tags', '_repr_html_inner', '_repr_mimebundle_', '_validate_data', '_validate_params', 'fit', 'fit_transform', 'get_params', 'preprocess', 'set_output', 'set_params', 'transform']


In [33]:
import copy

# Assuming 'pipeline' is your existing pipeline
copied_pipeline = copy.deepcopy(pipeline)

# TESTING

In [41]:
print("Steps before removal:")
print(pipeline.steps)

# Remove the step with name 'svm'
pipeline.steps = [step for step in pipeline.steps if step[0] != 'new_step']

# Print the steps after removal
print("\nSteps after removal:")
print(pipeline.steps)

Steps before removal:
[('preprocess', TextPreprocessor()), ('vectorizer', TfidfVectorizer(stop_words=['akin', 'aking', 'ako', 'alin', 'am', 'amin',
                            'aming', 'ang', 'ano', 'anumang', 'apat', 'at',
                            'atin', 'ating', 'ay', 'bababa', 'bago', 'bakit',
                            'bawat', 'bilang', 'dahil', 'dalawa', 'dapat',
                            'din', 'dito', 'doon', 'gagawin', 'gayunman',
                            'ginagawa', 'ginawa', ...])), ('gridsearch', GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                 

In [42]:
pipeline.predict("Nagbibigay ito ng kalayaan sa relihiyon, nagkakaloob ng posisyon sa gobyerno base sa merito at nagbabawal sa birth-based privilege. Hindi krimen sa code ang blasphemy, sabi-sabi at iba pang gawain na labag sa relihiyon. Gayunman, hindi nahinto ang pang-aalipin.")

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [46]:
copied_pipeline.predict(X_train)

AttributeError: 'list' object has no attribute 'predict'

In [55]:

pipeline.steps = [step for step in pipeline.steps if step[0] != 'stacking']
# Print the steps after removal
print("\nSteps after removal:")
print(pipeline.steps)





Steps after removal:
[('preprocess', TextPreprocessor()), ('vectorizer', TfidfVectorizer(stop_words=['akin', 'aking', 'ako', 'alin', 'am', 'amin',
                            'aming', 'ang', 'ano', 'anumang', 'apat', 'at',
                            'atin', 'ating', 'ay', 'bababa', 'bago', 'bakit',
                            'bawat', 'bilang', 'dahil', 'dalawa', 'dapat',
                            'din', 'dito', 'doon', 'gagawin', 'gayunman',
                            'ginagawa', 'ginawa', ...])), ('gridsearch', GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                 

In [56]:
dump(pipeline, '/content/drive/MyDrive/Colab Notebooks/model/pipeline_preprocess.joblib')

['/content/drive/MyDrive/Colab Notebooks/model/pipeline_preprocess.joblib']