In [None]:
import unicodedata
import re
import string
import nltk
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score


# Download NLTK's 'punkt' resource (network access on first run).
# NOTE(review): nothing in the visible cells uses nltk -- confirm it is still needed.
nltk.download('punkt')


# Load the two source corpora; each CSV carries its own `label` column.
data_fake = pd.read_csv('Datasets/fake.csv')
data_true = pd.read_csv('Datasets/real.csv')

# Merge both corpora and shuffle the rows.  The shuffle is seeded so that a
# "Restart & Run All" reproduces the same row order -- otherwise the seeded
# train/test split further down would still pick different articles each run.
data_merge = (
    pd.concat([data_fake, data_true], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [47]:
# Plain-text preview of the fake-news frame followed by the merged frame.
print(data_fake, data_merge, sep='\n')

      label                                            article
0         0  Ayon sa TheWrap.com, naghain ng kaso si Krupa,...
1         0  Kilala rin ang singer sa pagkumpas ng kanyang ...
2         0  BLANTYRE, Malawi (AP) -- Bumiyahe patungong Ma...
3         0  Kasama sa programa ang pananalangin, bulaklak ...
4         0  Linisin ang Friendship Department dahil dadala...
...     ...                                                ...
1598      0  HINAMON ng isang kongresista si Sen. Panfilo L...
1599      0  INIHAYAG ni dating Presidential Spokesperson H...
1600      0  INIREKLAMO sa Office of the Ombudsman ang isan...
1601      0  ISINAPUBLIKO ng Quezon City Police District (Q...
1602      0  NAIS humingi ng kasiguruhan ng isang solon sa ...

[1603 rows x 2 columns]
      label                                            article
0         0  Nagdagdag si Klay Thompson ng 29 puntos para s...
1         1  Naging usap-usapan sa social media nitong mga ...
2         0  Sa kabila nang ma

In [6]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw article text.

    Intended as the first step of a text pipeline: it receives an iterable of
    documents and returns a list of cleaned strings for the vectorizer.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for pipeline use."""
        return self

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : iterable of documents, a single ``str``, or a one-column DataFrame
            Each element is converted with ``str()`` before cleaning.

        Returns
        -------
        list of str
            One cleaned string per input document, in input order.

        Raises
        ------
        ValueError
            If ``X`` is a DataFrame with more than one column.
        """
        if isinstance(X, str):
            # Iterating a bare string would clean it character by character.
            X = [X]
        elif isinstance(X, pd.DataFrame):
            # Iterating a DataFrame yields its column labels, not its rows, so
            # the documents have to be pulled out of the (single) column.
            if X.shape[1] != 1:
                raise ValueError(
                    "TextPreprocessor expects one text column, got %d columns" % X.shape[1]
                )
            X = X.iloc[:, 0]
        return [self.preprocess(text) for text in X]

    def preprocess(self, text):
        """Return a lower-cased, accent-free, punctuation-free copy of ``text``.

        Steps, in order: lower-case; strip combining accents; drop ``[...]``
        spans, URLs and ``<...>`` tags; turn every run of non-word characters
        (punctuation, newlines, underscores) into a single space; drop tokens
        containing a digit; collapse whitespace.
        """
        text = str(text).lower()

        # Decompose accented letters and discard the combining marks (category Mn).
        decomposed = unicodedata.normalize("NFD", text)
        text = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

        # Structured noise must go while its delimiters are still present.
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'<.*?>+', '', text)

        # The previous pattern r"\\W" matched a literal backslash followed by
        # "W", so it never fired; punctuation and newlines were then deleted
        # outright, gluing neighbouring words together.  Replace separators
        # with a space instead so word boundaries survive.
        text = re.sub(r'[\W_]+', ' ', text)

        # Remove any token that contains a digit.
        text = re.sub(r'\w*\d\w*', '', text)

        # Collapse the gaps left behind by the removals above.
        return " ".join(text.split())


# Split data into features (X) and target variable (y)
X = data_merge['article']
y = data_merge['label']

# One seed for every stochastic component below, so that re-running the
# notebook refits the same models.
RANDOM_STATE = 42

# Split the data into training and testing sets (80 / 20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Stop-word list handed to the TF-IDF vectorizer; column 0 of the JSON file
# holds the words.
stopwords = pd.read_json('Datasets/stopwords-tl.json')
custom_stop_words = stopwords[0].values.tolist()


# Base learners (consumed by the stacking cell further down).
mnb = MultinomialNB()
lr = LogisticRegression()
# Random forests draw bootstrap samples / feature subsets at random.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
knn = KNeighborsClassifier()
# probability=True fits an internal cross-validated calibration that shuffles
# the data, so it needs a seed as well.
svm = SVC(probability=True, random_state=RANDOM_STATE)

base_models = [mnb, lr, rf, knn, svm]
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']


# Define the XGBClassifier as meta learner
xgb_model = xgb.XGBClassifier(random_state=RANDOM_STATE)

# Define the parameter grid (3**5 = 243 candidates, each fitted on 5 folds)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Define the steps in the pipeline: clean text -> TF-IDF -> grid-searched XGBoost.
# The grid search is the final estimator, so it tunes XGBoost on the TF-IDF
# matrix produced from the whole training set.
steps = [
    ('preprocess', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(stop_words=custom_stop_words)),
    ('gridsearch', GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)),
]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)


# Evaluate the pipeline on the test data
# accuracy = pipeline.score(X_test, y_test)
# print(f"Accuracy: {accuracy}")

In [7]:
from joblib import dump

In [8]:
# Persist the XGBoost model that won the grid search.
dump(pipeline['gridsearch'].best_estimator_, 'Models/best_estimator.joblib')

# Show the winning estimator (and its hyper-parameters) as the cell output.
pipeline['gridsearch'].best_estimator_

In [72]:

# Stacked ensemble: the five base learners feed the grid-searched XGBoost
# model, which acts as the final (meta) estimator.
# NOTE(review): the meta estimator's hyper-parameters were selected on TF-IDF
# features, whereas here it is fitted on base-model predictions -- confirm
# this reuse is intended.
steps = [
    ('preprocess', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer(stop_words=custom_stop_words)),
    (
        'stacking',
        StackingClassifier(
            estimators=[(name, model) for name, model in zip(base_names, base_models)],
            final_estimator=pipeline.named_steps['gridsearch'].best_estimator_,
            cv=5,
        ),
    ),
]
estack = Pipeline(steps=steps)

# Fit the pipeline on the training data
estack.fit(X_train, y_train)

In [73]:
# Serialise the fitted stacking pipeline.  The pipeline contains a
# TextPreprocessor instance, so whoever loads this file needs that class to be
# importable under the same name.
dump(value=estack, filename='Models/stack_xgboost.joblib')

['Models/stack_xgboost.joblib']

In [45]:
# base_estimator_data_rf = pipeline.named_steps['stacking'].estimators_[0].named_steps['rf'].predict(X_train)
# base_estimator_data_svm = pipeline.named_steps['stacking'].estimators_[1].named_steps['svm'].predict(X_train)

# for step_name, step_object in pipeline.named_steps.items():
#     print(f"Step: {step_name}")
#     print(step_object)

In [32]:
# Assuming 'pipeline' is your existing pipeline
# print([func for func in dir(TextPreprocessor) if callable(getattr(TextPreprocessor, func)) and not func.startswith("__")])
# for step_name, step_object in pipeline.named_steps.items():
#     print(f"Step: {step_name}")
#     print("Methods and attributes:")
#     print([func for func in dir(step_object) if callable(getattr(step_object, func)) and not func.startswith("__")])
#     print("\n")\


['_check_feature_names', '_check_n_features', '_get_param_names', '_get_tags', '_more_tags', '_repr_html_inner', '_repr_mimebundle_', '_validate_data', '_validate_params', 'fit', 'fit_transform', 'get_params', 'preprocess', 'set_output', 'set_params', 'transform']


# TESTING

In [None]:
# print("Steps before removal:")
# print(pipeline.steps)

# # Remove the step named 'new_step'
# pipeline.steps = [step for step in pipeline.steps if step[0] != 'new_step']

# # Print the steps after removal
# print("\nSteps after removal:")
# print(pipeline.steps)

In [84]:


# A single hand-picked article used as an ad-hoc prediction input.
toPredict = ["Kahit na laglag ang mga kandidato ng Otso Diretso sa Halalan 2019, inudyukan ni Vice President Leni Robredo ang publiko na bantayan ang mga natitirang mga boto na kailangan pang bilangin. May laban pa tayong hinaharap. Hindi pa tapos ang bilangan, at kailangan pang bantayan, sabi ni Robredo. Sa mga kandidato ng oposisyon, si Senator Bam Aquino lang ang pinakamalapit sa Top 12. As of 11:26 am, nasa ika-14 na puwesto si Bam. Sumunod ay si Roxas na nasa ika-16 na puwesto. Ang ibang pambato ng Otso Diretso naman ay nasa labas na ng top 20. Para kay Robredo, panahon daw para magkapit-bisig at huwag bumitaw sa mga nalalabing oras ng bilangan. Ipinamalas ng ating mga kandidato at volunteers ang tapang at dedikasyon sa ating adhikain. Huwag natin ito bitawan. Paghugutan natin ng lakas ang bawat isa, dagdag pa ni Robredo. Basahin ang reaksyon ng ilang kababayan natin sa mga pinagsasabi ni Robredo. Instead of making that your battle cry, why not chanel it towards something positive, Leni.. drop the negative word fight, and inspire the people by using positive words like - lets HELP the government by being good abiding citizens etc etc.. something like that sana!!! Kaya kayo lumulubog eh! The people do NOT see and feel your sincerity to truly help the country. Hayyy aling Leni, sabi ni A. Gayanelo. Fight??? Gusto mo talaga laging kaguluhan madam. We need a happy life and peace of mind and peace country thats all we ned madam, sabi ni M. Dizon. Fight for what? Mas maganda pow yata vice Pres. eh kung help the government para sa better future., sabi ni A. Luna. Source: ABS-CBN | FB"]

# Wrap the list in a one-row, one-column frame; the text lives in column 0.
toP = pd.DataFrame(data=toPredict)


In [90]:
# Classify the sample article.
#
# The text column (toP[0]) is passed rather than the DataFrame itself: the
# pipeline's first step, TextPreprocessor.transform, simply iterates over its
# input, and iterating a DataFrame yields its column labels (here the integer
# 0) instead of its rows.  `estack.predict(toP)` therefore scored the string
# "0" -- which is empty after preprocessing -- and never saw the article.
y_pred = estack.predict(toP[0])

# One predicted label per input document.
for x in y_pred:
    print(x)

0


Kahit na laglag ang mga kandidato ng Otso Diretso sa Halalan 2019, inudyukan ni Vice President Leni Robredo ang publiko na bantayan ang mga natitirang mga boto na kailangan pang bilangin. "May laban pa tayong hinaharap. Hindi pa tapos ang bilangan, at kailangan pang bantayan," sabi ni Robredo. Sa mga kandidato ng oposisyon, si Senator Bam Aquino lang ang pinakamalapit sa Top 12. As of 11:26 am, nasa ika-14 na puwesto si Bam. Sumunod ay si Roxas na nasa ika-16 na puwesto. Ang ibang pambato ng Otso Diretso naman ay nasa labas na ng top 20. Para kay Robredo, panahon daw para magkapit-bisig at huwag bumitaw sa mga nalalabing oras ng bilangan. "Ipinamalas ng ating mga kandidato at volunteers ang tapang at dedikasyon sa ating adhikain. Huwag natin ito bitawan. Paghugutan natin ng lakas ang bawat isa," dagdag pa ni Robredo. Basahin ang reaksyon ng ilang kababayan natin sa mga pinagsasabi ni Robredo. "Instead of making that your battle cry, why not chanel it towards something positive, Leni.. 