In [1]:
import unicodedata
import re
import string
import nltk
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score


nltk.download('punkt')


data_fake = pd.read_csv('Datasets/fake.csv')
data_true = pd.read_csv('Datasets/real.csv')

# Merge data
data_merge = pd.concat([data_fake, data_true], axis=0).sample(frac=1).reset_index(drop=True)

#

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
# print(data_fake)
# print(data_merge)

In [2]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.preprocess(text) for text in X]

    def preprocess(self, text):
        # print(textes)
        text = str(text)
        text = text.lower()
        normalized = unicodedata.normalize("NFD", text)
        text = "".join(c for c in normalized if unicodedata.category(c) != "Mn")
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r"\\W", " ", text)
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'<.*?>+', '', text)
        text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\w*\d\w*', '', text)
        return text


# Split data into features (X) and target variable (y)
X = data_merge['article']
y = data_merge['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

stopwords = pd.read_json('Datasets/stopwords-tl.json')
custom_stop_words = stopwords[0].values.tolist()


mnb = MultinomialNB()
lr = LogisticRegression()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
svm = SVC(probability=True)

base_models = [mnb, lr, rf, knn, svm]
base_names = ['MNB', 'LR', 'RF', 'KNN', 'SVM']


# Define the XGBClassifier as meta learner
xgb_model = xgb.XGBClassifier()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

steps = [
    ('preprocess', TextPreprocessor()),
    ('vectorizer',  TfidfVectorizer(stop_words=custom_stop_words))
]

# Create the pipeline
prepros = Pipeline(steps)

from joblib import dump, load
dump(prepros, 'Models/vectorizer.joblib')

['Models/vectorizer.joblib']

SKIP this if you already have Best_estimator

In [69]:

# Define the steps in your pipeline
steps = [
    ('preprocess', TextPreprocessor()),
    ('vectorizer',  TfidfVectorizer(stop_words=custom_stop_words)),
    ('gridsearch', GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1))

]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)


# Now you can inspect the preprocessed data
# print("Preprocessed data:")
# print(X_train_preprocessed[0])

# Evaluate the pipeline on the test data
# accuracy = pipeline.score(X_test, y_test)
# print(f"Accuracy: {accuracy}")

In [71]:
dump(pipeline.named_steps['gridsearch'].best_estimator_, 'Models/best_estimator.joblib')

# additional_step = [
#     ('stacking', )
# ]

# pipeline.steps.append(('stacking', additional_step))

# pipeline.named_steps['gridsearch'].best_params_
pipeline.named_steps['gridsearch'].best_estimator_

In [4]:
pipeL =  load('Models/best_estimator.joblib')
steps = [
    ('preprocess', TextPreprocessor()),
    ('vectorizer',  TfidfVectorizer(stop_words=custom_stop_words)),
    ('stacking', StackingClassifier(estimators=list(zip(base_names, base_models)), final_estimator=pipeL ,cv=5))
    ]
estack = Pipeline(steps)

# Fit the pipeline on the training data
estack.fit(X_train, y_train)

KeyboardInterrupt: 

In [73]:
dump(estack, 'Models/stack_xgboost.joblib' )

['Models/stack_xgboost.joblib']

In [74]:
# base_estimator_data_rf = pipeline.named_steps['stacking'].estimators_[0].named_steps['rf'].predict(X_train)
# base_estimator_data_svm = pipeline.named_steps['stacking'].estimators_[1].named_steps['svm'].predict(X_train)


# TESTING

In [119]:
# PRINT ALL THE STEPS
# # Print the steps after removal
# print("\nSteps after removal:")
# print(estack.steps)

[('preprocess', TextPreprocessor()), ('vectorizer', TfidfVectorizer(stop_words=['akin', 'aking', 'ako', 'alin', 'am', 'amin',
                            'aming', 'ang', 'ano', 'anumang', 'apat', 'at',
                            'atin', 'ating', 'ay', 'bababa', 'bago', 'bakit',
                            'bawat', 'bilang', 'dahil', 'dalawa', 'dapat',
                            'din', 'dito', 'doon', 'gagawin', 'gayunman',
                            'ginagawa', 'ginawa', ...])), ('stacking', StackingClassifier(cv=5,
                   estimators=[('MNB', MultinomialNB()),
                               ('LR', LogisticRegression()),
                               ('RF', RandomForestClassifier()),
                               ('KNN', KNeighborsClassifier()),
                               ('SVM', SVC(probability=True))],
                   final_estimator=XGBClassifier(base_score=None, booster=None,
                                                 callbacks=None,
                   

In [5]:
Usestack = load('Models/stack_xgboost.joblib')

# toPredict = X_test[3049]
# toP = pd.DataFrame(toPredict)


In [21]:
# toPredict = ["Pinag-uusapan ngayon ng mga netizens ang ulat kamakailan lang ng ABS-CBN patungkol sa pagdami umano ng mga Pilipinong walang trabaho. Ayon sa artikulong may pamagat na Mga Pinoy na walang trabaho, umabot sa 9.8 milyon, milyon-milyon na daw ang walang hanap-buhay na kababayan natin kung pagbabasehan ang pinakahuling resulta ng survey ng Social Weather Stations (SWS). Tanong nga mga kababayan natyn, paano nalaman at umabot sa 9.8 million ang resulta ng survey kung sa mismong artikulo ay sinabing 1,500 lang ang tinanong. Idinaos ang SWS survey mula Setyembre 15 hanggang 23 sa pamamagitan ng mga face-to-face interview sa 1,500 Pinoy na may edad 18 pataas. Mayroon din itong +/-6 percent margin of error para sa national percentages, sabi sa ulat ng ABS-CBN. Hindi tuloy maiwasan ng mga kababayan natin na kuwestiyonin ang nasabing ulat at ang resulta ng survey. Source: ABS-CBN | ThinkingMinds"]
toPredict= X_test[2290].split(' ',0)
# y_pred1 = estack.predict(toPredict) 

# 
# print(y_pred1)

# df = toP.iloc[1:]

# vectorized = estack.named_steps['preprocess'].transform(toPredict)
# vectorized.shape
# print(vectorized)

# y_pred = estack.predict(toPredict)
y_pred = Usestack.predict_proba(toPredict) 
y_pred2 = Usestack.predict(toPredict)
print(y_pred)
print(y_pred2)

# for x in y_pred:
#     print(x)

# X_train, X_test, y_train, y_test
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)

# print('Accuracy: %.2f' % accuracy)
# print('Precision: %.2f' % precision)
# print('Recall: %.2f' % recall)
# print('F1 Score: %.2f' % f1)

[[9.998374e-01 1.625810e-04]]
[0]


In [86]:
# print(y_pred2)

[0 0 0 ... 0 0 0]


In [112]:
pd.set_option('display.max_columns', None)
print(y_test)
# for x in y_test:
#     print(x)

610     0
3049    0
2481    0
1918    0
2884    0
       ..
2924    1
2290    1
1068    0
420     1
1602    0
Name: label, Length: 642, dtype: int64
