In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from scipy.optimize import minimize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer # Stemming
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import classification_report, mean_squared_error as mse, r2_score as r2
from tl_stopwords import STOP_WORDS as tl_stopwords
from tagalog_stemmer import stemmer as tl_stemmer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
# Fetch the NLTK resources used later by word_tokenize and stopwords.words.
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' package; download both so word_tokenize works
# regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Preprocessing

In [None]:
# Read the article dataset from the CSV file and preview its first rows.
fake_df = pd.read_csv(filepath_or_buffer="full.csv")
fake_df.head(n=5)

In [None]:
# Data-quality snapshot before cleaning, printed one figure per line:
# total missing cells via isna, the same total via isnull, then the number
# of fully duplicated rows.
quality_counts = (
    fake_df.isna().sum().sum(),
    fake_df.isnull().sum().sum(),
    fake_df.duplicated().sum(),
)
for count in quality_counts:
    print(count)

In [None]:
# Remove fully duplicated rows (first occurrence is kept); rebinding the name
# yields the same frame as the in-place variant.
fake_df = fake_df.drop_duplicates()

In [None]:
# Same three checks again after de-duplication: missing cells (isna),
# missing cells (isnull), and remaining duplicated rows.
remaining_issues = [
    fake_df.isna().sum().sum(),
    fake_df.isnull().sum().sum(),
    fake_df.duplicated().sum(),
]
for issue_count in remaining_issues:
    print(issue_count)

In [None]:
# Number of articles per class label.
fake_df["label"].value_counts()

In [None]:
# Share of each class label in the full dataset, as a pie chart with percentages.
label_counts = fake_df["label"].value_counts()
label_counts.plot.pie(autopct='%1.2f%%')

In [None]:
# Model input is the article text; the target is its label.
x_df, y_df = fake_df["article"], fake_df["label"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Class distribution of the training labels produced by the split above.
y_train.value_counts()

In [None]:
# Pie chart of the training-label distribution, annotated with percentages.
train_label_counts = y_train.value_counts()
train_label_counts.plot.pie(autopct='%1.2f%%')

In [None]:
#define function to preprocess text
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    '''
    Build the stop-word set and the stemmer on first use and reuse them afterwards.

    Reading the NLTK stop-word corpus and rebuilding the union set for every
    article is wasted work, so both objects are created once and cached.

    Output:
    (stop_words, stemmer) - union of NLTK's English stop words with tl_stopwords,
                            and a PorterStemmer instance.
    '''
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(
            stopwords.words('english')).union(tl_stopwords)
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    '''
    Function for text preprocessing.

    Steps: replace every run of non-word characters with a space, lowercase,
    tokenize, drop stop words (English + tl_stopwords), then apply the Porter
    stemmer to the remaining tokens. The Tagalog stemmer imported at the top of
    the notebook is not applied here.

    Input Parameters:
    text:str - string to process. Missing values (None/NaN) are treated as an
               empty document; other non-string values are converted with str().

    Output:
    stemmed:str - space-separated stemmed tokens from the original text.
    '''
    if not isinstance(text, str):
        # NaN is the only value that is not equal to itself.
        text = "" if text is None or text != text else str(text)

    stop_words, stemmer = _preprocess_resources()

    cleaned = re.sub(r'\W+', ' ', text).lower() #removes punctuations
    tokenized = word_tokenize(cleaned) #tokenization of text

    #stop words removal and stemming in a single pass
    stemmed = " ".join(
        stemmer.stem(token) for token in tokenized if token not in stop_words)
    return stemmed

In [None]:
# Run every training article through preprocess_text, then show the resulting
# shape and the first few processed documents.
# Note: x_train is overwritten, so re-running this cell on its own would
# preprocess the already-processed text a second time.
x_train = x_train.map(preprocess_text)
print(x_train.shape)
x_train.head()

In [None]:
# Turn the preprocessed training articles into TF-IDF features.
# norm=None disables the per-document normalisation of the TF-IDF vectors.
# (CountVectorizer is imported as well and could be swapped in for raw counts.)
vectorizer = TfidfVectorizer(norm=None)
vectorizer.fit(x_train)

# Densify the matrix and label each column with its vocabulary term.
# x_train is overwritten here: from this point on it is the feature table,
# not the text Series.
x_values_list = vectorizer.transform(x_train).toarray()
feature_names = vectorizer.get_feature_names_out()
x_train = pd.DataFrame(x_values_list, columns=feature_names)

x_train.head()

# Model Selection

In [None]:
# Candidate classifiers, evaluated in this order by the comparison loop.
# Estimators with internal randomness are seeded (same seed as the train/test
# split) so that repeated runs of the notebook report the same scores.
models = [
    LogisticRegression(),
    MLPClassifier(random_state=1),
    KNeighborsClassifier(),
    SVC(),
    GaussianProcessClassifier(),
    QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(random_state=1),
    RandomForestClassifier(random_state=1),
    AdaBoostClassifier(random_state=1),
    GaussianNB()
]

In [None]:
# The models are fitted on stemmed TF-IDF features, so the held-out articles
# must go through the same text preprocessing and the *already fitted*
# vectorizer (transform, not fit_transform) before they can be scored.
# A separate name is used so x_test stays raw and the cell can be re-run.
x_test_features = pd.DataFrame(
    vectorizer.transform(x_test.apply(preprocess_text)).toarray(),
    columns=vectorizer.get_feature_names_out(),
)

for model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test_features)

    print('\n\n\n' + str(model))
    #show the results in a confusion matrix
    cf_matrix = confusion_matrix(y_test, y_pred)

    # One explicit figure per model so the heatmaps never share axes.
    fig, ax = plt.subplots()
    sns.heatmap(cf_matrix, annot=True, cmap='Blues', fmt='d', ax=ax)

    ax.set_title('Confusion Matrix')
    ax.set_xlabel('\nPredicted Category')
    ax.set_ylabel('Actual Category ')

    print(classification_report(y_test, y_pred))
    plt.show()
    plt.close(fig)  # release the figure before the next model

# Hyperparameter Tuning

### The grid search takes too long to run, so its results were left out

In [None]:
# Base logistic-regression estimator (default settings) to be tuned by the grid search.
log_model = LogisticRegression()

In [None]:
# Each LogisticRegression solver supports only some penalties, so the grid is
# split into one dict per compatible group instead of a full cross product
# (which schedules many fits that are rejected and scored as NaN).
# 'elasticnet' is only supported by 'saga' and needs an l1_ratio to be fitted.
# NOTE(review): newer scikit-learn releases spell the unpenalised option as
# None instead of the string 'none' - confirm against the installed version.
# 'C' (e.g. np.logspace(-4, 4, 5)) and 'max_iter' (e.g. [100, 1000, 2500, 5000])
# can be added to each dict to widen the search at the cost of run time.
param_grid = [
    {'penalty': ['l2', 'none'], 'solver': ['lbfgs', 'newton-cg', 'sag']},
    {'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    {'penalty': ['l1', 'l2', 'none'], 'solver': ['saga']},
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]},
]

# 2-fold cross-validated search over the grid, using all available cores.
clf = GridSearchCV(log_model, param_grid=param_grid, cv=2, verbose=True, n_jobs=-1)
best_clf = clf.fit(x_train, y_train)