<a href="https://colab.research.google.com/github/hrinotaf/LKP4_G6501231035_Harin_Noor_Octafiani/blob/main/Tugas_LKP4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
X = data[["text"]]
y = data[["label"]]

print(X)
print(y.head())

                                                    text
0      House Dem Aide: We Didn’t Even See Comey’s Let...
1      Ever get the feeling your life circles the rou...
2      Why the Truth Might Get You Fired October 29, ...
3      Videos 15 Civilians Killed In Single US Airstr...
4      Print \nAn Iranian woman has been sentenced to...
...                                                  ...
20795  Rapper T. I. unloaded on black celebrities who...
20796  When the Green Bay Packers lost to the Washing...
20797  The Macy’s of today grew from the union of sev...
20798  NATO, Russia To Hold Parallel Exercises In Bal...
20799    David Swanson is an author, activist, journa...

[20800 rows x 1 columns]
   label
0      1
1      0
2      1
3      1
4      1


In [None]:
X = X.astype(str)

In [None]:
X = X["text"].str.lower()

In [None]:
print(X.head())

0    house dem aide: we didn’t even see comey’s let...
1    ever get the feeling your life circles the rou...
2    why the truth might get you fired october 29, ...
3    videos 15 civilians killed in single us airstr...
4    print \nan iranian woman has been sentenced to...
Name: text, dtype: object


In [None]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

X = X.apply(lambda text: remove_punctuation(text))
X.head()

0    house dem aide we didn’t even see comey’s lett...
1    ever get the feeling your life circles the rou...
2    why the truth might get you fired october 29 2...
3    videos 15 civilians killed in single us airstr...
4    print \nan iranian woman has been sentenced to...
Name: text, dtype: object

In [None]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

X = X.apply(lambda text: remove_stopwords(text))
X.head()

0    house dem aide didn’t even see comey’s letter ...
1    ever get feeling life circles roundabout rathe...
2    truth might get fired october 29 2016 tension ...
3    videos 15 civilians killed single us airstrike...
4    print iranian woman sentenced six years prison...
Name: text, dtype: object

In [None]:
cnt = Counter()
for text in X.values:
    for word in text.split():
        cnt[word] += 1

cnt.most_common(10)

[('said', 79931),
 ('mr', 66051),
 ('”', 48164),
 ('—', 47096),
 ('trump', 43702),
 ('would', 37013),
 ('one', 36653),
 ('people', 33923),
 ('new', 29660),
 ('like', 25694)]

In [None]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

X = X.apply(lambda text: remove_freqwords(text))
X.head()

0    house dem aide didn’t even see comey’s letter ...
1    ever get feeling life circles roundabout rathe...
2    truth might get fired october 29 2016 tension ...
3    videos 15 civilians killed single us airstrike...
4    print iranian woman sentenced six years prison...
Name: text, dtype: object

In [None]:
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

X = X.apply(lambda text: remove_rarewords(text))
X.head()

0    house dem aide didn’t even see comey’s letter ...
1    ever get feeling life circles roundabout rathe...
2    truth might get fired october 29 2016 tension ...
3    videos 15 civilians killed single us airstrike...
4    print iranian woman sentenced six years prison...
Name: text, dtype: object

In [None]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

X = X.apply(lambda text: lemmatize_words(text))
X.head()

0    house dem aide didn’t even see comey’s letter ...
1    ever get feeling life circle roundabout rather...
2    truth might get fired october 29 2016 tension ...
3    video 15 civilian kill single u airstrike iden...
4    print iranian woman sentence six year prison i...
Name: text, dtype: object

In [None]:
X = X.values
vectorizer = CountVectorizer(binary=True)
features = vectorizer.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=3)

Bernoulli Naive Bayes

In [None]:
model = BernoulliNB()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [None]:
train = model.predict(X_train)
cm = confusion_matrix(y_train, train)
print(classification_report(y_train ,train))
print("Confusion Matrix: \n", cm)
print("F1 Score: ",f1_score(y_train, train))
print("Accuracy: ", accuracy_score(y_train, train))
print("Precision: ", precision_score(y_train, train))
print("Recall: ", recall_score(y_train, train))

              precision    recall  f1-score   support

           0       0.87      0.74      0.80      8294
           1       0.78      0.89      0.83      8346

    accuracy                           0.82     16640
   macro avg       0.82      0.82      0.81     16640
weighted avg       0.82      0.82      0.81     16640

Confusion Matrix: 
 [[6164 2130]
 [ 938 7408]]
F1 Score:  0.8284500111831805
Accuracy:  0.815625
Precision:  0.7766827427133571
Recall:  0.8876108315360651


In [None]:
test = model.predict(X_test)
cm = confusion_matrix(y_test, test)
print(classification_report(y_test ,test))
print("Confusion Matrix: \n", cm)
print("F1 Score: ",f1_score(y_test, test))
print("Accuracy: ", accuracy_score(y_test, test))
print("Precision: ", precision_score(y_test, test))
print("Recall: ", recall_score(y_test, test))

              precision    recall  f1-score   support

           0       0.83      0.73      0.78      2093
           1       0.76      0.85      0.80      2067

    accuracy                           0.79      4160
   macro avg       0.79      0.79      0.79      4160
weighted avg       0.79      0.79      0.79      4160

Confusion Matrix: 
 [[1523  570]
 [ 309 1758]]
F1 Score:  0.7999999999999999
Accuracy:  0.7887019230769231
Precision:  0.7551546391752577
Recall:  0.8505079825834543
