# **Introduction**

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data yang digunakan pada *preprocessing* adalah [Fake and real News Dataset](https://www.kaggle.com/code/ohseokkim/fake-news-easy-nlp-text-classification/data).

# **Download Libraries**

In [30]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
pip install demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
pip install num2words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Import Libraries**

In [36]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
import demoji
import contractions
import unidecode
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from spellchecker import SpellChecker
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# pd.options.display.max_columns=None
# pd.options.display.max_rows=None
# pd.options.display.max_colwidth=None

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Load Data**

In [37]:
dwn_url='https://drive.google.com/uc?id=' + '1zKSY4d2v6Wf3UUhmd-5d-er30PBMXzGg'
fake_df = pd.read_csv(dwn_url)
dwn_url='https://drive.google.com/uc?id=' + '1mXmP3ystGW_ALB2XZQPfIzq3xdQOh0wm'
true_df = pd.read_csv(dwn_url)

In [38]:
fake_df["output"] = 0
true_df["output"] = 1

df = pd.concat([fake_df, true_df])
df = df.sample(frac=1).reset_index(drop=True)

In [39]:
print(f'Shape of the whole data is: {df.shape[0]} rows and {df.shape[1]} columns')

Shape of the whole data is: 44898 rows and 5 columns


In [40]:
df.head()

Unnamed: 0,title,text,subject,date,output
0,PRESIDENT TRUMP’S REMARKS at Ford’s Theatre Ga...,PRESIDENT TRUMP and FIRST LADY MELANIA TRUMP a...,Government News,"Jun 4, 2017",1
1,‘F*ck Donald Trump’: Democratic Chair Of Cali...,"John Burton, the outgoing chairman of Californ...",News,"May 22, 2017",1
2,U.S sanctions North Koreans for 'flagrant' rig...,WASHINGTON (Reuters) - The United States on Th...,worldnews,"October 26, 2017",0
3,"Unlike Trump, Americans want strong environmen...",NEW YORK (Reuters) - More than 60 percent of A...,politicsNews,"January 17, 2017",0
4,Senate approves two FCC nominees as it reviews...,WASHINGTON (Reuters) - The U.S. Senate on Thur...,politicsNews,"August 3, 2017",0


# **Pre Processing Data**

## Remove

### Rare words

In [None]:
def rare_words(text):
    # tokenization
    tokens = word_tokenize(text)
    counter = Counter()
    for word in tokens:
        counter[word]= +1

    RareWords = []
    number_rare_words = 1
    # take top 1 rare words
    frequentWords = counter.most_common()
    for (word, word_count) in frequentWords[:-number_rare_words-1:-1]:
        RareWords.append(word)

    return RareWords

def remove_rw(text, RareWords):
    tokens = word_tokenize(text)
    without_rw = []
    for word in tokens:
        if word not in RareWords:
            without_rw.append(word)

    without_rw = ' '.join(without_rw)
    return without_rw

### Other

In [None]:
def remove(text):
  # remove url's
  text = re.sub(r'https?://\S+|www\.\S+', '', text)

  # remove email
  text = re.sub(r'\S+@\S+', '', text)

  # remove Hashtags and Mentions
  text = re.sub(r'(@\S+|#\S+)', '', text)

  # remove date
  ## 1. Remove date formats like: dd/mm/yy(yy), dd-mm-yy(yy), dd(st|nd|rd).mm/yy(yy)
  text = re.sub(r'\d{1,2}(st|nd|rd|th)?[-./]\d{1,2}[-./]\d{2,4}', '', text)
  ## 2. Remove date formats like: 20 apr 21, April 15th, 11th of April, 2021
  text = re.sub(r'(\d{1,2})?(st|nd|rd|th)?[-./,]?\s?(of)?\s?([J|j]an(uary)?|[F|f]eb(ruary)?|[Mm]ar(ch)?|[Aa]pr(il)?|[Mm]ay|[Jj]un(e)?|[Jj]ul(y)?|[Aa]ug(ust)?|[Ss]ep(tember)?|[Oo]ct(ober)?|[Nn]ov(ember)?|[Dd]ec(ember)?)\s?(\d{1,2})?(st|nd|rd|th)?\s?[-./,]?\s?(\d{2,4})?', r'', text)

  # remove html tags
  text = BeautifulSoup(text).get_text()

  # remove emoji
  emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"  # emoticons
                              u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                              u"\U0001F680-\U0001F6FF"  # transport & map symbols
                              u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                              u"\U00002500-\U00002BEF"  # chinese char
                              u"\U00002702-\U000027B0"
                              u"\U00002702-\U000027B0"
                              u"\U000024C2-\U0001F251"
                              u"\U0001f926-\U0001f937"
                              u"\U00010000-\U0010ffff"
                              u"\u2640-\u2642"
                              u"\u2600-\u2B55"
                              u"\u200d"
                              u"\u23cf"
                              u"\u23e9"
                              u"\u231a"
                              u"\ufe0f"  # dingbats
                              u"\u3030"
                              "]+", flags=re.UNICODE)
  text = emoji_pattern.sub(r'', text)

  # remove puctuations
  PUNCTUATIONS = string.punctuation
  text = text.translate(str.maketrans('', '', PUNCTUATIONS))

  # remove stopword
  STOPWORDS = set(stopwords.words('english'))
  text = ' '.join([word for word in text.split() if word not in STOPWORDS])

  # remove number
  text = re.sub(r'\d+', '',text)

  # remove whitespace
  text = " ".join(text.split())

  # remove rare word
  text = remove_rw(text, rare_words(text))

  return text


### Remove all of them

In [None]:
# Run Function remove
# df.text = df.text.apply(lambda x: remove(x))

df['text_clean'] = df['text'].apply(lambda x: remove(x))
df['text_clean'].sample(10)

15316    ANKARA Reuters Turkey longer needs join Europe...
42029    Will FINALLY straw breaks camel back Wow eveny...
19911    After announced President Trump seriously cons...
22676    What antiAmerican thing someone besides vote D...
36465    Donald Trump selected racist homophobe nation ...
15247    BRASILIA Reuters Brazilian President Michel Te...
40099    ListenAfter creating video included vile behea...
32683    Donald Trump bragging nonstop Carrier deal sup...
18062    Reuters US Senator Rand Paul Kentucky said ret...
14080    Initem corrects spelling Kislyak paragraph WAS...
Name: text_clean, dtype: object

## Stemming

In [None]:
stemmer = PorterStemmer()

def stem_words(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

In [None]:
df['text_stemmed'] = df['text_clean'].apply(lambda text: stem_words(text))

In [None]:
df[['text', 'text_stemmed']].sample(10)

Unnamed: 0,text,text_stemmed
2284,LONDON (Reuters) - A Briton kidnapped in Niger...,london reuter a briton kidnap nigeria last mon...
40797,If this sniper is part of the Black Lives Matt...,if sniper part black live matter group massacr...
6796,Security concerns were brought up after a blac...,secur concern brought black flag heart arab wr...
6543,Sen. John McCain is fed up with Donald Trump a...,sen john mccain fed donald trump big mouth fol...
44744,Here s a great way to explain socialism to the...,here great way explain social neighbor of cour...
37139,WASHINGTON (Reuters) - U.S. budget chief Mick ...,washington reuter us budget chief mick mulvane...
4821,Tina Fey once again took on Sarah Palin on Sat...,tina fey took sarah palin saturday night live ...
5300,ATLANTA (Reuters) - Supporters of U.S. Preside...,atlanta reuter support us presid donald trump ...
10297,"On Thursday, failed 2012 presidential candidat...",on thursday fail presidenti candid mitt romney...
39331,Stop blaming white people for Trumps win last ...,stop blame white peopl trump win last night am...


## Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()

def text_lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

In [None]:
df['text_lemmatized'] = df['text_stemmed'].apply(lambda text: text_lemmatize(text))

In [None]:
df[['text', 'text_stemmed', 'text_lemmatized']].sample(5)

Unnamed: 0,text,text_stemmed,text_lemmatized
30676,BRUSSELS (Reuters) - Sacked Catalan leader Car...,brussel reuter sack catalan leader carl puigde...,brussel reuter sack catalan leader carl puigde...
31454,"On September 15, Hillary apparently held a r...",onhillari appar held ralli old student recreat...,onhillari appar held ralli old student recreat...
1942,Keith Olbermann did not mince words. The end o...,keith olbermann minc word the end social secur...,keith olbermann minc word the end social secur...
9858,,,
22777,Today we found out who s REALLY in control of ...,today found realli control republican parti th...,today found realli control republican parti th...


## Convert accented characters to ASCII characters



In [None]:
def accented_to_ascii(text):
    return unidecode.unidecode(text)

In [None]:
df['text_stemmed'] = df['text_stemmed'].apply(lambda x: accented_to_ascii(x))
df['text_lemmatized'] = df['text_lemmatized'].apply(lambda x: accented_to_ascii(x))

# **Save clean data to csv**

In [None]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/KP MBKM Nutrifood/Dataset/clean_fake_and_real_news.csv', index=False)

In [None]:
df['text_stemmed'].to_csv('/content/drive/MyDrive/Colab Notebooks/KP MBKM Nutrifood/Dataset/steam_clean_fake_and_real_news.csv', index=False)
df['text_lemmatized'].to_csv('/content/drive/MyDrive/Colab Notebooks/KP MBKM Nutrifood/Dataset/lemma_clean_fake_and_real_news.csv', index=False)

# **Download clean data**

In [None]:
from google.colab import files
files.download('steam_clean_fake_and_real_news.csv')
files.download('lemma_clean_fake_and_real_news.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>