In [None]:
from google.colab import drive
from google.colab import files
# Mount Google Drive at /content/gdrive. Later cells read data through the
# relative path "gdrive/MyDrive/..." -- presumably the working directory is
# /content; confirm if running outside the default Colab setup.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import os
import json
import pandas as pd
import re

In [None]:
# Show the contents of the project data folder on the mounted Drive.
os.listdir("gdrive/MyDrive/BT4222/Data")

['22FEB_fakenewsnet_dataset.zip', 'politifact.json', 'gossipcop.json']

In [None]:
# Parse the raw PolitiFact and GossipCop JSON files. Using `with` guarantees
# each file handle is closed as soon as its contents have been parsed, instead
# of leaving the handles open for the lifetime of the kernel.
with open("gdrive/MyDrive/BT4222/Data/politifact.json", "r") as f:
    politifact_data = json.load(f)
with open("gdrive/MyDrive/BT4222/Data/gossipcop.json", "r") as f:
    gossipcop_data = json.load(f)

### Functions for processing

In [None]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.66-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 4.9 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 63.2 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.0 contractions-0.1.66 pyahocorasick-1.4.4 textsearch-0.0.21


In [None]:
# Download the large English spaCy model that pipeline() loads with
# spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=b17c38b282873da8a4a3f444603ba03a6d342995418812ec979d25d6e8f780e8
  Stored in directory: /tmp/pip-ephem-wheel-cache-rg853sn8/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
import contractions
import spacy
import string
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex

In [None]:
# Punctuation characters consulted by remove_punctuation(): the ASCII set from
# string.punctuation plus the right single quotation mark (curly apostrophe).
PUNCT = string.punctuation + "’"

In [None]:
def remove_tags(text):
    """Normalise line breaks, escaped apostrophes and repeated spaces.

    Args:
        text: Raw input string.

    Returns:
        The string with every newline replaced by a space, every literal
        backslash-apostrophe sequence (``\\'``) replaced by a plain
        apostrophe, and runs of spaces collapsed to a single space.
    """
    text = text.replace("\n", " ")
    # The previous pattern "\'" is the same Python string as "'" and therefore
    # replaced an apostrophe with itself. Match the two-character sequence
    # backslash + apostrophe so escaped apostrophes are actually un-escaped.
    text = text.replace("\\'", "'")
    # Collapse after the replacements above, which may introduce new spaces.
    text = re.sub(" +", " ", text)
    return text

In [None]:
def expand_contractions(text):
    """Expand contractions in ``text`` one whitespace-separated word at a time.

    Each word produced by ``text.split()`` is passed through
    ``contractions.fix`` and the results are re-joined with single spaces, so
    any run of whitespace in the input ends up as one space in the output.

    Args:
        text: Input string.

    Returns:
        The space-joined string of expanded words.
    """
    return ' '.join(contractions.fix(word) for word in text.split())

In [None]:
def pipeline():
    """Load ``en_core_web_lg`` and replace its tokenizer with one using custom infix rules.

    The model's default infix patterns are edited before being compiled:
    the generic "operator between numbers" pattern is swapped for two narrower
    ones (per the original author's notes, this drops the digit-hyphen-digit
    split), and the pattern containing the hyphen/dash alternation is filtered
    out (noted by the author as the "- between letters" rule). Prefix, suffix,
    token-match and exception rules are taken from the loaded model.

    Returns:
        The loaded spaCy pipeline with ``nlp.tokenizer`` replaced.

    Raises:
        ValueError: If the expected default infix pattern is not present in
            ``nlp.Defaults.infixes`` (``list.remove`` fails).
    """
    nlp = spacy.load("en_core_web_lg")

    # Start from the stock infix patterns and take out the generic
    # operator-between-numbers rule.
    stock_patterns = list(nlp.Defaults.infixes)
    stock_patterns.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")

    # Narrower replacements for the rule removed above.
    numeric_patterns = [r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"]

    # Keep every pattern except the one holding the hyphen/dash alternation.
    kept_patterns = []
    for pattern in stock_patterns + numeric_patterns:
        if '-|–|—|--|---|——|~' in pattern:
            continue
        kept_patterns.append(pattern)

    infix_regex = compile_infix_regex(kept_patterns)

    # Everything except the infix matcher is reused from the model's tokenizer.
    default_tokenizer = nlp.tokenizer
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        prefix_search=default_tokenizer.prefix_search,
        suffix_search=default_tokenizer.suffix_search,
        infix_finditer=infix_regex.finditer,
        token_match=default_tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )
    return nlp

def lemma(text, nlp):
    """Return ``text`` with each token replaced by its lemma.

    Tokens whose lemma is the placeholder ``'-PRON-'`` keep their original
    surface text instead.

    Args:
        text: Input string.
        nlp: Callable that turns ``text`` into an iterable of tokens exposing
            ``lemma_`` and ``text`` attributes.

    Returns:
        The per-token results joined with single spaces.
    """
    pieces = []
    for token in nlp(text):
        base_form = token.lemma_
        pieces.append(token.text if base_form == '-PRON-' else base_form)
    return ' '.join(pieces)

In [None]:
def stopword_removal(text, nlp, stop=None):
    """Drop stop-word tokens from ``text``.

    Args:
        text: Input string.
        nlp: spaCy-style pipeline; called on ``text`` to tokenise it and
            queried for ``nlp.Defaults.stop_words``.
        stop: Optional iterable of extra stop words that apply to this call
            only.

    Returns:
        The remaining token texts joined with single spaces.
    """
    all_stopwords = nlp.Defaults.stop_words
    if stop:
        # Build a fresh set rather than adding to nlp.Defaults.stop_words:
        # that set is shared, so mutating it would make the extra words leak
        # into every later call (and into anything else using the defaults).
        all_stopwords = set(all_stopwords).union(stop)
    doc = nlp(text)
    tokens_without_sw = [token.text for token in doc if token.text not in all_stopwords]
    return " ".join(tokens_without_sw)

In [None]:
def remove_punctuation(text, nlp):
    """Drop tokens that consist solely of punctuation characters.

    Args:
        text: Input string.
        nlp: Callable that turns ``text`` into an iterable of tokens exposing
            a ``text`` attribute.

    Returns:
        The remaining token texts joined with single spaces.
    """
    kept = [
        token.text
        for token in nlp(text)
        # PUNCT is a string, so `token.text in PUNCT` is a substring test: it
        # let multi-character tokens such as "..." or "--" through while
        # dropping ones like "./" that happen to be adjacent in PUNCT. Checking
        # character by character removes every punctuation-only token, which
        # includes everything the substring test used to remove.
        if not all(char in PUNCT for char in token.text)
    ]
    return ' '.join(kept)

In [None]:
def clean_pipe(text, nlp):
    """Run the full cleaning sequence on one string.

    Steps, in order: lowercase, ``remove_tags``, ``expand_contractions``,
    ``lemma``, ``stopword_removal`` (default stop words only) and
    ``remove_punctuation``.

    Args:
        text: Raw input string.
        nlp: Pipeline object forwarded to the three token-based steps.

    Returns:
        The cleaned string.
    """
    # Pure string steps first; these do not need the pipeline.
    cleaned = expand_contractions(remove_tags(text.lower()))
    # Token-based steps all share the (text, nlp) call shape.
    for step in (lemma, stopword_removal, remove_punctuation):
        cleaned = step(cleaned, nlp)
    return cleaned

In [None]:
# Build the customised pipeline once; the cleaning loops below pass this same
# object to clean_pipe() for every article.
nlp = pipeline()

### Clean PolitiFact

In [None]:
# Add cleaned copies of the title and body text to every PolitiFact article
# (the article dicts are updated in place).
for article in politifact_data:
    for raw_key, clean_key in (('title', 'title_clean'), ('text', 'text_clean')):
        article[clean_key] = clean_pipe(article[raw_key], nlp)

# Write the augmented records back to the Drive data folder.
with open("gdrive/MyDrive/BT4222/Data/politifact_clean.json", 'w') as f:
    json.dump(politifact_data, f)

### Clean GossipCop

In [None]:
# Add cleaned copies of the title and body text to every GossipCop article
# (the article dicts are updated in place).
for article in gossipcop_data:
    for raw_key, clean_key in (('title', 'title_clean'), ('text', 'text_clean')):
        article[clean_key] = clean_pipe(article[raw_key], nlp)

# Write the augmented records back to the Drive data folder.
with open("gdrive/MyDrive/BT4222/Data/gossipcop_clean.json", 'w') as f:
    json.dump(gossipcop_data, f)