# News Dataset Preprocessing

### Required packages

In [None]:
from pathlib import Path
import pandas as pd



### Defining paths and directory creation

In [2]:
# base locations used throughout the notebook
root_dir = Path('./')
data_dir, models_dir, plots_dir = (
    root_dir / sub for sub in ('data', 'models', 'plots')
)

# show which files are currently available in the data directory
for data_file in data_dir.glob('*'):
    print(data_file)

# make sure the output directories exist (plots first, then models)
for out_dir in (plots_dir, models_dir):
    out_dir.mkdir(exist_ok=True)

data/train.json
data/test.json


### Loading in datasets

In [3]:
# read the train and test splits from the data directory
train, test = (
    pd.read_json(data_dir / fname) for fname in ('train.json', 'test.json')
)

# quick look at the training split: dimensions, then the first rows
print(train.shape)
train.head(5)

(8263, 3)


Unnamed: 0,text,id,sentiment
0,Досудебное расследование по факту покупки ЕНПФ...,1945,negative
1,Медики рассказали о состоянии пострадавшего му...,1957,negative
2,"Прошел почти год, как железнодорожным оператор...",1969,negative
3,По итогам 12 месяцев 2016 года на территории р...,1973,negative
4,Астана. 21 ноября. Kazakhstan Today - Агентств...,1975,negative


### Data preprocessing

In [20]:
from collections import Counter
from nltk.corpus import stopwords as stopwords
from bs4 import BeautifulSoup
import re

class NewsPreprocessor:
    """Text-cleaning helpers for the news corpus.

    Each public method takes a single document and returns the cleaned
    document, so it can be passed to ``Series.apply``.  The methods do not
    print anything, because they are called once per document.

    The frequency-based filters (``remove_freqwords`` and
    ``remove_rarewords``) rank words by their whitespace-token counts in
    ``X_train``.  Those counts are kept in ``vocab`` and are computed once per
    ``X_train`` object; assigning a new object to ``X_train`` triggers a
    recount on the next call.
    """

    # Applied first so that a mention or URL is dropped whole instead of being
    # split up by the punctuation pattern below.
    _MENTION_OR_URL = re.compile(r"@\S+|https?:\S+")
    # For str patterns ``\W`` is Unicode-aware, so Cyrillic letters are kept;
    # the underscore is listed explicitly because ``\w`` includes it.
    _NON_WORD = re.compile(r"[\W_]+")

    def __init__(self, X_train, y_train):
        """Store the training texts and labels.

        Args:
            X_train: iterable of training documents (e.g. a pandas Series of
                strings) used to build the word-frequency vocabulary.
            y_train: training labels; stored but not used by the cleaners.
        """
        self.vocab = Counter()
        self.X_train = X_train
        self.y_train = y_train
        self._vocab_source = None  # the X_train object ``vocab`` was built from
        self._freq_cache = {}      # num_words  -> frozenset of frequent words
        self._rare_cache = {}      # filter_val -> frozenset of rare words
        self._stop_words = None    # loaded from NLTK on first use

    def _ensure_vocab(self):
        """Return the token counts of ``X_train``, recounting only if it changed."""
        if self._vocab_source is not self.X_train:
            counts = Counter()
            for text in self.X_train:
                # Non-string entries (e.g. missing values) carry no tokens.
                if isinstance(text, str):
                    counts.update(text.split())
            self.vocab = counts
            self._vocab_source = self.X_train
            # Cached word sets were derived from the previous counts.
            self._freq_cache = {}
            self._rare_cache = {}
        return self.vocab

    def to_lowercase(self, doc):
        """Lowercase ``doc`` and replace each CR / LF character with a space."""
        return doc.lower().replace("\r", " ").replace("\n", " ")

    def strip_html_tags(self, doc):
        """Remove HTML markup and return the visible text.

        A string is parsed as one document and a string is returned.  Any
        other iterable is treated as a sequence of fragments, each parsed
        separately, and a list of the stripped fragments is returned.
        """
        if isinstance(doc, str):
            return BeautifulSoup(doc, "html.parser").get_text()
        return [BeautifulSoup(piece, "html.parser").get_text() for piece in doc]

    def strip_special_chars(self, doc):
        """Replace mentions, URLs and punctuation with single spaces.

        Letters (of any alphabet) and digits are kept.  Each ``@mention`` and
        ``http(s):`` URL is removed first; every remaining run of characters
        that are not letters or digits, whitespace included, is then
        collapsed to one space.
        """
        without_links = self._MENTION_OR_URL.sub(" ", doc)
        return self._NON_WORD.sub(" ", without_links)

    def remove_stopwords(self, doc):
        """Drop Russian stop words from ``doc``.

        ``doc`` is split on single spaces and re-joined with single spaces.
        The NLTK stop-word list is loaded on the first call and reused.
        """
        if self._stop_words is None:
            self._stop_words = frozenset(stopwords.words('russian'))
        stop_words = self._stop_words
        return ' '.join(word for word in doc.split(' ') if word not in stop_words)

    def remove_numbers(self, doc):
        """Remove every digit character from ``doc``."""
        return ''.join(ch for ch in doc if not ch.isdigit())

    def remove_freqwords(self, doc, num_words):
        """Drop the ``num_words`` most frequent training words from ``doc``.

        Args:
            doc: document to filter; it is split on whitespace.
            num_words: how many of the most common ``vocab`` entries to remove.

        Returns:
            The remaining words joined by single spaces.
        """
        vocab = self._ensure_vocab()
        frequent = self._freq_cache.get(num_words)
        if frequent is None:
            frequent = frozenset(word for word, _ in vocab.most_common(num_words))
            self._freq_cache[num_words] = frequent
        return " ".join(word for word in str(doc).split() if word not in frequent)

    def remove_rarewords(self, doc, filter_val):
        """Drop words that occur fewer than ``filter_val`` times in training.

        Only words present in ``vocab`` are considered; a word that never
        occurs in ``X_train`` is kept.

        Args:
            doc: document to filter; it is split on whitespace.
            filter_val: minimum training count a known word needs to be kept.

        Returns:
            The remaining words joined by single spaces.
        """
        vocab = self._ensure_vocab()
        rare = self._rare_cache.get(filter_val)
        if rare is None:
            rare = frozenset(
                word for word, count in vocab.items() if count < filter_val
            )
            self._rare_cache[filter_val] = rare
        return " ".join(word for word in str(doc).split() if word not in rare)

    

# remove numbers
# train.text = train.text.apply(lambda doc: ''.join(i for i in doc if not i.isdigit())) 
# test.text = test.text.apply(lambda doc: ''.join(i for i in doc if not i.isdigit())) 

    
#train.text.apply(lambda doc: to_lowercase(doc))
#train.text.apply(lambda doc: strip_html_tags(doc))
#train.text.apply(lambda doc: remove_numbers(doc))
#train.text.apply(lambda doc: remove_freqwords(doc))

# features are the article texts, targets are the sentiment labels
X_train = train['text']
y_train = train['sentiment']

# the preprocessor keeps references to both series
processor = NewsPreprocessor(X_train, y_train)

processor

<__main__.NewsPreprocessor at 0x7f2df3367730>

In [None]:
# basic per-document cleaning
X_train = X_train.apply(processor.to_lowercase)
X_train = X_train.apply(processor.remove_stopwords)
X_train = X_train.apply(processor.remove_numbers)

# The frequency filters rank words by their counts in processor.X_train.
# The processor was built from the raw text, so point it at the cleaned text:
# the counts then use the same tokens as the documents being filtered.
processor.X_train = X_train
X_train = X_train.apply(lambda doc: processor.remove_freqwords(doc, 10))
X_train = X_train.apply(lambda doc: processor.remove_rarewords(doc, 10))

X_train