# News Dataset Preprocessing

### Required packages

In [None]:
from pathlib import Path
import pandas as pd



### Defining paths and directory creation

In [2]:
# Project layout: everything lives under the current working directory.
root_dir = Path('./')
data_dir, models_dir, plots_dir = (
    root_dir / 'data',
    root_dir / 'models',
    root_dir / 'plots',
)

# Show which files are currently present in the data directory.
for data_file in data_dir.glob('*'):
    print(data_file)

# Make sure the output directories exist (no error if they already do).
plots_dir.mkdir(exist_ok=True)
models_dir.mkdir(exist_ok=True)

data/train.json
data/test.json


### Loading the datasets

In [3]:
# Read the train and test splits from their JSON files in the data directory.
train_path = data_dir / 'train.json'
test_path = data_dir / 'test.json'
train = pd.read_json(train_path)
test = pd.read_json(test_path)

# Report the training frame's (rows, columns) and preview its first rows.
print(train.shape)
train.head(n=5)

(8263, 3)


Unnamed: 0,text,id,sentiment
0,Досудебное расследование по факту покупки ЕНПФ...,1945,negative
1,Медики рассказали о состоянии пострадавшего му...,1957,negative
2,"Прошел почти год, как железнодорожным оператор...",1969,negative
3,По итогам 12 месяцев 2016 года на территории р...,1973,negative
4,Астана. 21 ноября. Kazakhstan Today - Агентств...,1975,negative


### Data preprocessing

In [50]:
from collections import Counter
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re

class NewsPreprocessor:
    """Text-cleaning helpers for the news dataset.

    Stores the training texts and labels together with a token vocabulary
    (``vocab``) that drives the frequent-word and rare-word filters. The
    per-document methods take one document string and return the cleaned
    string, so they can be passed directly to ``Series.apply``.
    """

    # Same pattern as before, written as a raw string so the regex escapes
    # (``\/``, ``\d``, ``\.``) are no longer invalid Python string escapes.
    # Compiled once instead of being looked up on every call.
    # NOTE(review): the last group's character class contains a space, so
    # after a link-like match it also consumes the following run of words,
    # spaces, dots and hyphens. Left unchanged here; confirm this is intended.
    _SPECIAL_CHARS_RE = re.compile(
        r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)|[«»]"
    )

    # Lazily-built, class-wide caches so the NLTK resources are loaded once
    # rather than once per document.
    _STOPWORDS = None
    _STEMMER = None

    def __init__(self, X_train, y_train):
        """Keep the training data and start with an empty vocabulary.

        Args:
            X_train: training documents; ``get_wordcount`` uses its ``.str``
                accessor, so a pandas Series of strings is expected.
            y_train: training labels (stored, not used by this class).
        """
        self.vocab = Counter()
        self.X_train = X_train
        self.y_train = y_train

        # Sets give O(1) membership tests in remove_freqwords/remove_rarewords.
        self.FREQWORDS = set()
        self.RAREWORDS = set()

    def get_wordcount(self, docs=None):
        """Add whitespace-token counts to ``self.vocab``.

        Args:
            docs: optional Series of documents to count; defaults to
                ``self.X_train``. Counts accumulate across calls.
        """
        source = self.X_train if docs is None else docs
        for tokens in source.str.split():
            self.vocab.update(tokens)

    def get_freqwords(self, num_words):
        """Store the ``num_words`` most common vocabulary tokens in ``self.FREQWORDS``."""
        # The original assigned to a local name, so the result was discarded.
        self.FREQWORDS = {word for word, _ in self.vocab.most_common(num_words)}

    def get_rarewords(self, filter_val):
        """Add every token seen fewer than ``filter_val`` times to ``self.RAREWORDS``."""
        rare = {word for word, count in self.vocab.items() if count < filter_val}
        # Union with the existing collection keeps the accumulate-across-calls
        # behaviour and tolerates RAREWORDS having been set to a list.
        self.RAREWORDS = set(self.RAREWORDS) | rare

    def to_lowercase(self, doc):
        """Lowercase the text and replace carriage returns/newlines with spaces."""
        return doc.lower().replace("\r", " ").replace("\n", " ")

    def strip_html_tags(self, doc):
        """Remove HTML tags from the text.

        A string is parsed as a whole and returned as a string. Any other
        iterable is treated as a sequence of strings and a list of stripped
        strings is returned, as before.
        """
        if isinstance(doc, str):
            # Iterating a string yields single characters, which can never
            # contain a complete tag; parse the whole document instead.
            return BeautifulSoup(doc, "html.parser").get_text()
        return [BeautifulSoup(part, "html.parser").get_text() for part in doc]

    def strip_special_chars(self, doc):
        """Replace link-like substrings and the quote marks « » with a space."""
        return self._SPECIAL_CHARS_RE.sub(" ", doc)

    def remove_stopwords(self, doc):
        """Drop Russian stopwords (NLTK list) from a space-separated document."""
        if NewsPreprocessor._STOPWORDS is None:
            NewsPreprocessor._STOPWORDS = frozenset(stopwords.words('russian'))
        stop = NewsPreprocessor._STOPWORDS
        return ' '.join(word for word in doc.split(' ') if word not in stop)

    def remove_numbers(self, doc):
        """Remove every digit character from the text."""
        return ''.join(ch for ch in doc if not ch.isdigit())

    def remove_freqwords(self, doc):
        """Remove tokens listed in ``self.FREQWORDS``; result is single-space joined."""
        return " ".join(word for word in str(doc).split() if word not in self.FREQWORDS)

    def remove_rarewords(self, doc):
        """Remove tokens listed in ``self.RAREWORDS``; result is single-space joined."""
        return " ".join(word for word in str(doc).split() if word not in self.RAREWORDS)

    @staticmethod
    def stemmer(doc):
        """Stem every space-separated token with the Russian Snowball stemmer.

        Declared static because it takes no ``self``; this makes both
        ``NewsPreprocessor.stemmer(doc)`` and ``instance.stemmer(doc)`` work.
        """
        if NewsPreprocessor._STEMMER is None:
            NewsPreprocessor._STEMMER = SnowballStemmer("russian")
        stem = NewsPreprocessor._STEMMER.stem
        return ' '.join(stem(word) for word in doc.split(' '))

# Features are the 'text' column; targets are the 'sentiment' column.
X_train = train['text']
y_train = train['sentiment']
processor = NewsPreprocessor(X_train=X_train, y_train=y_train)

<__main__.NewsPreprocessor at 0x7f2df3459090>

In [None]:
# Cleaning pipeline: every step maps one document string to a cleaned string,
# and a progress line is printed after each step.
X_train = X_train.apply(processor.to_lowercase)
print("to_lowercase executed")
X_train = X_train.apply(processor.remove_stopwords)
print("remove_stopwords executed")
X_train = X_train.apply(processor.strip_special_chars)
print("strip_special_chars executed")
X_train = X_train.apply(processor.remove_numbers)
print("remove_numbers executed")

# Count tokens on the cleaned text. The processor otherwise still holds the
# raw texts it was constructed with, so the frequent/rare word lists would be
# built from tokens that differ from the ones being filtered below.
processor.X_train = X_train
processor.get_wordcount()
processor.get_freqwords(10)
X_train = X_train.apply(processor.remove_freqwords)
print("remove_freqwords executed")
processor.get_rarewords(5)
X_train = X_train.apply(processor.remove_rarewords)
print("remove_rarewords executed")

# The original referenced an undefined name (`preprocessor`). `stemmer` takes
# only the document, so it is called through the class rather than an instance.
X_train = X_train.apply(NewsPreprocessor.stemmer)
print("stemmer executed")

X_train

to_lowercase executed
remove_stopwords executed
strip_special_chars executed
remove_numbers executed
remove_freqwords executed
