# News Dataset Preprocessing

### Required packages

In [1]:
from pathlib import Path
import pandas as pd

from core.preprocessing import NewsPreprocessor

### Defining paths and creating directories

In [2]:
# Project layout: every working directory hangs off the notebook's folder.
root_dir = Path('./')
data_dir = root_dir.joinpath('data')
models_dir = root_dir.joinpath('models')
plots_dir = root_dir.joinpath('plots')

# Make sure the output directories exist (no error if they already do).
plots_dir.mkdir(exist_ok=True)
models_dir.mkdir(exist_ok=True)

# Show which files are currently present in the data directory.
for data_file in data_dir.glob('*'):
    print(data_file)

data/train.json
data/test_cleaned.csv
data/train_cleaned.csv
data/test.json


### Loading the datasets

In [3]:
# Read both JSON splits from the data directory into DataFrames.
train_path = data_dir / 'train.json'
test_path = data_dir / 'test.json'
train = pd.read_json(train_path)
test = pd.read_json(test_path)

# Report the training frame's dimensions, then preview its first five rows.
print(train.shape)
train.head()

(8263, 3)


Unnamed: 0,text,id,sentiment
0,Досудебное расследование по факту покупки ЕНПФ...,1945,negative
1,Медики рассказали о состоянии пострадавшего му...,1957,negative
2,"Прошел почти год, как железнодорожным оператор...",1969,negative
3,По итогам 12 месяцев 2016 года на территории р...,1973,negative
4,Астана. 21 ноября. Kazakhstan Today - Агентств...,1975,negative


### Data preprocessing

In [4]:
# Pull out the columns used downstream: documents and labels from the
# training frame, documents only from the test frame.
X_train = train['text']
y_train = train['sentiment']
X_test = test['text']

# The preprocessor is constructed from the training documents and labels;
# its behavior is defined in core.preprocessing.
processor = NewsPreprocessor(X_train, y_train)

In [None]:
# Clean X_train step by step and store the result in train_cleaned.csv.
# Each step is a NewsPreprocessor method applied to every document in turn;
# what the individual methods do is defined in core.preprocessing.

# Steps that run back to back with no extra calls on the processor.
for step_name in ("to_lowercase", "strip_punctuation", "remove_stopwords",
                  "strip_special_chars", "remove_numbers"):
    X_train = X_train.apply(getattr(processor, step_name))
    print(f"{step_name} executed")

# These two calls run right before remove_freqwords -- presumably they
# prepare the word list it relies on (TODO confirm in core.preprocessing).
processor.get_wordcount()
processor.get_freqwords(10)
X_train = X_train.apply(processor.remove_freqwords)
print("remove_freqwords executed")

# Likewise called just ahead of remove_rarewords; its effect is not visible
# from this notebook.
processor.get_common_enough_words(100)
for step_name in ("remove_rarewords", "stemmer"):
    X_train = X_train.apply(getattr(processor, step_name))
    print(f"{step_name} executed")

# Labels come from the processor itself (it was given y_train on creation).
y_train = processor.label_encoder()
print("label encoder executed")

# Bundle cleaned documents and labels into one frame and write it to disk.
train_cleaned = pd.DataFrame({
    'text': X_train,
    'sentiment': y_train,
})
train_cleaned.to_csv(data_dir.joinpath('train_cleaned.csv'))

to_lowercase executed
strip_punctuation executed
remove_stopwords executed
strip_special_chars executed
remove_numbers executed
remove_freqwords executed


In [None]:
# Clean X_test step by step and store the result in test_cleaned.csv.
# Each step is a NewsPreprocessor method applied to every document in turn;
# what the individual methods do is defined in core.preprocessing.

# Steps that run back to back with no extra calls on the processor.
for step_name in ("to_lowercase", "strip_punctuation", "remove_stopwords",
                  "strip_special_chars", "remove_numbers"):
    X_test = X_test.apply(getattr(processor, step_name))
    print(f"{step_name} executed")

# These two calls run right before remove_freqwords -- presumably they
# prepare the word list it relies on (TODO confirm in core.preprocessing).
processor.get_wordcount()
processor.get_freqwords(10)
X_test = X_test.apply(processor.remove_freqwords)
print("remove_freqwords executed")

# NOTE(review): the train cell calls get_common_enough_words(100) at this
# point, whereas this cell calls get_rarewords(5) -- confirm the difference
# between the two splits is intentional.
processor.get_rarewords(5)
for step_name in ("remove_rarewords", "stemmer"):
    X_test = X_test.apply(getattr(processor, step_name))
    print(f"{step_name} executed")

# The test split has no labels here, so only the cleaned text is written out.
test_cleaned = pd.DataFrame({
    'text': X_test,
})
test_cleaned.to_csv(data_dir.joinpath('test_cleaned.csv'))

In [None]:
# Spot-check the cleaned text stored under index label 0.
train_cleaned['text'][0]