# Pre-processing of Chinese text data

This code prepares scraped texts for NLP tasks and other analysis. Specifically, it cleans and tokenizes the text and cleans the dates.
Columns for some basic counts and other features are also included (and more will likely be added), as well as filtering options (e.g. to drop entries without keywords).

In [None]:
#### BIGGER TO-DOS ###########
'''
1. bi and tri-grams in this script?
2. Move all "counts" to e.g. feature engineering ??
a. I lean against and in fact think all simple feature engineering should get moved here.
'''

In [4]:
# Imports 

import jieba
from jieba import posseg as pseg

import pandas as pd
import numpy as np

import re
import string
from datetime import datetime
import time
from itertools import combinations, permutations, chain

import os
from os import listdir
from os.path import isfile, join

# Main script

In [97]:
class PreProcess():
    def __init__(self, file):
        self.file = file
        self.colnames = ['title', 'url', 'date', 'text']
        #self.df = pd.read_csv(self.file, names=self.colnames, header=None, lineterminator='\n') #  engine='python'
        CONVERTERS = {'text': eval}
        
        self.df = pd.read_csv(self.file, lineterminator='\n')
        self.incomplete_df = None
        self.df_complete = None
    
    def save(self, filename):
        filename = filename 
        
        if not os.path.exists('../DATA/processed/'):
            os.makedirs('../DATA/processed/') 
        
        self.df_complete.to_csv('../DATA/processed/' + filename + '_clean.csv', index=False)
        self.incomplete_df.to_csv('../DATA/processed/' + filename + '_incomplete.csv', index=False)

        print('SAVED')
   
    # pseudo code
    def rename_columns(self, column_names=['title', 'url', 'date', 'text']):
        new_name_dict = dict(zip(self.colnames, column_names))
        self.df.rename(columns=new_name_dict, inplace=True)

    def mask_df(self, df, column, mask_keys, inverse=False):
        mask = df[column].isin(mask_keys)
        if inverse == True:
            df = df[~mask]
        else:
            df = df[mask]
        return df
    
    def get_complete_entries(self):
        # This assumes NO english etc in text / if not, just drop 'EMPTY' or whatever you want
        self.df.fillna('EMPTY', inplace = True)
        #get non-chinese entries from text
        mask_keys = list(self.df.text[self.df['text'].str.match(r"^[a-zA-Z0-9\s]*$")].unique())
        print('Dropped the following error keys: ' + str(mask_keys))
        self.incomplete_df = self.mask_df(self.df, 'text', mask_keys)
        self.df_complete = self.mask_df(self.df, 'text', mask_keys, inverse=True)
        
        # drop empty text strings / BIT REDUNDANT
        self.df_complete = self.df_complete[self.df_complete['text'].map(len) > 0] #df[df['TEST'].map(lambda d: len(d)) > 0]
        self.incomplete_df_1= self.df_complete[self.df_complete['text'].map(len) < 1]

        #drop empty text rows / REDUNDANT - above non-chinese entry check covers this. Why did I keep ?
        self.df_complete = self.mask_df(self.df_complete, 'text', ['EMPTY',  'ERROR', '404'], inverse=True)
        self.incomplete_df_2= self.mask_df(self.df_complete, 'text', ['EMPTY',  'ERROR', '404'], inverse=False)

        self.incomplete_df = pd.concat([self.incomplete_df, self.incomplete_df_1, self.incomplete_df_2])
     
        print('Complete entries: ' + str(len(self.df_complete)))
        print('Problematic entries: ' + str(len(self.incomplete_df)))
    
    def search_keyword_list(self, keywords):
        '''
        IN PROGRESSS !!!
        '''
        def key_check(txt):
            keys = []
            for k in keywords:
                if k in txt:
                    keys.append(k)
                else:
                    pass
            
            return(keys)
        
        self.df_complete['keys_present'] = self.df_complete['text'].dropna().apply(lambda x: key_check(x))
        
        check_list = self.df_complete[self.df_complete['keys_present'].map(len) > 0]
        print('At least one keyword in text: ' + str(len(check_list)))

    def search_keyword(self, keyword):

        keyword = keyword
        self.df_complete['in_title'] = self.df_complete['title'].str.contains(keyword)
        self.df_complete['in_text'] = self.df_complete['text'].str.contains(keyword)
        
        # Checking 'search term in' counts
        self.df_complete['in_title'].value_counts()
        self.df_complete['in_text'].value_counts()
        
        # Not used: use only if you want to cut anything without key, and without inspection/tokenizing, etc.
        # mask = self.df_complete['in_title'] | self.df_complete['in_text'] == True
        # df_complete_confirmed = self.df_complete[mask]
        
        print('Keyword confirmed in title or text: ' + str(len(df_complete_confirmed)))

    def filter_keyword_in_tokens(self, keywords, filter=True):
        
        keep_keys = keywords  # list ['气候']

        def keys_only(txt):
            return [w for w in txt if w.lower() in keep_keys]

        self.df_complete['tokens_keys'] = self.df_complete['hard_tokens'].dropna().apply(lambda x: keys_only(x))

        if filter is True:
            self.df_complete = self.df_complete[self.df_complete['tokens_keys'].map(lambda d: len(d)) > 0]

        print('Original data length: ' + str(len(df)))
        print('Dropping: ' + str(len(df_drop)))
        print('End data length: ' + str(len(df) - len(df_drop)))


    # This can all go for PD, but still has general value - thus perhaps clean up
    def datetime_parse(self):
        # Removing problems found - goal to make more general, eg. 1 digit follow by 1 letter
        self.df_complete['date_standard'] = self.df_complete['date'].str.replace('5G', '')

        # Remove all non-alpha-numeric
        self.df_complete['date_standard'] = self.df_complete['date_standard'].str.replace(r'[^0-9a-zA-Z:]+', ' ')
        # Remove any starting tag with :
        self.df_complete['date_standard'] = self.df_complete['date_standard'].str.replace(r'^\w+:\s', ' ')
        self.df_complete['date_standard'] = self.df_complete['date_standard'].str.split('Updated: ', expand=True)
        
        # Remove time # TO DO: ensure AM PM optional * but should be since it works!
        self.df_complete['date_standard'] = self.df_complete['date_standard'].str.replace(r'\b(([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9](:[0-5][0-9])?\s?([AaPp][Mm])?)', ' ')

        # TO ADD: check for full month names!
        
        # Removing digits longer than 4 long and words longer than 3 long
        self.df_complete['date_standard'] = self.df_complete['date_standard'].str.replace(r'[1-9]\d{4,}', ' ')
        self.df_complete['date_standard'] = self.df_complete['date_standard'].str.replace(r'[a-zA-Z]\d{,3}', '')
        
        # Strip loose whitespace
        self.df_complete['date_standard'] = self.df_complete['date_standard'].str.strip()
        
        # Replacing all non numeric with a standardized entry
        self.df_complete = self.df_complete.replace('nan',0)
        self.df_complete = self.df_complete.replace('NaN',0)
        self.df_complete = self.df_complete.replace('EMPTY',0)
        self.df_complete = self.df_complete.replace('',0)
        self.df_complete = self.df_complete.fillna(0)

                
        # Lists for parsing
        month = ['%b', '%m', '%B']
        day = ['%d']
        year = ['%Y', '%y']

        varieties = list(permutations(chain(month,day,year), 3))
        for v in varieties:
            v = ' '.join(v)
            try:
                self.df_complete['datetime'] = [datetime.strptime(str(d), v) if d != 0 else d for d in self.df_complete['date_standard']]
                self.df_complete['datetime'] = [d.date() if d != 0 else d for d in self.df_complete['datetime']]
                if self.df_complete['datetime'] is not None:
                    print('Successfully parsed with format: ' + v)
                    break
            except:
                #print('Failed: ' + v)
                pass
        return self.df_complete.head()

    
    def prep_tokenize(self):
        # TO DO: make hard tokenize a param 

        # Removing loose html code from text
        self.df_complete['text'] = self.df_complete['text'].str.replace('\xa0', '')
        self.df_complete['text'] = self.df_complete['text'].str.replace('\u3000', '') 
        self.df_complete['text'] = self.df_complete['text'].str.replace('  ', ' ')
        self.df_complete['text'] = self.df_complete['text'].str.replace('   ', ' ')
        print('HTML tags removed')

        def remove_punctuation(txt):
            ##############
            # UPDATE - check works
            ###############

            additions = ['：', '，', '《', '。', '》', '“', '„', ':', '一', '・', '«', '»', '”', '“'] 
            
            txt_nopunct = re.sub(r'[^\w\s]', '', txt) # removes all non characters, i.e. punct. - suprised it keeps chinese!
            txt_nopunct = re.sub(r'\b\s+\b', ' ', txt_nopunct) # removes more than one space

            txt_nopunct = txt_nopunct.replace('\n', '')
            txt_nopunct = txt_nopunct.replace('\t', '')
            txt_nopunct = txt_nopunct.replace('\r', '')
            txt_nopunct = txt_nopunct.replace('\u3000', '') # repetative but to be safe

            txt_nopunct = ''.join([c for c in txt_nopunct if c not in additions]) # first regex seems to handle this, but redundancy good

            txt_nopunct = txt_nopunct.strip()

            return txt_nopunct

        def remove_spaces(txt):
            txt_nospace = re.sub(r'\s', '', txt) 
            return txt_nospace

        def tokenize(txt):
            #jieba.add_word('于吉', freq=None, tag='nr')
            words = jieba.cut(txt, cut_all=False) #HMM=False ## True is default
            words = [str(word) for word in words]
            return words

        def remove_stopwords(txt):
            txt_nostop = [w for w in txt if w.lower() not in stop_words]
            return txt_nostop
        
        def hard_tokenize(txt):
            regex = re.compile(u'[^\u4E00-\u9FA5]')
            hard = regex.sub('', txt)
            return hard

        def get_pos(txt):
            #words_pseg = pseg.cut(hard_tokenize(txt)) # use instead of below if text_clean not present
            words_pseg = pseg.cut(txt)
            pos = [str(x) for x in words_pseg]
            return pos

        def get_numbers(txt):
            x = re.findall(r'\d+', txt)
            return len(x)

        def pos_count(pos, txt):
            x = [token for token in txt if token.endswith(pos)]
            #y = [token.split('/')[0] for token in x] # use when I need lists with just these!
            return len(x)
            #return y

        # Creating initial tokens column just with cleaned text. Unlike normal tokenizing, keeping numbers at this point.

        self.df_complete['tokens'] = self.df_complete['text'].dropna().apply(lambda x: remove_punctuation(x))
        #self.df_complete['tokens'] = self.df_complete['tokens'].str.lower()
        print('Punctuation removed')
        
        # Features and counts
        self.df_complete['numbers_count'] =  self.df_complete['tokens'].apply(lambda x: get_numbers(x))

        print('Counts done')        
        
        # Remove spaces - OPTIONAL ?
        #self.df_complete['tokens'] = self.df_complete['tokens'].dropna().apply(lambda x: remove_spaces(x))

        # Tokenizing
        #self.df_complete['tokens'] = self.df_complete['tokens'].dropna().apply(lambda x: tokenize(x))
        #print('Tokenization done')

        self.df_complete['text_clean'] = self.df_complete['text'].dropna().apply(lambda x: hard_tokenize(x))
        self.df_complete['hard_tokens'] = self.df_complete['text_clean'].dropna().apply(lambda x: tokenize(x))
        print('Hard tokenization done')
        
        # POS list
        # WHY I AM USING TEXT CLEAN!!!??? - but it worked... believe it is due to jieba function doing it automatically
        self.df_complete['pos_tokens'] = self.df_complete['text_clean'].dropna().apply(lambda x: get_pos(x))
        print('POS tagging done')
        
        self.df_complete['verb_count'] =  self.df_complete['pos_tokens'].apply(lambda x: pos_count('v', x))
        self.df_complete['noun_count'] =  self.df_complete['pos_tokens'].apply(lambda x: pos_count('n', x))

        print('POS counts done')

        # Removing stop words
        data = pd.read_csv('../INPUT/stopwords-zh.txt', header=None)
        stop_words = data[0].tolist()
        
        #stop_words_add = ['・', '”', '“', ' ', '\u3000']
        #stop_words = stop_words + stop_words_add

        ########## ####
        # ATTN - could do to both... but leaving tokens with extra info for now
        ###############

        #self.df_complete['tokens'] = self.df_complete['tokens'].dropna().apply(lambda txt: remove_stopwords(txt))
        self.df_complete['hard_tokens'] = self.df_complete['hard_tokens'].dropna().apply(lambda txt: remove_stopwords(txt))

        print('Stop words dropped')

# Start: processing

In [None]:
PATH = '../DATA/'

# Getting list of files in data folder (every regular file, not only CSVs).
# Sorted, because listdir() returns entries in arbitrary order and the file
# is later picked by its position in this list.
csvfiles = sorted(f for f in listdir(PATH) if isfile(join(PATH, f)))
print('Index, Filename')
print(list(enumerate(csvfiles)))

In [None]:
# Specify the file to work with by its position in the listing printed above
# (an index is used just to keep typing short).
index = 4
file =  csvfiles[index]
print(file)

In [None]:
# Words and their part-of-speech tags can be added to jieba like this.
# Useful if you know your text has lots of special terms that matter to your analysis, since tokenizing is never perfect, even more so in Chinese.
# TO DO: make a param for the class that allows you to enter a bunch of words (e.g. as a dict) and add them all at once.
jieba.add_word('极端天气', freq=None, tag='n')
jieba.add_word('可持续发展', freq=None, tag='nt')

In [None]:
# TO DO: Add a load function so I can use only path and file name in PreProcess
# Here just double-checking that they are what I want
print(PATH)
print(file)

In [98]:
# Load the selected file; PreProcess reads it into processing.df
processing = PreProcess(PATH + file)

In [None]:
# Just a quick check that the loaded data looks OK (last two rows)
processing.df.tail(2)

In [None]:
# Split the data into usable entries (processing.df_complete) and
# problematic ones (processing.incomplete_df)
processing.get_complete_entries()

In [102]:
# Search is never perfect: the next two functions check our text strings for keywords.
# If testing with my keywords and params, the confirmed set will be a lot shorter than the overall data.
# This is because in 2.1 we collected with broad = True, iirc

keywords = ['气候变化', '气候变', '气候变迁', '气候']
key_keyword = '气候'

In [None]:
# Record, per entry, which of the keywords occur in its text (column 'keys_present')
processing.search_keyword_list(keywords)

In [None]:
# Flag whether the main keyword occurs in each title / text (columns 'in_title', 'in_text')
processing.search_keyword(key_keyword)


In [None]:
# Drops entries if the keyword is not among the text tokens; set filter=False to keep all entries but still create the column saying which keywords are in the text.
# Signature: filter_keyword_in_tokens(self, keywords, filter=True)
# NOTE: this reads the 'hard_tokens' column, which is only created by prep_tokenize() - that cell has to be run before this one.
# TO DO: test!!! as I moved this from elsewhere and could have made an error

processing.filter_keyword_in_tokens(['气候'])

In [None]:
# If the date was obtained elsewhere (e.g. for People Daily) this is not needed, but it is still nice to run: it can confirm day/month order etc.
processing.datetime_parse()

In [None]:
# Run our cleaning and tokenizer (adds the token, POS and count columns to processing.df_complete)

processing.prep_tokenize()

In [None]:
# Just a last look at the processed data (first row)

processing.df_complete.head(1)

In [112]:
# Drop the punctuation-free 'tokens' column before saving.
# TO DO: Move into class
# OR KEEP? It still has the numbers... figure out what to do with numbers, then decide.
# Best: turn numbers into words...
processing.df_complete.drop('tokens', axis=1, inplace=True)

In [None]:
# TO DO: Move into class
# Base name for the output files: the input name without its extension.
# splitext() removes only the final extension, so names that contain further
# dots (e.g. 'site.2020.csv') are not cut at the first dot.
filename = os.path.splitext(file)[0]
filename

In [None]:
# Write <filename>_clean.csv and <filename>_incomplete.csv to ../DATA/processed/
processing.save(filename)

## English tokenizing

Just in case you need it, placed here for convenience for now. But eventually it will be moved or merged elsewhere... maybe into a check-language function.

In [None]:
# keep in mind you need to install nltk stuff iirc

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
# English stop words from nltk, as a set for fast membership tests
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stopwords(txt):
    """Return the tokens of `txt` whose lowercase form is not in `stop_words`.

    `txt` is an iterable of token strings; `stop_words` is the module-level
    collection defined in the cell above. Order and casing of the remaining
    tokens are kept.
    """
    kept = []
    for token in txt:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
# Tokenize the 'tokens' column of every row that has no missing value in any column.
# NOTE(review): `df` is not defined anywhere in this notebook - it presumably has to be
# loaded first (e.g. processing.df_complete); confirm before running.
df['tokens'] = df.dropna().apply(lambda row: nltk.word_tokenize(row['tokens']), axis=1)

In [None]:
# Remove the English stop words from every token list
df['tokens'] = df['tokens'].dropna().apply(lambda txt: remove_stopwords(txt))

# Cut

This is stuff I've cut. Placed here for now in case I realize later in the process that I need it, but it will eventually be deleted.

In [None]:

    def datetime_from_clean(self):
        """Parse an already tidy 'date_standard' column into dates.

        If you know your dates are pretty clean you can use this instead of
        the full cleaning. Surrounding whitespace is stripped and the string
        '0' is turned into the integer 0, which marks "no date". The result
        is stored in the 'datetime' column (``datetime.date`` objects, 0 for
        entries without a date); it is only set when one single format parses
        every non-zero entry.

        Returns
        -------
        pandas.DataFrame
            ``self.df_complete.head()`` for a quick visual check.
        """
        # Lists for parsing
        month = ['%b', '%m', '%B']
        day = ['%d']
        year = ['%Y', '%y']
        group = {**dict.fromkeys(month, 'month'), **dict.fromkeys(day, 'day'), **dict.fromkeys(year, 'year')}

        # Strip strings only: the .str accessor would turn the integer 0
        # placeholders into NaN, which then never parse.
        cleaned = self.df_complete['date_standard'].map(lambda d: d.strip() if isinstance(d, str) else d)
        self.df_complete['date_standard'] = cleaned.replace('0', 0)

        # Skip combinations that repeat a month or year directive
        # (e.g. '%b %m %B'); they can never describe a date.
        varieties = [
            combo for combo in permutations(chain(month, day, year), 3)
            if len({group[directive] for directive in combo}) == 3
        ]

        values = list(self.df_complete['date_standard'])
        for combo in varieties:
            fmt = ' '.join(combo)
            try:
                # All or nothing: one entry that does not fit rejects the format.
                parsed = [datetime.strptime(str(d), fmt).date() if d != 0 else d for d in values]
            except ValueError:
                print('Failed: ' + fmt)
                continue
            self.df_complete['datetime'] = parsed
            print('Successfully parsed with format: ' + fmt)
            break
        return self.df_complete.head()