## Introduction
This notebook creates more than 40 manual statistical features for each dataset.   
The original dataset is large, so a full run takes a long time.   
Only an example run (on the first 100 rows) is shown here.  
The same procedure is repeated for the training data and the testing data.

In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import warnings
warnings.filterwarnings('ignore')
import textblob

In [2]:
# Input dataset. The same pipeline is run once per input file; switch the
# active path below to process one of the other files instead.
file = '../input/bt5153-applied-machine-learning-2021-spring/train.csv'
#file = '../input/bt5153-applied-machine-learning-2021-spring/test.csv'
#file = '../input/bn-vect-manual-out95/test_over95.csv'
df = pd.read_csv(file)

# A full run is slow, so by default only the first EXAMPLE_ROWS rows are kept.
# Set EXAMPLE_RUN = False to process the whole file.
EXAMPLE_RUN = True
EXAMPLE_ROWS = 100
if EXAMPLE_RUN:
    # Take an explicit copy: get_manual_features() adds columns to the frame
    # it receives, and doing that on a bare slice of the loaded frame makes
    # pandas emit SettingWithCopyWarning (silenced by the global warnings
    # filter, so the problem would otherwise go unnoticed).
    df = df.iloc[:EXAMPLE_ROWS].copy()

In [3]:
#basic data pre-processing functions before feature engineering
def remove_xDxAs(text):
    """Return ``text`` with every '&#xD;&#xA;' sequence replaced by one space."""
    # Splitting on the marker and re-joining with a space is equivalent to a
    # left-to-right, non-overlapping replace.
    pieces = text.split('&#xD;&#xA;')
    return ' '.join(pieces)

def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as a side effect of the split.
    """
    # str.split() without a separator splits on any whitespace run and never
    # yields empty tokens.
    tokens = text.split()
    return " ".join(tokens)

def data_preprocess(text):
    """Clean one raw text before feature extraction.

    The '&#xD;&#xA;' markers are turned into spaces first, then whitespace
    runs are collapsed, so a marker never leaves a double space behind.
    """
    return remove_whitespace(remove_xDxAs(text))

## Feature Engineering

In [4]:
# Part-of-speech tags grouped by word family. check_pos_tag() counts a token
# towards a family when its tag appears in that family's list.
pos_family = dict(
    noun=['NN', 'NNS', 'NNP', 'NNPS'],
    pron=['PRP', 'PRP$', 'WP', 'WP$'],
    verb=['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    adj=['JJ', 'JJR', 'JJS'],
    adv=['RB', 'RBR', 'RBS', 'WRB'],
)

def check_pos_tag(x, flag):
    """Count the tokens of ``x`` whose POS tag belongs to the ``flag`` family.

    Parameters
    ----------
    x : str
        Text to tag with TextBlob.
    flag : str
        Key of ``pos_family`` selecting which tag list to count against.

    Returns
    -------
    int
        Number of tokens whose tag is in ``pos_family[flag]``; 0 when the
        text could not be tagged.

    Raises
    ------
    KeyError
        If ``flag`` is not a key of ``pos_family``.
    """
    # Resolve the family outside the try block so a misspelled flag fails
    # loudly instead of silently producing a column of zeros.
    family = pos_family[flag]
    try:
        tags = textblob.TextBlob(x).tags
    except Exception:
        # Best effort: a text that cannot be tagged contributes a count of 0.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupts working during long runs.
        return 0
    # Each tag entry is a (word, tag) pair; only the tag is relevant here.
    return sum(1 for _, tag in tags if tag in family)

In [5]:
def if_url(text):
    """Return True when ``text`` contains an 'http://' or 'https://' prefix.

    Only these two scheme prefixes are recognised; bare domains such as
    'www.example.com' are not treated as URLs.
    """
    # 'https://' does not contain 'http://' as a substring, so both schemes
    # have to be tested explicitly.
    return 'http://' in text or 'https://' in text

def count_greek_char(text):
    """Count characters of ``text`` whose code point lies in [945, 970).

    That range runs from U+03B1 to U+03C9, i.e. the lowercase Greek letters;
    code points outside it (including uppercase Greek) are not counted.
    """
    return sum(1 for ch in text if 945 <= ord(ch) < 970)

def count_japanese(text):
    """Count Japanese kana characters in ``text``.

    Characters in the Hiragana (U+3040-U+309F) and Katakana (U+30A0-U+30FF)
    ranges are counted. The Hangul syllable range (U+AC00-U+D7A3) is
    deliberately not part of the pattern: those are Korean characters and
    would inflate the count for Korean text.
    """
    kana = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]')
    # The pattern matches exactly one character, so the number of matches is
    # the number of kana characters.
    return len(kana.findall(text))

def count_chinese(text):
    """Count characters of ``text`` in the range U+4E00-U+9FA5."""
    han = re.compile(u'[\u4e00-\u9fa5]+')
    # The pattern is applied to one character at a time, so each matching
    # character is counted individually rather than once per run.
    return sum(1 for ch in text if han.search(ch))


def isEnglish(text):
    """Return True when ``text`` consists only of English letters and digits.

    ASCII punctuation and whitespace are ignored; every remaining character
    must be an ASCII letter or digit. A text with nothing left after that
    filtering (empty, or punctuation/whitespace only) yields False.
    """
    # str.translate() expects a translation table, not a string of characters
    # to delete, so the characters to ignore are filtered out explicitly.
    ignored = set(string.punctuation + string.whitespace)
    allowed = set(string.ascii_letters + string.digits)
    kept = [ch for ch in text if ch not in ignored]
    return bool(kept) and all(ch in allowed for ch in kept)


def compute_ner(text):
    """Count the named entities found in ``text``, grouped by entity type.

    The text is split into sentences, tokenised, POS-tagged and chunked with
    NLTK; every chunk that carries a label counts as one entity of that type.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    dict
        Maps entity type to its number of occurrences. The types listed in
        ``types`` are always present (0 when absent from the text); any other
        label produced by the chunker is added as an extra key.
    """
    types = ['PERSON', 'ORGANIZATION', 'LOCATION', 'DATE',
             'TIME', 'MONEY', 'PERCENT', 'FACILITY', 'GPE']

    counts = dict.fromkeys(types, 0)

    # Labels are tallied directly. The previous DataFrame round trip through
    # value_counts().reset_index().set_index('index') depended on a column
    # name that pandas >= 2.0 no longer produces.
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged):
            # Plain tokens are (word, tag) tuples; only entity chunks expose
            # a label() method.
            if hasattr(node, 'label'):
                label = node.label()
                counts[label] = counts.get(label, 0) + 1
    return counts

In [6]:
#the main function for creating manual statistical features
def get_manual_features(df):
    """Add the manual statistical text features to ``df`` and return it.

    ``df`` must have a string column ``Text``. The frame is modified in
    place: ``Text`` is overwritten with its pre-processed version, a ``Split``
    column holding the whitespace-separated tokens is added, and one column
    is added per feature (counts and densities of characters, punctuation,
    title-case/upper-case words, POS families, digits, alphanumeric and
    alphabetic words, URL presence, and PERSON/ORGANIZATION/GPE entities).

    Rows whose text has no words get 0 for every word-based feature instead
    of raising or producing NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with a ``Text`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the feature columns added.
    """
    df['Text'] = df.Text.apply(data_preprocess)
    df['Split'] = df.Text.apply(lambda x: x.split())

    df['char_count'] = df.Text.apply(len)
    df['word_count'] = df.Split.apply(len)

    # Two denominators are kept on purpose so existing feature values do not
    # change: the first group of densities is smoothed with +1, the second
    # group divides by the plain word count. For the latter the count is
    # clipped to 1 so a text without words yields 0 rather than 0/0 = NaN.
    smoothed_words = df['word_count'] + 1
    nonzero_words = df['word_count'].clip(lower=1)

    df['word_density'] = df['char_count'] / smoothed_words

    df['punc_count'] = df.Text.apply(
        lambda x: sum(ch in string.punctuation for ch in x))
    df['punc_density'] = df['punc_count'] / smoothed_words

    df['title_count'] = df.Split.apply(
        lambda x: sum(wrd.istitle() for wrd in x))
    df['title_density'] = df['title_count'] / smoothed_words

    df['upper_count'] = df.Split.apply(
        lambda x: sum(wrd.isupper() for wrd in x))
    df['upper_density'] = df['upper_count'] / smoothed_words

    # POS-family counts; the order fixes the order of the output columns.
    # Each family still triggers its own tagging pass through check_pos_tag.
    for family in ('noun', 'verb', 'adj', 'adv', 'pron'):
        df[family + '_count'] = df.Text.apply(
            lambda x, family=family: check_pos_tag(x, family))
        df[family + '_density'] = df[family + '_count'] / smoothed_words

    # np.max raises on an empty list and np.mean returns NaN, so texts
    # without any word are mapped to 0 explicitly.
    df['avg_word_len'] = df.Split.apply(
        lambda x: np.mean([len(item) for item in x]) if x else 0.0)
    df['max_word_len'] = df.Split.apply(
        lambda x: max((len(item) for item in x), default=0))

    df['num_word_count'] = df.Split.apply(
        lambda x: sum(item.isdigit() for item in x))
    df['num_char_count'] = df.Text.apply(
        lambda x: sum(ch.isdigit() for ch in x))
    df['num_word_density'] = df['num_word_count'] / nonzero_words
    # Digit characters per word (not per character), as in the original
    # feature definition.
    df['num_char_density'] = df['num_char_count'] / nonzero_words

    df['alnum_count'] = df.Split.apply(
        lambda x: sum(item.isalnum() for item in x))
    df['alnum_density'] = df['alnum_count'] / nonzero_words

    df['alpha_count'] = df.Split.apply(
        lambda x: sum(item.isalpha() for item in x))
    df['alpha_density'] = df['alpha_count'] / nonzero_words

    df['has_url'] = df.Text.apply(if_url)

    # Run the NER pipeline once per text and reuse the resulting dict for all
    # three entity features instead of re-tagging the text for each of them.
    ner_counts = df.Text.apply(compute_ner)
    for prefix, label in (('ps', 'PERSON'),
                          ('org', 'ORGANIZATION'),
                          ('gpe', 'GPE')):
        df[prefix + '_count'] = ner_counts.apply(
            lambda counts, label=label: counts[label])
        df[prefix + '_density'] = df[prefix + '_count'] / nonzero_words

    return df

In [7]:
# Build the feature table for the currently loaded file and write it to disk.
df_feat = get_manual_features(df)

# Output name matching the input file selected in the loading cell.
output_csv = 'df_train_all_feat.csv'
#output_csv = 'df_test_feat.csv'
#output_csv = 'df_outp95_feat.csv'
df_feat.to_csv(output_csv, index=False)

print(df_feat.columns)
df_feat.head()

Index(['Outcome', 'Text', 'Id', 'Split', 'char_count', 'word_count',
       'word_density', 'punc_count', 'punc_density', 'title_count',
       'title_density', 'upper_count', 'upper_density', 'noun_count',
       'noun_density', 'verb_count', 'verb_density', 'adj_count',
       'adj_density', 'adv_count', 'adv_density', 'pron_count', 'pron_density',
       'avg_word_len', 'max_word_len', 'num_word_count', 'num_char_count',
       'num_word_density', 'num_char_density', 'alnum_count', 'alnum_density',
       'alpha_count', 'alpha_density', 'has_url', 'ps_count', 'ps_density',
       'org_count', 'org_density', 'gpe_count', 'gpe_density'],
      dtype='object')


Unnamed: 0,Outcome,Text,Id,Split,char_count,word_count,word_density,punc_count,punc_density,title_count,...,alnum_density,alpha_count,alpha_density,has_url,ps_count,ps_density,org_count,org_density,gpe_count,gpe_density
0,14,I am having a problem with the first example o...,1,"[I, am, having, a, problem, with, the, first, ...",328,49,6.56,41,0.82,10,...,0.714286,35,0.714286,False,3,0.061224,1,0.020408,2,0.040816
1,14,"everyone, I met a tough definite integral as f...",2,"[everyone,, I, met, a, tough, definite, integr...",240,37,6.315789,63,1.657895,2,...,0.324324,12,0.324324,False,0,0.0,0,0.0,0,0.0
2,7,"Please dont lynch me, but i've never sat throu...",3,"[Please, dont, lynch, me,, but, i've, never, s...",244,48,4.979592,11,0.22449,4,...,0.8125,36,0.75,False,0,0.0,1,0.020833,1,0.020833
3,14,How to calculate $ \mathbb{Z}[x] /\langle2x-1\...,4,"[How, to, calculate, $, \mathbb{Z}[x], /\langl...",55,7,6.875,12,1.5,1,...,0.428571,3,0.428571,False,0,0.0,0,0.0,0,0.0
4,2,When somebody rings or texts my iPhone it is n...,5,"[When, somebody, rings, or, texts, my, iPhone,...",170,36,4.594595,1,0.027027,1,...,0.972222,35,0.972222,False,0,0.0,2,0.055556,0,0.0
