In [None]:
import torch
from torch import nn
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import json
from collections import Counter


In [None]:
'''step 1: read the raw data'''
def _read_data():
    '''Load the raw training and test sets from the `data/` folder.

    Returns:
        (train_data, test_data): two DataFrames read from
        `data/training_raw_data.csv` and `data/test_raw_data.csv`.
    '''
    ## both files are read with the same options
    csv_options = dict(index_col=None, encoding='utf8')

    train_data = pd.read_csv('data/training_raw_data.csv', **csv_options)
    test_data = pd.read_csv('data/test_raw_data.csv', **csv_options)

    return train_data, test_data

In [None]:
'''step 2: data preprocessing'''
def _data_preprocessing(input_data):
    input_data['clean_text'] = input_data['Content'].map(lambda x: re.sub(r'[^\w\s]',' ',x))
    input_data['clean_text'] = input_data['clean_text'].apply(lambda x: x.lower())
    # input_data = input_data.sample(frac = 1)  # shuffle the data samples

    return input_data

In [None]:
'''step 3: stats of length of sentences'''
def _stats_seq_len(input_data):
    input_data['seq_words'] = input_data['clean_text'].apply(lambda x: x.split())
    input_data['seq_len'] = input_data['seq_words'].apply(lambda x: len(x))
    # input_data['seq_len'].hist()
    # plt.show()
    print(input_data['seq_len'].describe())

    ## remove short and long tokens
    min_seq_len = 100
    max_seq_len = 600
    input_data = input_data[min_seq_len <= input_data['seq_len']]
    input_data = input_data[input_data['seq_len'] <= max_seq_len]

    return input_data


In [None]:
'''step 4: convert 'positive and negative' to labels 1 and 0'''
def _convert_labels(input_data):
    input_data['Label'] = input_data['Label'].apply(lambda x: 0 if x=='neg' else 1)

    return input_data


In [None]:

'''step 5: Tokenize: create Vocab to Index mapping dictionary'''
def _map_tokens2index(input_data,top_K = 500):
    words = input_data['seq_words'].tolist()
    tokens_list = []
    for l in words:
        tokens_list.extend(l)

    ## count the frequency of words
    count_tokens = Counter(tokens_list)

    ## dictionary = {words: count}
    sorted_tokens = count_tokens.most_common(len(tokens_list))

    ## choose the top K tokens or all of them
    tokens_top = sorted_tokens[:top_K]

    ## tokens to index staring from 2, index=0:<padding>, index=1:<unknown>
    tokens2index = {w:i+2 for i, (w,c) in enumerate(tokens_top)}

    ## index for padding (0) and unknown (1)
    tokens2index['<pad'] = 0
    tokens2index['<unk>'] = 1

    with open('data/tokens2index.json', 'w') as outfile:
        json.dump(tokens2index, outfile,indent=4)

    return tokens2index

In [None]:
'''step 6, Tokenize: Encode the words in sentences to index'''
def _encode_word2index(x,tokens2index):
    ## unknown words: index=1
    input_tokens = [tokens2index.get(w,1) for w in x]

    return input_tokens

In [None]:
'''step 7: padding or truncating sequence data'''
def _pad_truncate_seq(x,seq_len):
    if len(x) >= seq_len:
        return x[:seq_len]
    else:
        return x+[0]*(seq_len-len(x))

In [None]:

def main(top_K = 10000, batch_seq_len = 150):
    '''Run the whole preprocessing pipeline (steps 1-7) and save the results.

    Args:
        top_K: size of the vocabulary built from the training data
            (most frequent words), not counting the two special tokens.
        batch_seq_len: every encoded sentence is padded or truncated to
            this many indices.

    Side effects:
        reads the raw CSV files through `_read_data`, writes the vocabulary
        through `_map_tokens2index`, and writes `data/training_data.csv`
        and `data/test_data.csv`.
    '''
    ## Step 1: read raw data from files
    train_data, test_data = _read_data()

    ## Step 2: cleaning data and lower case
    train_data = _data_preprocessing(train_data)
    test_data = _data_preprocessing(test_data)

    ## Step 3: stats of length and remove short and long sentences
    ## ***** It is Done (do not modify it) *****
    print("***please do not modify step 3 as it is Done!***")
    ## step 3 returns a filtered selection of the frame; take an explicit copy
    ## so the column assignments in steps 4-7 work on an independent frame
    ## and do not trigger pandas' SettingWithCopyWarning
    train_data = _stats_seq_len(train_data).copy()
    test_data = _stats_seq_len(test_data).copy()

    ## Step 4: convert string labels to numerical labels (1, 0)
    train_data = _convert_labels(train_data)
    test_data = _convert_labels(test_data)
    # print("test", test_data['Label'][:20])

    ## step 5: Tokenize: create Vocab to Index mapping dictionary
    ## (built from the training data only)
    tokens2index = _map_tokens2index(train_data,top_K)
    print("num of tokens", len(tokens2index))


    ## step 6: Encode the words in sentences to index
    train_data['input_x'] = train_data['seq_words'].apply(lambda x: _encode_word2index(x,tokens2index))
    test_data['input_x'] = test_data['seq_words'].apply(lambda x: _encode_word2index(x,tokens2index))
    print(test_data['input_x'][:10])


    ## step 7: padding or truncating sequence data, save results
    ## step 7-1: train data padding
    train_data['input_x'] = train_data['input_x'].apply(lambda x: _pad_truncate_seq(x,batch_seq_len))
    train_data.to_csv('data/training_data.csv', index=False)

    ## step 7-2: test data padding
    test_data['input_x'] = test_data['input_x'].apply(lambda x: _pad_truncate_seq(x,batch_seq_len))
    test_data.to_csv('data/test_data.csv', index=False)
    print("----finish data preprocessing now----")

In [None]:
## entry point: run the preprocessing pipeline when this code is executed
## directly (not when it is imported as a module)
if __name__ == '__main__':
    main()

***please do not modify step 3 as it is Done!***
count    20890.000000
mean       229.011345
std        114.382039
min        100.000000
25%        142.000000
50%        188.000000
75%        284.000000
max        600.000000
Name: seq_len, dtype: float64
count    10000.000000
mean       227.509000
std        113.163138
min        100.000000
25%        142.000000
50%        188.000000
75%        281.000000
max        600.000000
Name: seq_len, dtype: float64
num of tokens 10002
0    [17, 75, 17, 10, 1757, 2, 16, 13, 12, 197, 112...
1    [2, 7574, 2670, 15, 36, 351, 16, 49, 170, 12, ...
2    [2, 680, 3356, 2689, 6118, 3140, 707, 6895, 10...
3    [2, 84, 21, 7, 628, 51, 6, 69, 10, 481, 9, 18,...
4    [12, 113, 5771, 2, 6896, 6, 67, 2158, 239, 170...
5    [1237, 4148, 7, 4, 744, 404, 5, 904, 1324, 9, ...
6    [294, 724, 404, 5, 495, 12, 21, 7, 42, 78, 9, ...
7    [10, 114, 12, 21, 9, 7, 306, 3, 38, 157, 1016,...
8    [2, 77, 33, 3062, 2685, 6, 80, 71, 1, 2269, 38...
9    [1, 1, 750, 94, 43,

294