In [58]:
import re
import emoji
from wordsegment import load,segment

load()

def read_corpus(corpus_file):
    '''Read a tab-separated corpus file and return its texts and labels.

    Each non-blank line is expected to look like ``<text>\\t<label>``; any
    further tab-separated columns are ignored.

    Parameters:
        corpus_file: path of the UTF-8 encoded file to read.

    Returns:
        A ``(documents, labels)`` tuple of two equally long lists of strings.
        Documents are stripped of surrounding whitespace.

    Raises:
        IndexError: if a non-blank line contains no tab-separated label.
    '''
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a stray empty line at the end of the
                # file) carry no sample; skip them instead of crashing.
                continue
            fields = record.split("\t")
            documents.append(fields[0].strip())
            # binary problem: NOT, OFF
            labels.append(fields[1])
    return documents, labels

def preprocess_input(text_in):
    '''Normalise one raw document and return the cleaned, lower-cased text.

    Steps, in order:
      1. pad every "/" with spaces so the words around it are separated;
      2. pass the text through ``emoji.demojize`` with a space as both
         delimiters, so emoji are rendered as text set off by spaces;
      3. turn underscores into spaces;
      4. replace each hashtag (``#`` followed by word characters) with the
         space-joined result of ``wordsegment.segment``;
      5. lower-case the result.

    Parameters:
        text_in: the raw document string.

    Returns:
        The preprocessed string.
    '''
    text = text_in.replace("/", " / ")  # split slashes
    text = emoji.demojize(text, delimiters=(" ", " "))  # change emoji
    text = text.replace("_", " ")  # split underscores
    # Substitute every hashtag at its own match position. Collecting the
    # hashtags first and calling str.replace for each one would let a short
    # hashtag (e.g. "#foo") clobber the start of a longer one ("#foobar"),
    # leaving the longer hashtag unsegmented.
    text = re.sub(
        r"#\w+",
        lambda match: " ".join(segment(match.group(0))),
        text,
    )
    return text.lower()

def write_to_txt(documents,labels,file):
    '''Write documents and their labels to a tab-separated text file.

    One line is written per document, in the form ``<document>\\t<label>``.
    An existing file at the given path is overwritten.

    Parameters:
        documents: sequence of document strings.
        labels: sequence of label strings; ``labels[i]`` belongs to
            ``documents[i]``.
        file: path of the output file.

    Raises:
        IndexError: if ``labels`` is shorter than ``documents``.
    '''
    # Write UTF-8 explicitly: the corpus is read as UTF-8, and relying on the
    # platform default encoding can fail on (or mangle) non-ASCII text.
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(file, "w", encoding="utf-8") as f:
        for i, document in enumerate(documents):
            f.write(document + "\t" + labels[i] + "\n")

In [59]:
# Raw dataset splits and, position by position, the files that receive their
# preprocessed versions.
input_files=["datasets/train.tsv","datasets/val.tsv","datasets/test.tsv"]
output_files=["datasets/train_preprocessed.txt","datasets/val_preprocessed.txt","datasets/test_preprocessed.txt"]

# Run the read -> preprocess -> write pipeline once per split.
for j, (source_path, target_path) in enumerate(zip(input_files, output_files)):
    # load the texts and labels of this split
    docs, labs = read_corpus(source_path)
    # clean every document
    docs = list(map(preprocess_input, docs))
    # store the cleaned documents next to their labels
    write_to_txt(docs, labs, target_path)