In [None]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U "tensorflow-text==2.8.*"

In [None]:
!pip install -q tf-models-official==2.7.0

In [76]:
# %load main.py
import os
import re
import shutil

import numpy  as np
import pandas as pd
import joblib
import nltk
import ekphrasis
from collections import Counter

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

### Step1: Data Pre-processing

In [2]:
# read data
Path =os.path.dirname(os.getcwd())
data_pathA=os.path.join(Path,'Datasets/A/twitter-2016train-A.txt')

In [81]:
# transform data into df form
dataA = pd.read_table(data_pathA,sep='\t',header=0)
dataA.columns = ['ID','Sentiment','Text']
def add_label(sentiment):
    if sentiment == 'negative':
        return -1
    elif sentiment == 'neutral':
        return 0
    elif sentiment == 'positive':
        return 1

dataA['label'] = dataA.Sentiment.apply(add_label)
dataA

Unnamed: 0,ID,Sentiment,Text,label
0,628976607420645377,negative,@Microsoft how about you make a system that do...,-1
1,629023169169518592,negative,I may be ignorant on this issue but... should ...,-1
2,629179223232479232,negative,"Thanks to @microsoft, I just may be switching ...",-1
3,629186282179153920,neutral,If I make a game as a #windows10 Universal App...,0
4,629226490152914944,positive,"Microsoft, I may not prefer your gaming branch...",1
...,...,...,...,...
5862,639855845958885376,positive,@Racalto_SK ok good to know. Punting at MetLif...,1
5863,639979760735662080,neutral,everyone who sat around me at metlife was so a...,0
5864,640196838260363269,neutral,what giants or niners fans would wanna go to t...,0
5865,640975710354567168,positive,Anybody want a ticket for tomorrow Colombia vs...,1


In [82]:
# sentiment distribution of data
dataA.loc[:,'label'].value_counts()

 1    3017
 0    2001
-1     849
Name: label, dtype: int64

1Case conversion
包含“India”和“india”的语料库如果不应用小写化，机器会把它们识别为两个独立的术语，而实际上它们都是同一个单词的不同形式，并且对应于同一个国家。小写化后，仅存在一种“India”实例，即“india”，简化了在语料库中找到所有提到印度时的任务。

In [22]:
#import ekphrasis library
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


# 重点

In [23]:
Text=[sentence for Text in dataA.Text for sentence in text_processor.pre_process_doc(Text) if (word!='s' and word!='\'')]
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
words = [word for word in Text if (word not in stop)]

In [79]:
from tqdm import tqdm
from collections import Counter
#Define the tokenzation function
def tokenize_text(text, option):
    '''
    Tokenize the input text as per specified option
        1: Use python split() function
        2: Use regex to extract alphabets plus 's and 't
        3: Use ekphrasis text_processor.pre_process_doc
        4: Use NLTK word_tokenize(), remove stop words and apply lemmatization
    '''
    if option == 1:
        return text.split()
    elif option == 2:
        return re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', text)
    elif option == 3:
        return [word for word in text_processor.pre_process_doc(text) if (word!='s' and word!='\'')]
    elif option == 4:
        words = [word for word in word_tokenize(text) if (word.isalpha()==1)]
        # Remove stop words
        stop = set(stopwords.words('english'))
        words = [word for word in words if (word not in stop)]
        # Lemmatize words (first noun, then verb)
        wnl = nltk.stem.WordNetLemmatizer()
        lemmatized = [wnl.lemmatize(wnl.lemmatize(word, 'n'), 'v') for word in words]
        return lemmatized
    else:
        print("Please specify option value between 1 and 4")
        return []
# Create vocabulary to int dictionary
def create_vocab(messages, show_graph=False):
    corpus = []
    for message in tqdm(messages, desc="Tokenizaing"):
        tokens = tokenize_text(message, 3) # Use option 3
        corpus.extend(tokens)
    print("The number of all words: {}".format(len(corpus)))

    # Create Counter
    counts = Counter(corpus)
    print("The number of unique words: {}".format(len(counts)))

    # Create BoW
    bow = sorted(counts, key=counts.get, reverse=True)
    print("Top 40 frequent words: {}".format(bow[:40]))

    # Indexing vocabrary, starting from 1.
    vocab = {word: ii for ii, word in enumerate(counts, 1)}
    id2vocab = {v: k for k, v in vocab.items()}

    if show_graph:
        from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
        import seaborn as sns
        # Generate Word Cloud image
        text = " ".join(corpus)
        stopwords = set(STOPWORDS)
        stopwords.update(["will", "report", "reporting", "market", "stock", "share"])

        wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=100, background_color="white", collocations=False).generate(text)
        plt.figure(figsize=(15,7))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

        # Show most frequent words in a bar graph
        most = counts.most_common()[:80]
        x, y = [], []
        for word, count in most:
            if word not in stopwords:
                x.append(word)
                y.append(count)
        plt.figure(figsize=(12,10))
        sns.barplot(x=y, y=x)
        plt.show()

    return vocab,id2vocab,corpus,counts,bow

messages = list(dataA.Text)
vocab,id2vocab,corpus,counts,bow= create_vocab(messages)

Tokenizaing: 100%|███████████████████████████████████████████████████████████████| 5867/5867 [00:01<00:00, 3750.26it/s]

The number of all words: 140263
The number of unique words: 11186
Top 40 frequent words: ['.', 'the', ',', 'to', 'i', '<user>', '<url>', 'a', 'on', 'and', 'in', '<number>', '<hashtag>', '</hashtag>', 'of', '<repeated>', 'is', '!', 'for', 'it', 'you', 'may', '-', 'not', 'with', 'be', ':', 'tomorrow', 'at', '?', 'have', 'my', 'will', 'that', '"', 'but', 'th', 'day', 'this', '1']





In [71]:
# diff=set(Text)-set(words) 

In [84]:
print("All words: {}".format(len(words)))
# Create Counter
counts = Counter(words)
print("Unique words: {}".format(len(counts)))
# Create BoW
bow = sorted(counts, key=counts.get, reverse=True)
print("Top 30 frequent words: {}".format(bow[:30]))

All words: 95426
Unique words: 11050
Top 30 frequent words: ['.', ',', '<user>', '<url>', "'", '<number>', '<hashtag>', '</hashtag>', '<repeated>', '!', 'may', '-', ':', 'tomorrow', '?', '"', 'th', 'day', '1', '<date>', 'going', 'st', 'apple', '&', '2', 'see', 'like', 'friday', 'amazon', 'time']
