# Read_and_explore_data

In [None]:
# Standard library
import os
import re
import string
import sys

# Third-party
import numpy as np
import pandas as pd
import sklearn

# Libraries and packages for text (pre-)processing
import nltk

# Report the interpreter and library versions so the run can be reproduced.
print("Python version:", sys.version)
print("Version info.:", sys.version_info)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("sklearn version:", sklearn.__version__)
print("re version:", re.__version__)
print("nltk version:", nltk.__version__)

# List every file found below the input directory.
for dirname, _, filenames in os.walk('/Data/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

<a id="Read_the_Data"></a>
## Read the Data

In [None]:
%time

# read the csv file
train_df = pd.read_csv("/Data/train.csv")
display(train_df.shape, train_df.head())

In [None]:
# Rows whose "location" value is present
location_present = train_df["location"].notnull()
display(train_df[location_present].head())

# Second "text" value among the rows where target == 0
target_zero_texts = train_df.loc[train_df["target"] == 0, "text"].values
display(target_zero_texts[1])

# Text_Cleaning

In [None]:
# Lower-case every tweet into a new working column
train_df["text_clean"] = [tweet.lower() for tweet in train_df["text"]]
display(train_df.head())

In [None]:
def remove_URL(text):
    """
        Strip links from a string.

        Anything starting with http://, https:// or www. is removed up to
        the next whitespace character; the rest of the string is returned
        unchanged.
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub("", text)

In [None]:
# remove urls from the text
train_df["text_clean"] = train_df["text_clean"].apply(remove_URL)

# double check: original vs. cleaned text for a few rows
for row in (31, 37, 62):
    print(train_df["text"][row])
    print(train_df["text_clean"][row])

In [None]:
def remove_html(text):
    """
        Strip HTML markup from a string.

        Removes <...> tags (shortest match, single line) and character
        entities of the form &name; , &#123; or &#x1f; .
    """
    tag_or_entity = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
    return re.compile(tag_or_entity).sub("", text)

In [None]:
# remove html from the text
train_df["text_clean"] = train_df["text_clean"].apply(remove_html)

# double check: original vs. cleaned text for a few rows
for row in (62, 7385):
    print(train_df["text"][row])
    print(train_df["text_clean"][row])

In [None]:
def remove_non_ascii(text):
    """
        Keep only the characters whose code point is in the ASCII range
        (0x00-0x7f); every other character is dropped.
    """
    return "".join(char for char in text if ord(char) <= 0x7f)

In [None]:
# remove non-ascii characters from the text
train_df["text_clean"] = train_df["text_clean"].apply(remove_non_ascii)

# double check: original vs. cleaned text for a few rows
for row in (38, 7586):
    print(train_df["text"][row])
    print(train_df["text_clean"][row])

In [None]:
def remove_special_characters(text):
    """
        Remove special characters, including symbols, emojis, and other graphic characters.

        Every run of characters inside one of the code-point ranges below is deleted.
    """
    code_point_ranges = (
        ('\U0001F600', '\U0001F64F'),  # emoticons
        ('\U0001F300', '\U0001F5FF'),  # symbols & pictographs
        ('\U0001F680', '\U0001F6FF'),  # transport & map symbols
        ('\U0001F1E0', '\U0001F1FF'),  # flags (iOS)
        ('\U00002702', '\U000027B0'),
        ('\U000024C2', '\U0001F251'),  # wide span: everything from U+24C2 through U+1F251
    )
    char_class = ''.join(low + '-' + high for low, high in code_point_ranges)
    return re.sub('[' + char_class + ']+', r'', text, flags=re.UNICODE)

In [None]:
%time
# remove non-ascii characters from the text
train_df_jtcc["text_clean"] = train_df_jtcc["comment_text"].apply(lambda x: remove_special_characters(x))
display(train_df_jtcc.head())

# double check
print(train_df_jtcc["comment_text"][143])
print(train_df_jtcc["text_clean"][143])
print(train_df_jtcc["comment_text"][189])
print(train_df_jtcc["text_clean"][189])

In [None]:
def remove_punct(text):
    """
        Delete every character listed in string.punctuation from the text
    """
    # A translation table that maps each punctuation character to None deletes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

In [None]:
# remove punctuations from the text
train_df["text_clean"] = train_df["text_clean"].apply(remove_punct)

# double check: original vs. cleaned text for a few rows
for row in (5, 7597):
    print(train_df["text"][row])
    print(train_df["text_clean"][row])

# Text Preprocessing:

## Tokenization


In [None]:
# Tokenizing the tweet base texts.
from nltk.tokenize import word_tokenize

train_df['tokenized'] = train_df['text_clean'].apply(word_tokenize)
train_df.head()

In [None]:
# Removing stopwords.
nltk.download("stopwords")
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
train_df['stopwords_removed'] = train_df['tokenized'].apply(lambda x: [word for word in x if word not in stop])
train_df.head()

In [None]:
from nltk.stem import PorterStemmer

def porter_stemmer(text):
    """
        Return the PorterStemmer stem of every token in the given list
    """
    stem = PorterStemmer().stem
    return list(map(stem, text))

In [None]:
%time

train_df['porter_stemmer'] = train_df['stopwords_removed'].apply(lambda x: porter_stemmer(x))
train_df.head()

In [None]:
from nltk.stem import SnowballStemmer

def snowball_stemmer(text):
    """
        Return the English SnowballStemmer stem of every token in the given list
    """
    stem = SnowballStemmer("english").stem
    return list(map(stem, text))

In [None]:
%time

train_df['snowball_stemmer'] = train_df['stopwords_removed'].apply(lambda x: snowball_stemmer(x))
train_df.head()

In [None]:
from nltk.stem import LancasterStemmer

def lancaster_stemmer(text):
    """
        Return the LancasterStemmer stem of every token in the given list
    """
    stem = LancasterStemmer().stem
    return list(map(stem, text))

In [None]:
%time

train_df['lancaster_stemmer'] = train_df['stopwords_removed'].apply(lambda x: lancaster_stemmer(x))
train_df.head()

## Part of Speech Tagging (POS Tagging):


In [None]:
from nltk.corpus import wordnet
from nltk.corpus import brown

# First letter of a tagger tag -> WordNet part-of-speech constant
wordnet_map = dict(
    N=wordnet.NOUN,
    V=wordnet.VERB,
    J=wordnet.ADJ,
    R=wordnet.ADV,
)

# Taggers trained on the Brown "news" sentences, chained through backoff:
# bigram (t2) -> unigram (t1) -> default tag 'NN' (t0)
train_sents = brown.tagged_sents(categories='news')
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

def pos_tag_wordnet(text, pos_tag_type="pos_tag"):
    """
        Tag a list of tokens with the t2 tagger and convert each tag to its WordNet form.

        The first letter of every tag is looked up in wordnet_map; tags whose
        first letter is not in the map fall back to wordnet.NOUN.
        Returns a list of (word, wordnet_pos) tuples.
        pos_tag_type is accepted but not used.
    """
    tagged_tokens = t2.tag(text)
    return [
        (token, wordnet_map.get(tag[0], wordnet.NOUN))
        for token, tag in tagged_tokens
    ]

In [None]:
# Try the tagger on the token list stored under index label 2
sample_tokens = train_df['stopwords_removed'][2]
pos_tag_wordnet(sample_tokens)

In [None]:
%time

train_df['combined_postag_wnet'] = train_df['stopwords_removed'].apply(lambda x: pos_tag_wordnet(x))

train_df.head()

## Lemmatization:


In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatize_word(text):
    """
        Lemmatize a list of (word, wordnet_pos) pairs and return the list of lemmas
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token, wn_pos in text:
        lemmas.append(wnl.lemmatize(token, wn_pos))
    return lemmas

In [None]:
%time

# Test with POS Tagging
lemmatizer = WordNetLemmatizer()

train_df['lemmatize_word_w_pos'] = train_df['combined_postag_wnet'].apply(lambda x: lemmatize_word(x))
train_df['lemmatize_word_w_pos'] = train_df['lemmatize_word_w_pos'].apply(lambda x: [word for word in x if word not in stop]) # double check to remove stop words
train_df['lemmatize_text'] = [' '.join(map(str, l)) for l in train_df['lemmatize_word_w_pos']] # join back to text

train_df.head()

# Text Features Extraction:


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def cv(data, ngram = 1, MAX_NB_WORDS = 75000):
    """
        Build a bag-of-words (count) matrix for a corpus.

        data: iterable of documents (strings).
        ngram: n-gram size; only n-grams of exactly this length are counted.
        MAX_NB_WORDS: upper bound on the vocabulary size (max_features).

        Returns (emb, count_vectorizer): the dense count matrix and the
        fitted CountVectorizer.
    """
    count_vectorizer = CountVectorizer(ngram_range = (ngram, ngram), max_features = MAX_NB_WORDS)
    emb = count_vectorizer.fit_transform(data).toarray()
    # emb is already a dense ndarray; read its shape directly instead of
    # copying the whole matrix through np.array() just to count columns.
    print("count vectorize with", str(emb.shape[1]), "features")
    return emb, count_vectorizer

In [None]:
def print_out(emb, feat, ngram, compared_sentence=0):
    """
        Print a summary of a fitted vectorizer and its embedding matrix.

        emb: 2-D array, one row per document and one column per feature.
        feat: the fitted vectorizer that produced emb.
        ngram: label used in the printed headings (e.g. "Uni-gram").
        compared_sentence: index of the row of train_df["lemmatize_text"] /
            emb that is printed as the worked example.
    """
    # get_feature_names() is gone from newer scikit-learn releases; prefer
    # get_feature_names_out() when the vectorizer offers it.
    if hasattr(feat, "get_feature_names_out"):
        raw_names = feat.get_feature_names_out()
    else:
        raw_names = feat.get_feature_names()
    # Plain str list so the printout looks the same for either API.
    feature_names = [str(name) for name in raw_names]

    print(ngram,"bag-of-words: ")
    print(feature_names, "\n")
    print(ngram,"bag-of-feature: ")
    # Use the vectorizer that was passed in, not the global uni-gram one.
    print(feat.vocabulary_, "\n")
    print("BoW matrix:")
    print(pd.DataFrame(emb.transpose(), index = feature_names).head(), "\n")
    print(ngram,"vector example:")
    print(train_df["lemmatize_text"][compared_sentence])
    print(emb[compared_sentence], "\n")

In [None]:
# Small demo corpus: the first five lemmatized tweets
test_corpus = list(train_df["lemmatize_text"].iloc[:5])
print("The test corpus: ", test_corpus, "\n")

# Uni-gram bag-of-words on the demo corpus
test_cv_em_1gram, test_cv_1gram = cv(test_corpus, ngram=1)
print_out(emb=test_cv_em_1gram, feat=test_cv_1gram, ngram="Uni-gram")

In [None]:
# Bi-gram bag-of-words on the demo corpus
test_cv_em_2gram, test_cv_2gram = cv(test_corpus, ngram=2)
print_out(emb=test_cv_em_2gram, feat=test_cv_2gram, ngram="Bi-gram")

In [None]:
# Tri-gram bag-of-words on the demo corpus
test_cv_em_3gram, test_cv_3gram = cv(test_corpus, ngram=3)
# Print the tri-gram results just computed (the bi-gram ones were passed here before).
print_out(test_cv_em_3gram, test_cv_3gram, ngram="Tri-gram")

In [None]:
%time

# implement into the whole dataset
train_df_corpus = train_df["lemmatize_text"].tolist()
train_df_em_1gram, vc_1gram = cv(train_df_corpus, 1)
train_df_em_2gram, vc_2gram = cv(train_df_corpus, 2)
train_df_em_3gram, vc_3gram = cv(train_df_corpus, 3)

print(len(train_df_corpus))
print(train_df_em_1gram.shape)
print(train_df_em_2gram.shape)
print(train_df_em_3gram.shape)

In [None]:
# Drop the references to the three dense count matrices
del train_df_em_1gram
del train_df_em_2gram
del train_df_em_3gram

### Term Frequency-Inverse Document Frequency (TF-IDF):


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def TFIDF(data, ngram = 1, MAX_NB_WORDS = 75000):
    """
        Build a TF-IDF matrix for a corpus.

        data: iterable of documents (strings).
        ngram: n-gram size; only n-grams of exactly this length are used.
        MAX_NB_WORDS: upper bound on the vocabulary size (max_features).

        Returns (emb, tfidf_x): the dense TF-IDF matrix and the fitted
        TfidfVectorizer.
    """
    tfidf_x = TfidfVectorizer(ngram_range = (ngram, ngram), max_features = MAX_NB_WORDS)
    emb = tfidf_x.fit_transform(data).toarray()
    # emb is already a dense ndarray; read its shape directly instead of
    # copying the whole matrix through np.array() just to count columns.
    print("tf-idf with", str(emb.shape[1]), "features")
    return emb, tfidf_x

In [None]:
# Small demo corpus: the first five lemmatized tweets
test_corpus = list(train_df["lemmatize_text"].iloc[:5])
print("The test corpus: ", test_corpus, "\n")

# Uni-gram TF-IDF on the demo corpus
test_tfidf_em_1gram, test_tfidf_1gram = TFIDF(test_corpus, ngram=1)
print_out(emb=test_tfidf_em_1gram, feat=test_tfidf_1gram, ngram="Uni-gram")

In [None]:
# Bi-gram TF-IDF on the demo corpus
test_tfidf_em_2gram, test_tfidf_2gram = TFIDF(test_corpus, ngram=2)
print_out(emb=test_tfidf_em_2gram, feat=test_tfidf_2gram, ngram="Bi-gram")

In [None]:
# Tri-gram TF-IDF on the demo corpus
test_tfidf_em_3gram, test_tfidf_3gram = TFIDF(test_corpus, ngram=3)
print_out(emb=test_tfidf_em_3gram, feat=test_tfidf_3gram, ngram="Tri-gram")

In [None]:
%time

# implement into the whole dataset
train_df_corpus = train_df["lemmatize_text"].tolist()
train_df_tfidf_1gram, tfidf_1gram = TFIDF(train_df_corpus, 1)
train_df_tfidf_2gram, tfidf_2gram = TFIDF(train_df_corpus, 2)
train_df_tfidf_3gram, tfidf_3gram = TFIDF(train_df_corpus, 3)

print(len(train_df_corpus))
print(train_df_tfidf_1gram.shape)
print(train_df_tfidf_1gram.shape)
print(train_df_tfidf_1gram.shape)

In [None]:
# Drop the references to the three dense TF-IDF matrices
del train_df_tfidf_1gram
del train_df_tfidf_2gram
del train_df_tfidf_3gram

## Word Embedding:



In [None]:
%time

import gensim
print("gensim version:", gensim.__version__)

word2vec_path = "../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin"

# we only load 200k most common words from Google News corpus
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=200000)

Compare the similarity between "cat" vs. "kitten" and "cat" vs. "cats"

In [None]:
# Similarity of 'cat' to 'kitten' and then to 'cats' under word2vec
for other_word in ('kitten', 'cats'):
    print(word2vec_model.similarity('cat', other_word))

In [None]:
def get_average_vec(tokens_list, vector, generate_missing=False, k=300):
    """
        Calculate average embedding value of sentence from each word vector

        tokens_list: list of word tokens. A plain string is also accepted and
            is split on whitespace first; without that, iterating the string
            would look up single characters instead of words.
        vector: mapping-like word-vector store supporting `word in vector`
            and `vector[word]`.
        generate_missing: when True, words missing from `vector` get a random
            vector (np.random.rand); otherwise they contribute zeros.
        k: dimensionality of the word vectors.

        Returns a length-k array; all zeros when there are no tokens.
    """
    if isinstance(tokens_list, str):
        tokens_list = tokens_list.split()

    if len(tokens_list)<1:
        return np.zeros(k)

    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]

    # Missing words are counted in the denominator as well.
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_embeddings(vectors, text, generate_missing=False, k=300):
    """
        Build one averaged embedding per entry of `text`.

        Each entry is passed to get_average_vec together with `vectors`,
        `generate_missing` and `k`; the results are returned as a list.
    """
    def _embed(entry):
        return get_average_vec(entry, vectors, generate_missing=generate_missing, k=k)

    per_row = text.apply(_embed)
    return [row_vector for row_vector in per_row]

In [None]:
%time

embeddings_word2vec = get_embeddings(word2vec_model, train_df["lemmatize_text"], k=300)

print("Embedding matrix size", len(embeddings_word2vec), len(embeddings_word2vec[0]))
print("The sentence: \"%s\" got embedding values: " % train_df["lemmatize_text"][0])
print(embeddings_word2vec[0])

In [None]:
# Drop the reference to the word2vec sentence embeddings.
del embeddings_word2vec

#### Global Vectors for Word Representation (GloVe):

In [None]:
%time

from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = "../input/glove6b/glove.6B.300d.txt"
word2vec_output_file = "glove.6B.100d.txt.word2vec"
glove2word2vec(glove_input_file, word2vec_output_file)

# we only load 200k most common words from Google New corpus
glove_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False, limit=200000)

In [None]:
# Similarity of 'cat' to 'kitten' and then to 'cats' under GloVe
for other_word in ('kitten', 'cats'):
    print(glove_model.similarity('cat', other_word))

In [None]:
%time

embeddings_glove = get_embeddings(glove_model, train_df["lemmatize_text"], k=300)

print("Embedding matrix size", len(embeddings_glove), len(embeddings_glove[0]))
print("The sentence: \"%s\" got embedding values: " % train_df["lemmatize_text"][0])
print(embeddings_glove[0])