In [1]:
## text classification with NO model training: 
# https://towardsdatascience.com/text-classification-with-no-model-training-935fe0e42180

## steps:     
1 Setup: import packages, read data.
2 Preprocessing: clean text data.
3 Create Target Clusters: use Word2Vec with gensim to build the target variable.
4 Feature Engineering: Word Embedding with transformers and BERT.
5 Model Design & Testing: assign observations to clusters by Cosine Similarity and evaluate the performance.
6 Explainability: understand how the model produces results.

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece



In [28]:
## for data
import json
import pandas as pd
import numpy as np
from sklearn import metrics, manifold## for processing
import re
import nltk## for plotting
import matplotlib.pyplot as plt
import seaborn as sns## for w2v
import gensim
import gensim.downloader as gensim_api## for bert
import transformers

In [29]:
import os

In [30]:
os.getcwd()

'/Users/jing/Documents/MyDocuments/CurrentProjects/WolfProject/DataOrganizationAnalyses/ChildObservation/TextMining/TextClassification_BERT'

In [31]:
os.getcwd()

'/Users/jing/Documents/MyDocuments/CurrentProjects/WolfProject/DataOrganizationAnalyses/ChildObservation/TextMining/TextClassification_BERT'

In [34]:
df = pd. read_excel("ChildObservationCleanedPreprocessed.xlsx")

In [35]:
df_clean = df["Lemmatize"]

In [36]:
df_clean.head()

0    child play cart belong p grandfather father br...
1    store open yet wait buy something suggest go s...
2    three row rice grain dry sun rid tricycle arou...
3    turtle string try hit snake see irrigation dit...
4    scold say break flower dont want bride give ba...
Name: Lemmatize, dtype: object

# Create Target Clusters

In [37]:
nlp = gensim_api.load("glove-wiki-gigaword-300")

In [38]:
nlp.most_similar(["obama"], topn=3)

[('barack', 0.9254721403121948),
 ('mccain', 0.7590768337249756),
 ('bush', 0.7570987939834595)]

In [43]:
## Function to apply
def get_similar_words(lst_words, top, nlp):
    lst_out = lst_words
    for tupla in nlp.most_similar(lst_words, topn=top):
        lst_out.append(tupla[0])
    return list(set(lst_out))

## Create Dictionary {category:[keywords]}
dic_clusters = {}

dic_clusters["FAMILY"] = get_similar_words(['mother','father','sister','brother','grandmother','grandfather','baby','adopted'], 
                  top=30, nlp=nlp)

dic_clusters["GAME"] = get_similar_words(['hopscotch','maze','finger','house','hide','game','team', 'group', 'chase'],
                   top=30, nlp=nlp)

dic_clusters["PLAY"] = get_similar_words(['mud','stick','toy','trike','leaf','paper'],
                   top=30, nlp=nlp)

dic_clusters["AGGRESSION"] = get_similar_words(['copulate','hit','argue','fight','shove',
                   'scold', 'spank'], top=30, nlp=nlp)

dic_clusters["SCHOOL"] = get_similar_words(['classroom','homework','teacher','pen','pencil','playground'],
                  top=30, nlp=nlp)

dic_clusters["STORE"] = get_similar_words(['store','candy','snack','money'],
                  top=30, nlp=nlp)

## print some
for k,v in dic_clusters.items():
    print(k, ": ", v[0:14], "...", len(v))

FAMILY :  ['stepfather', 'grandmother', 'brother', 'father', 'eldest', 'sons', 'family', 'sister', 'mother', 'granddaughter', 'uncle', 'niece', 'parents', 'wife'] ... 38
GAME :  ['one', 'home', 'playing', 'maze', 'away', 'get', "n't", 'hopscotch', 'it', 'go', 'play', 'hide', 'team', 'going'] ... 39
PLAY :  ['sticks', 'piece', 'stick', 'sheet', 'cardboard', 'dirt', 'mud', 'miniature', 'bag', 'plastic', 'glue', 'resemble', 'hand', 'trike'] ... 36
AGGRESSION :  ['taunt', 'get', "n't", 'letting', 'shove', 'fight', 'strangle', 'somebody', 'tempted', 'yell', 'push', 'ought', 'scold', 'throw'] ... 37
SCHOOL :  ['student', 'teachers', 'teaching', 'teach', 'homework', 'curriculum', 'students', 'school', 'teacher', 'taught', 'pencils', 'classroom', 'kids', 'preschool'] ... 36
STORE :  ['foods', 'chocolate', 'buy', 'shoppers', 'snack', 'store', 'liquor', 'food', 'stores', 'sells', 'bought', 'shop', 'supermarket', 'grocery'] ... 34


In [66]:
## word embedding
tot_words = [word for v in dic_clusters.values() for word in v]
X = nlp[tot_words]

## pca
pca = manifold.TSNE(perplexity=40, n_components=2, init='pca')
X = pca.fit_transform(X)

## create dtf
dtf = pd.DataFrame()
for k,v in dic_clusters.items():
    size = len(dtf) + len(v)
    dtf_group = pd.DataFrame(X[len(dtf):size], columns=["x","y"], 
                             index=v)
    dtf_group["cluster"] = k
    dtf = dtf.append(dtf_group)
    
## plot
fig, ax = plt.subplots()
sns.scatterplot(data=dtf, x="x", y="y", hue="cluster", ax=ax)

ax.legend(bbox_to_anchor=(0, 1), loc='upper right', ncol=1).texts[0].set_text(None)
ax.set(xlabel=None, ylabel=None, xticks=[], xticklabels=[], 
       yticks=[], yticklabels=[])

for i in range(len(dtf)):
    ax.annotate(dtf.index[i], 
               xy=(dtf["x"].iloc[i],dtf["y"].iloc[i]), 
               xytext=(0.5, 0.2), textcoords='offset points', 
               ha='right', va='bottom')
fig.savefig('clusters.png')

TypeError: 'TFBertModel' object is not subscriptable

In [60]:
## Feature Engineering
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
nlp = transformers.TFBertModel.from_pretrained('bert-base-uncased')

In [65]:
txt = "finger game"
## tokenize
idx = tokenizer.encode(txt)
print("tokens:", tokenizer.convert_ids_to_tokens(idx))
print("ids   :", tokenizer.encode(txt))

## word embedding
idx = np.array(idx)[None,:]
embedding = nlp(idx)
print("shape:", embedding[0][0].shape)

## function to apply
def utils_bert_embedding(txt, tokenizer, nlp):
    idx = tokenizer.encode(txt)
    idx = np.array(idx)[None,:]  
    embedding = nlp(idx)
    X = np.array(embedding[0][0][1:-1])
    return X## create list of news vector
lst_mean_vecs = [utils_bert_embedding(txt, tokenizer, nlp).mean(0) 
                 for txt in df_clean]

## create the feature matrix (n news x 768)
X = np.array(lst_mean_vecs)

tokens: ['finger', 'game']
ids   : [4344, 2208]
shape: (2, 768)


Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors


InvalidArgumentError: indices[0,512] = 512 is not in [0, 512) [Op:ResourceGather]