Useful links
- Unsupervised Text Classification with Topic Models and Good Old Human Reasoning https://medium.com/@power.up1163/unsupervised-text-classification-with-topic-models-and-good-old-human-reasoning-da297bed7362
- Understanding Text Classification in NLP with Movie Review Example https://www.analyticsvidhya.com/blog/2020/12/understanding-text-classification-in-nlp-with-movie-review-example-example/
- Text Clustering with Unsupervised Learning https://www.kaggle.com/code/carlosaguayo/text-clustering-with-unsupervised-learning

In [None]:
import configparser
import pandas as pd
import numpy as np
from numpy import array

import vertica_python
from vertica_python.errors import MissingRelation
import verticapy as vp
from verticapy import pandas_to_vertica, insert_into, drop

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import wordnet

import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
config_path = 'config.ini'


def get_VRT_cursor(path=config_path):
    """Open a Vertica connection described by the ``[VERTICA]`` section of an INI file.

    Parameters
    ----------
    path : str
        Location of the INI file; defaults to the module-level ``config_path``.

    Returns
    -------
    tuple
        ``(cursor, connection)``; the cursor is created from the returned connection.

    Raises
    ------
    KeyError
        If the ``VERTICA`` section or one of the five expected options is absent.
    """
    parser = configparser.ConfigParser()
    parser.read(path)

    vertica_section = parser['VERTICA']
    option_names = ('database', 'user', 'password', 'host', 'port')
    # Options are forwarded exactly as read from the file (i.e. as strings).
    connection_kwargs = {name: vertica_section[name] for name in option_names}

    connection = vertica_python.connect(**connection_kwargs)
    return connection.cursor(), connection

def get_df_from_sql(cursor, sql='SELECT now(), version()'):
    """Execute ``sql`` on ``cursor`` and return the whole result set as a DataFrame.

    Parameters
    ----------
    cursor
        Object exposing ``execute``, ``description`` and ``fetchall``.
    sql : str
        Statement to run.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record; column labels are the first element of
        each entry in ``cursor.description``.
    """
    cursor.execute(sql)
    column_names = [column_meta[0] for column_meta in cursor.description]
    rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=column_names)

# Open the Vertica connection (settings come from config.ini by default) and
# pass the same connection object to VerticaPy via set_connection.
VRT_cursor, conn = get_VRT_cursor()
vp.set_connection(conn)

In [15]:
# Raw frames keyed by period index (0..5).
df = {}

for period in range(6):
    print("Processing period ", period)
    try:
        # NOTE(review): the query text does not reference the period index, so
        # every period receives the same result set — confirm this is intended.
        period_frame = get_df_from_sql(cursor=VRT_cursor,
                                       sql=f"""select * from tbl""")
    except Exception as e:
        # Best-effort load: report the failure and continue with the next period.
        print("Unable to process because of exception", e)
    else:
        df[period] = period_frame

Processing period  0
Processing period  1
Processing period  2
Processing period  3
Processing period  4
Processing period  5


### Data preparation

In [16]:
for period in range(6):
    print("Processing period", period)
    messages = df[period]
    # Every row of a user receives that user's messages joined with single spaces ...
    dialog_per_row = messages.groupby(['user_id'])['text'].transform(' '.join)
    # ... so dropping duplicates leaves one (user_id, dialog) row per user.
    df[period] = (
        messages
        .assign(dialog=dialog_per_row)[['user_id', 'dialog']]
        .drop_duplicates()
    )

In [17]:
def data_preparation(df):
    """Tokenize and clean the ``dialog`` column of a frame.

    Each dialog is tokenized with ``nltk.word_tokenize`` and lower-cased, then
    tokens are dropped when they are

    * an English stopword or one of the extra filler words listed below,
    * made up only of punctuation characters,
    * purely numeric,
    * two characters long or shorter,
    * prefixed with one of the smiley markers (``cat_``, ``smile_``, ...).

    Rows whose token list ends up empty are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``dialog`` columns. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Independent copy with columns ``user_id``, ``dialog`` and
        ``dialog_processed`` (list of kept tokens).
    """
    # Filler words added on top of NLTK's English stopword list.
    extra_stopwords = ['hi', 'hello', 'hey', 'hii', 'sym',
                       'f', 'ok', 'yes', 'ye', 'yeah',
                       'f:1', 'f:2', 'f:3', 'f:4',
                       'want', 'wanna', 'dont', 'u', 'baby', 'babe',
                       'love']
    # Set membership is O(1); a list would be scanned once per token.
    stopwords = set(nltk.corpus.stopwords.words("english")).union(extra_stopwords)
    punctuation_chars = set(string.punctuation)
    smiley_prefixes = ('cat_', 'den_', 'standard_', 'htf_',
                       'jerboa_', 'smile_', 'owl_', 'ginger_')

    def _keep(token):
        """Return True when an already lower-cased token is informative."""
        if token in stopwords:
            return False
        # Character-wise check so multi-character tokens such as "..." are
        # removed too (a substring test against string.punctuation misses them).
        if all(char in punctuation_chars for char in token):
            return False
        # isdigit() covers every all-digit token, e.g. "2020"; a substring test
        # against string.digits only catches runs like "12" or "345".
        if token.isdigit():
            return False
        if len(token) <= 2:
            return False
        # str.startswith accepts a tuple, replacing eight separate passes.
        return not token.startswith(smiley_prefixes)

    def _clean(text):
        """Tokenize one dialog and keep only its informative tokens."""
        lowered = (raw_token.lower() for raw_token in nltk.word_tokenize(text))
        return [token for token in lowered if _keep(token)]

    # Work on a copy so the caller's frame is left untouched and later column
    # assignments do not raise SettingWithCopyWarning.
    prepared = df[["user_id", "dialog"]].copy()
    prepared["dialog_processed"] = prepared["dialog"].map(_clean)

    # Remove non-informative records (nothing left after cleaning).
    has_tokens = prepared["dialog_processed"].map(len) > 0
    return prepared[has_tokens].copy()

In [18]:
# Replace each period's frame with its cleaned version
# (columns user_id, dialog, dialog_processed).
for i in range(6):
    df[i]  = data_preparation(df[i])

### Exploratory data analysis

Word cloud of the cleaned dialog tokens (period 5)

In [None]:
# Join each user's cleaned tokens back into one string. assign() returns a new
# frame, so no column is written onto a filtered slice.
df[5] = df[5].assign(dialog_processed_string=df[5]['dialog_processed'].map(' '.join))
# `df` is a dict of frames keyed by period, so the column has to be read from
# df[5]; indexing the dict itself with a column name raises KeyError.
long_string = ' '.join(df[5]['dialog_processed_string'])

wordcloud = WordCloud(background_color="white", max_words=10000, contour_width=3, contour_color='steelblue')
wordcloud.generate(long_string)
plt.imshow(wordcloud)
plt.axis("off")  # pixel ticks carry no meaning for a word cloud
plt.show()

In [10]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted words.

    Parameters
    ----------
    model
        Fitted estimator exposing ``components_`` with one row of weights per topic.
    feature_names : sequence of str
        Vocabulary; position ``j`` names column ``j`` of ``components_``.
    n_top_words : int
        Number of words shown per topic.
    title : str
        Figure-level title.

    The grid is five panels wide and gets as many rows as the number of topics
    requires (10 topics give the 2x5 layout); panels without a topic are hidden.
    """
    # Fancy indexing below needs an array; plain lists are accepted this way too.
    feature_names = np.asarray(feature_names)

    n_topics = len(model.components_)
    n_cols = 5
    n_rows = max(1, (n_topics + n_cols - 1) // n_cols)  # ceiling division

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    fig.suptitle(title, fontsize=40)

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the last n_top_words indices are the heaviest.
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in "top right left".split():
            ax.spines[side].set_visible(False)

    # Hide panels that received no topic.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [16]:
# Settings for the topic-modelling cells.
n_samples = 2000  # NOTE(review): not referenced by any visible cell — confirm it is still needed
n_features = 1000  # passed as max_features to the vectorizers
n_components = 10  # number of topics for NMF and LDA
n_top_words = 10  # words per topic in the NMF plot

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from topicwizard.pipeline import make_topic_pipeline

# One document per user (period 5). Topic models learn from how terms are
# distributed across documents, so a corpus holding one concatenated string
# leaves NMF with a single-row matrix to factorize.
documents = df[5]['dialog_processed'].map(' '.join).tolist()
# Single-document corpus kept under its existing name for any cell that still reads it.
long_string_ = [' '.join(documents)]

# max_df as a float is a proportion (ignore terms found in more than 95% of the
# documents); min_df as an int is a count (ignore terms found in fewer than 2).
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
tfidf = tfidf_vectorizer.fit_transform(documents)
# Fixed seed so the extracted topics are reproducible between runs.
nmf = NMF(n_components=n_components, random_state=0).fit(tfidf)

In [None]:
# Feature names from the fitted TF-IDF vectorizer, used as bar labels in the topic plot.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words, "Topics in NMF model"
)

In [None]:
# One document per user (period 5); LDA needs many documents to estimate topics.
documents = df[5]['dialog_processed'].map(' '.join).tolist()

# max_df as a float is a proportion (ignore terms found in more than 95% of the
# documents); min_df as an int is a count (ignore terms found in fewer than 2).
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
)
tf = tf_vectorizer.fit_transform(documents)

lda = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=10,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)
lda.fit(tf)

In [None]:
# Feature names from the fitted count vectorizer, used as bar labels in the topic plot.
tf_feature_names = tf_vectorizer.get_feature_names_out()
# NOTE(review): shows 5 words per topic, whereas the NMF plot uses n_top_words — confirm this is deliberate.
plot_top_words(lda, tf_feature_names, 5, "Topics in LDA model")

### Get features from text

In [19]:
# Seed words; their WordNet synonyms are searched for in the processed dialogs.
my_features = ['dog', 'cat', 'bird', 'fish']
# Filled later: feature -> frame of users whose dialog mentions it.
users_with_features = {}

In [None]:
# feature -> set of lower-cased names: every lemma of every WordNet synset of
# the feature (underscores turned into spaces), plus the feature word itself.
synonyms_features = {}

for feature in my_features:
    lemma_names = {
        lemma.name().lower().replace('_', ' ')
        for synset in wordnet.synsets(feature)
        for lemma in synset.lemmas()
    }
    lemma_names.add(feature.lower().replace('_', ' '))
    synonyms_features[feature] = lemma_names

# Manual addition on top of the WordNet lemmas.
synonyms_features['dog'].add('dachshund')
print(synonyms_features)

Extract users for each feature

In [21]:
feature_columns = ['user_id', 'dialog', 'dialog_processed']

for feature_name in synonyms_features.keys():
    feature_synonyms = synonyms_features[feature_name]
    matching_rows = []
    for period in range(6):
        # A dialog matches when at least one of its tokens equals a synonym.
        # Synonyms containing a space can never equal a single token, so
        # multi-word lemmas do not match here.
        has_synonym = (
            df[period]['dialog_processed']
            .map(lambda tokens: not feature_synonyms.isdisjoint(tokens))
            .astype(bool)
        )
        matching_rows.append(df[period].loc[has_synonym, feature_columns])

    # The matches belong in users_with_features; synonyms_features must keep
    # holding the synonym sets. One concat per feature replaces growing the
    # frame inside the inner loops.
    users_with_features[feature_name] = (
        pd.concat(matching_rows, ignore_index=True)
        .drop_duplicates(["user_id"])
    )

In [None]:
# Number of distinct users per feature, built in one step so users_count is an
# integer column (row-by-row .loc inserts into an empty frame give object dtype).
results = pd.DataFrame(
    {
        'feature': list(my_features),
        'users_count': [len(users_with_features[a]) for a in my_features],
    }
)

# Displayed sorted by popularity; `results` itself keeps the my_features order.
results.sort_values(by=['users_count'], ascending=False, ignore_index=True)