# Reducing Sparse Cash Flow Categories
What is the motivation?  
To effectively model or classify user behavior using cash flow categories, it’s important to tackle the issue of sparsely populated or rarely used categories in the dataset. These categories can introduce noise, complicate behavior classification, and diminish the model’s clarity and interpretability. To address this challenge, grouping them into higher-order, more logically clustered categories can improve the analysis and interpretation of user financial behavior.




In [None]:
import pandas as pd

# Load the per-category statistics table from its Feather file.
category = pd.read_feather(
    '../data/category_stat.feather',
)
# Number of categories on each side of the `is_expense` flag.
category.groupby(by='is_expense').size()

In [None]:
# Last 20 expense categories after an ascending sort by `n_user`, then `count`
# (i.e. the rows with the largest values of those columns).
(
    category.loc[category["is_expense"]]
    .sort_values(by=["n_user", "count"])
    .tail(20)
)

In [None]:
# Scatter plot of `n_user` against `count` for expense categories.
# The result is bound to `_` so the notebook does not echo the Axes repr.
_ = category.loc[category["is_expense"]].plot.scatter(
    x="count",
    y="n_user",
    alpha=0.5,
)

In [None]:
# Same scatter plot, restricted to expense categories with `n_user` below 1000
# so the low end of the distribution is readable.
_ = category.loc[
    category["is_expense"] & category["n_user"].lt(1000)
].plot.scatter(
    x="count",
    y="n_user",
    alpha=0.5,
)

In [None]:
# Last 40 rows, after an ascending sort by `n_user` then `count`, among
# expense categories whose `n_user` is below 1000.
(
    category
    .query("is_expense == True and n_user < 1000")
    .sort_values(by=["n_user", "count"])
    .tail(40)
)

In [None]:
# First 20 non-expense categories after an ascending sort by `count`
# (i.e. the rows with the smallest `count`).
(
    category.loc[~category["is_expense"]]
    .sort_values(by="count")
    .head(20)
)

In [None]:
# Scatter plot of `n_user` against `count` for non-expense categories.
_ = category.loc[~category["is_expense"]].plot.scatter(
    x="count",
    y="n_user",
    alpha=0.5,
)

In [None]:
import jieba
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np

# Toy experiment: learn word embeddings from a handful of Traditional Chinese
# sentences and group the resulting vocabulary with K-Means.
text = ["今天天氣很好", "外面下雨了", "我喜歡吃蘋果", "他討厭香蕉", "天氣冷了，要穿厚一點"]

# Segment every sentence into a list of tokens with Jieba.
tokenized_text = [jieba.lcut(sentence) for sentence in text]

# Train a skip-gram (sg=1) Word2Vec model; min_count=1 keeps every token,
# which is needed because the corpus is tiny.
# workers=1 plus an explicit seed makes training repeatable: gensim's default
# multi-threaded training is not deterministic, which would make the fixed
# KMeans random_state below meaningless.
# NOTE: gensim's documentation additionally asks for a fixed PYTHONHASHSEED
# for fully reproducible runs.
model = Word2Vec(
    sentences=tokenized_text,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Vocabulary and the matching embedding matrix (one row per word).
words = list(model.wv.key_to_index)
word_vectors = np.array([model.wv[word] for word in words])

# Cluster the word vectors. n_init is given explicitly because the
# scikit-learn default changed between releases (10 -> "auto"); pinning it
# keeps results stable across versions and silences the FutureWarning.
num_clusters = 3  # Adjust based on your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(word_vectors)

# Collect the words belonging to each cluster id.
clusters = {i: [] for i in range(num_clusters)}
for word, label in zip(words, kmeans.labels_):
    # labels_ holds NumPy integers; cast so keys are always plain ints.
    clusters[int(label)].append(word)

print(clusters)


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Pretrained Chinese sentence-embedding model.
# An alternative checkpoint tried here was "uer/sbert-base-chinese-nli".
model = SentenceTransformer("shibing624/text2vec-base-chinese")

# Example category names (Traditional Chinese) to be grouped.
phrases = [
    "飲食", "三餐", "早餐", "晚餐", "宵夜", "咖啡",
    "食材", "加油", "生活用品", "飲料", "日用品", "日常用品",
]

# One embedding vector per phrase.
embeddings = model.encode(phrases)

# Pairwise cosine similarity: entry (i, j) compares phrase i with phrase j.
similarity_matrix = cosine_similarity(embeddings)

# The clustering input is the similarity matrix itself, so each phrase is
# represented by its row of similarities to every phrase (a copy is taken;
# no dimensionality reduction happens here).
# NOTE(review): Ward linkage then measures Euclidean distance between these
# similarity rows, not cosine distance between embeddings -- confirm this is
# the intended notion of distance for the 0.5 threshold.
X = np.array(similarity_matrix)

# Hierarchical clustering; with n_clusters=None the number of clusters is
# determined by distance_threshold.
cluster_model = AgglomerativeClustering(
    linkage="ward",
    distance_threshold=0.5,
    n_clusters=None,
)
labels = cluster_model.fit(X).labels_

# Show each phrase next to its cluster id, grouped by cluster.
df = pd.DataFrame({"Phrase": phrases, "Cluster": labels})
print(df.sort_values(by="Cluster"))
