In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import jieba
import re
import joblib
import json


# Load the dataset; the CSV is expected to provide 'Coin Name' and 'Symbol' columns.
df = pd.read_csv('Meme_Coins_Solana_3000.csv')

# Overview: total row count, then the first five rows of the two text columns.
print("数据概览:", f"总币种数量: {len(df)}", sep="\n")
preview = df[['Coin Name', 'Symbol']].head()
print("\n前5行数据:")
print(preview)



数据概览:
总币种数量: 3000

前5行数据:
  Coin Name    Symbol
0  LOLToken     $PEPE
1  LOLToken   $GIGGLE
2  LOLToken  $MOONWAG
3   DogeSun     $MEOW
4  ChadCoin     $PEPE


In [4]:

# Custom JSON encoder that handles NumPy data types
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python values."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy arrays become (nested) lists, NumPy booleans become ``bool``,
        NumPy integers become ``int`` and NumPy floats become ``float``.
        Anything else is delegated to ``json.JSONEncoder.default``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        # Every sized NumPy integer type (int8 ... uint64, intc, intp) derives from np.integer.
        if isinstance(obj, np.integer):
            return int(obj)
        # Likewise float16 / float32 / float64 all derive from np.floating.
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

# Text preprocessing helper
def preprocess_text(text):
    """Normalize a raw value into lower-cased text (handles Chinese and English).

    Missing values (as judged by ``pd.isna``) yield an empty string. Any other
    value is converted to ``str``, lower-cased and stripped of surrounding
    whitespace; then every character that is neither a word character nor in
    the CJK range U+4E00-U+9FFF is replaced by a single space.
    """
    if not pd.isna(text):
        normalized = str(text).lower().strip()
        return re.sub(r'[^\w\u4e00-\u9fff]', ' ', normalized)
    return ""

def tokenize_chinese(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(text)
    return ' '.join(tokens)

def preprocess_combined(row):
    """Build the text used for vectorization from a row's 'Coin Name' and 'Symbol'.

    Both fields are normalized with ``preprocess_text``. If either normalized
    field contains at least one character in the CJK range U+4E00-U+9FFF, both
    fields are additionally segmented with ``tokenize_chinese``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by
            'Coin Name' and 'Symbol'.

    Returns:
        str: The processed name and symbol joined by a single space.
    """
    name = preprocess_text(row['Coin Name'])
    symbol = preprocess_text(row['Symbol'])

    # Use a regex character class to detect Chinese characters. A plain
    # substring test against the range text only looks for that literal
    # three-character sequence and never matches real Chinese input, which
    # left the segmentation branch unreachable.
    if any(re.search(r'[\u4e00-\u9fff]', text) for text in (name, symbol)):
        name = tokenize_chinese(name)
        symbol = tokenize_chinese(symbol)

    return f"{name} {symbol}"

# Build the combined text column (processed name + symbol) for every row
print("\n预处理文本数据...")
df['combined_text'] = df.apply(preprocess_combined, axis=1)

# Turn the combined text into TF-IDF features
print("进行文本向量化...")
# NOTE(review): the vectorizer's default tokenizer is left in place; it may drop
# single-character tokens (including one-character Chinese words) - confirm
# against the scikit-learn documentation if Chinese input matters.
tfidf_options = {
    'max_features': 100,
    'min_df': 2,
    'max_df': 0.8,
    'stop_words': 'english',
    'ngram_range': (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_options)

X = vectorizer.fit_transform(df['combined_text'])
print(f"文本向量维度: {X.shape}")

# Cluster the TF-IDF vectors; the fixed random_state keeps runs repeatable
print("进行聚类分析...")
kmeans = KMeans(
    n_clusters=4,
    random_state=42,
    n_init=10,
)
cluster_labels = kmeans.fit_predict(X)
df['cluster'] = cluster_labels

# Summarize each cluster: size, frequent words, detected themes and sample coins
from collections import Counter  # imported once here instead of on every loop iteration

print("分析各聚类特点...")
cluster_descriptions = {}

# Theme name -> keywords that signal it. The mapping does not depend on the
# cluster, so it is built once instead of being re-created per iteration.
theme_keywords = {
    '动物系': ['dog', 'cat', 'shib', 'doge', 'kitty', 'pig', 'bull', 'bear', '动物', '狗', '猫'],
    '人物系': ['king', 'queen', 'elon', 'people', 'man', 'woman', '国王', '女王', '人物'],
    '自然系': ['moon', 'sun', 'earth', 'star', 'planet', '宇宙', '星星', '月亮', '太阳'],
    '财富系': ['money', 'cash', 'rich', 'gold', 'diamond', '财富', '金钱', '黄金'],
    '科技系': ['ai', 'tech', 'robot', 'cyber', 'quantum', '人工智能', '科技', '机器人']
}

# Take the cluster count from the fitted model rather than repeating a literal,
# so this loop stays in sync if the KMeans configuration changes. range() yields
# plain Python ints, which keeps the dictionary keys JSON-serializable.
for cluster_id in range(kmeans.n_clusters):
    cluster_data = df[df['cluster'] == cluster_id]
    cluster_texts = ' '.join(cluster_data['combined_text'])

    # Count whitespace-separated tokens across all texts in this cluster.
    words = cluster_texts.split()
    word_counts = Counter(words)

    # Keep the most frequent tokens, skipping one-character and purely numeric ones.
    common_words = [(word, count) for word, count in word_counts.most_common(20)
                   if len(word) > 1 and not word.isdigit()]

    # A theme is attached when at least two of the ten most common words are
    # among its keywords.
    detected_themes = []
    for theme, keywords in theme_keywords.items():
        theme_count = sum(1 for word, _ in common_words[:10] if word in keywords)
        if theme_count >= 2:
            detected_themes.append(theme)

    # Store only native Python types so the summary can be serialized directly.
    cluster_descriptions[cluster_id] = {
        'size': int(len(cluster_data)),
        'common_words': [(str(word), int(count)) for word, count in common_words[:8]],
        'themes': [str(theme) for theme in detected_themes],
        'example_coins': [
            {
                'name': str(coin['Coin Name']),
                'symbol': str(coin['Symbol'])
            }
            for _, coin in cluster_data[['Coin Name', 'Symbol']].head(3).iterrows()
        ]
    }

# Persist the full model bundle: metadata, fitted vectorizer, fitted KMeans and cluster summaries
print("\n保存完整模型文件...")
model_package = dict(
    metadata=dict(
        model_type='Text_Only_MemeCoin_Classifier',
        features=['Coin Name', 'Symbol'],
        num_clusters=int(kmeans.n_clusters),  # stored as a native Python int
        language='multilingual',
    ),
    vectorizer=vectorizer,
    kmeans=kmeans,
    cluster_descriptions=cluster_descriptions,
)

joblib.dump(model_package, 'text_only_meme_classifier.pkl')

# Persist a lightweight JSON version of the model (plain Python types only)
print("保存轻量级 JSON 模型...")

# Vocabulary with string keys and native int column indices.
vocabulary = {}
for term, index in vectorizer.vocabulary_.items():
    vocabulary[str(term)] = int(index)

lightweight_model = {
    'vocabulary': vocabulary,
    'idf': list(map(float, vectorizer.idf_)),  # list of native floats
    'cluster_centers': [list(map(float, center)) for center in kmeans.cluster_centers_],  # nested lists of native floats
    'feature_names': list(map(str, vectorizer.get_feature_names_out())),
    'cluster_descriptions': cluster_descriptions,
    'model_params': {
        'num_clusters': int(kmeans.n_clusters),
        'max_features': int(vectorizer.max_features)
    }
}

# Write the JSON with the custom encoder as a safety net for any remaining NumPy values
with open('lightweight_model.json', 'w', encoding='utf-8') as out_file:
    json.dump(lightweight_model, out_file, ensure_ascii=False, indent=2, cls=NumpyEncoder)

print("✅ 模型文件保存完成!")


预处理文本数据...
进行文本向量化...
文本向量维度: (3000, 100)
进行聚类分析...
分析各聚类特点...

保存完整模型文件...
保存轻量级 JSON 模型...
✅ 模型文件保存完成!
