<a href="https://colab.research.google.com/github/kurosakiichig/SW-mid/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!ls -l


total 4
drwxr-xr-x 1 root root 4096 Apr 23 13:39 sample_data


In [8]:
"""
NBA 推文情感分析 & 文本检索
依赖：pandas, numpy, scikit-learn, gensim, joblib, scipy
数据文件：NBADataset.csv（放在与脚本同目录 或 /content）
"""

import re, os
import unicodedata

import joblib, scipy.sparse
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# ====================== 1. Load data & preprocess ======================
# Path of the input CSV; the code below reads its `text` and `polarity` columns.
FILE = 'NBADataset.csv'                      # edit this path to point at your copy
df = pd.read_csv(FILE)

# Minimal English stop-word list. Contractions are stored without their
# apostrophe ("dont", "theyre") because clean() deletes punctuation before it
# compares tokens against this set. As a side effect a few ordinary words
# ("hell", "shell", "well", "ill", "wed") are treated as stop words too.
_STOP = {
    word
    for row in (
        "a about above after again against all am an and any are arent as at be because",
        "been before being below between both but by couldnt did didnt do does doesnt",
        "doing dont down during each few for from further had hadnt has hasnt have",
        "havent having he hed hell hes her here heres hers herself him himself his how",
        "hows i id ill im ive if in into is isnt it its itself lets me more most mustnt",
        "my myself no nor not of off on once only or other ought our ours ourselves out",
        "over own same shant she shed shell shes should shouldnt so some such than that",
        "thats the their theirs them themselves then there theres these they theyd theyll",
        "theyre theyve this those through to too under until up very was wasnt we wed",
        "well were weve were werent what whats when whens where wheres which while who",
        "whos whom why whys with wont would wouldnt you youd youll youre youve your",
        "yours yourself yourselves",
    )
    for word in row.split()
}

def clean(txt: str) -> str:
    """Normalize a raw tweet into a space-joined string of content tokens.

    Steps, in order: lower-case, strip URLs, strip @mentions, fold accented
    letters to their ASCII base letter, delete every character that is not
    a-z or whitespace, then drop stop words.

    Args:
        txt: Raw tweet text.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    txt = txt.lower()
    txt = re.sub(r'https?://\S+', '', txt)      # drop URLs
    txt = re.sub(r'@\w+', '', txt)              # drop @mentions
    # Decompose accented letters into base letter + combining mark so the
    # filter below keeps the base letter ("dončić" -> "doncic") instead of
    # deleting the whole character and mangling the word ("doni").
    # This runs after mention removal so handles are still matched in one piece.
    txt = unicodedata.normalize('NFKD', txt)
    # Non-letters are deleted, not replaced by a space, so "don't" -> "dont",
    # which is the form the stop-word list expects.
    txt = re.sub(r'[^a-z\s]', '', txt)
    return ' '.join(t for t in txt.split() if t not in _STOP)

# Clean every tweet. Missing texts are replaced by '' first: without that,
# astype(str) would turn NaN into the literal word "nan", which would then be
# learned as an ordinary token by the models below.
df['clean_text'] = df['text'].fillna('').astype(str).apply(clean)

# Map the polarity score (VADER, per the original author's note) to a
# three-class label by its sign.
# NOTE(review): a missing polarity fails both comparisons and is therefore
# labelled 'neutral' — confirm that this is intended.
df['sentiment'] = df['polarity'].apply(
    lambda p: 'positive' if p > 0 else 'negative' if p < 0 else 'neutral'
)

# ====================== 2. Sentiment analysis: TF-IDF + NB ======================
# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_tr, X_te, y_tr, y_te = train_test_split(
    df['clean_text'], df['sentiment'],
    test_size=0.2, random_state=42, stratify=df['sentiment']
)

clf_nb = Pipeline([
    # Unigrams + bigrams; vocabulary capped at 10,000 terms.
    ('tfidf', TfidfVectorizer(max_features=10_000, ngram_range=(1, 2))),
    ('nb', MultinomialNB())
])
clf_nb.fit(X_tr, y_tr)

# Predict once and reuse the result for both reports.
y_pred = clf_nb.predict(X_te)

print("\n=== TF-IDF + NB  情感分类结果 ===")
print(classification_report(y_te, y_pred))
print(confusion_matrix(y_te, y_pred))

# Persist the fitted pipeline (vectorizer + classifier) for later reuse.
joblib.dump(clf_nb, 'tfidf_nb_sentiment.pkl')

# ====================== 3. Train Skip-gram embeddings on the corpus ======================
# Word2Vec takes each document as a list of tokens.
sentences = list(map(str.split, df['clean_text']))
sg_model = Word2Vec(
    sentences=sentences,
    sg=1,              # 1 selects the Skip-gram architecture
    vector_size=100,
    window=5,
    min_count=5,
    epochs=10,
    workers=4,
)
sg_model.save('skipgram.model')

def sent_vec(tokens):
    """Embed a token sequence as the mean of its known word vectors.

    Tokens missing from the trained Skip-gram vocabulary are skipped. If no
    token is known, an all-zero vector of length ``sg_model.vector_size`` is
    returned instead.
    """
    word_vectors = sg_model.wv
    known = [word_vectors[tok] for tok in tokens if tok in word_vectors]
    if not known:
        return np.zeros(sg_model.vector_size)
    return np.mean(known, axis=0)

# Build one embedding row per tweet, in DataFrame row order.
doc_vectors = np.vstack(
    [sent_vec(text.split()) for text in df['clean_text']]
)

# Optional on-disk cache of the document matrix.
np.save('doc_vectors.npy', doc_vectors)

# ====================== 4. Retrieval: Skip-gram cosine similarity ======================
def retrieve(query: str, top_k=5):
    """Return the tweets most similar to *query* in embedding space.

    The query is cleaned and embedded the same way as the corpus
    (``clean`` + ``sent_vec``) and compared with every row of ``doc_vectors``
    by cosine similarity.

    Args:
        query: Free-text query.
        top_k: Maximum number of hits to return; values <= 0 yield no hits.

    Returns:
        A list of ``(original tweet text, similarity)`` tuples sorted by
        descending similarity. The list is empty when none of the query's
        tokens is in the embedding vocabulary.
    """
    q_vec = sent_vec(clean(query).split()).reshape(1, -1)
    # sent_vec falls back to an all-zero vector when no query token is known.
    # Such a vector scores 0 against every tweet, so ranking would only hand
    # back arbitrary tweets; report "no hits" instead.
    if not q_vec.any():
        return []
    sims = cosine_similarity(q_vec, doc_vectors).ravel()
    # Clamp so a negative top_k cannot turn into an "all but the last n" slice.
    idx = sims.argsort()[::-1][:max(top_k, 0)]
    texts = df['text']                       # avoid building a full row per hit
    return [(texts.iloc[i], float(sims[i])) for i in idx]

print("\n=== Skip-gram 检索示例：\"lakers vs warriors\" ===")
for t, sc in retrieve("lakers vs warriors"):
    print(f"{sc:.3f} | {t[:100]}...")

# ====================== 5. Combined example ======================
def ensemble_sentiment(text):
    """Combine the NB prediction with the majority label of similar tweets.

    The text is classified by the TF-IDF + NB pipeline, and separately the
    10 tweets closest to it in Skip-gram space are looked up and their most
    frequent label is taken (``Series.mode`` returns ties in sorted order,
    so the first one is used). If both labels agree that label is returned;
    otherwise the one ranking higher in positive > negative > neutral wins
    (demo rule).

    When none of the text's tokens is in the embedding vocabulary the
    neighbour lookup carries no information, so the NB prediction is
    returned as is.

    Args:
        text: Raw text to classify.

    Returns:
        One of 'positive', 'negative' or 'neutral'.
    """
    cleaned = clean(text)                    # clean once, reuse for both models
    nb_pred = clf_nb.predict([cleaned])[0]

    q_vec = sent_vec(cleaned.split()).reshape(1, -1)
    # An all-zero vector (no known token) scores 0 against every tweet, so
    # the "nearest" neighbours would be arbitrary; do not let them override NB.
    if not q_vec.any():
        return nb_pred

    # Majority label among the 10 most similar tweets.
    sims_idx = cosine_similarity(q_vec, doc_vectors).ravel().argsort()[::-1][:10]
    neighbor_major = df.iloc[sims_idx]['sentiment'].mode()[0]

    if nb_pred == neighbor_major:
        return nb_pred
    # Disagreement: pick whichever label comes first in the priority order.
    priority = ['positive', 'negative', 'neutral']
    return min([nb_pred, neighbor_major], key=priority.index)

print("\nEnsemble demo →", ensemble_sentiment("That game was unbelievable!"))


ModuleNotFoundError: No module named 'gensim'