<a href="https://colab.research.google.com/github/kurosakiichig/SW-mid/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!ls -la /content


total 16
drwxr-xr-x 1 root root 4096 Apr 11 13:37 .
drwxr-xr-x 1 root root 4096 Apr 17 03:10 ..
drwxr-xr-x 4 root root 4096 Apr 11 13:37 .config
drwxr-xr-x 1 root root 4096 Apr 11 13:37 sample_data


In [None]:
# Assumes the uploaded file is named "NBADataset - 12-07-2020 till 19-09-2020.csv" and already sits in /content
!mv "/content/NBADataset - 12-07-2020 till 19-09-2020.csv" /content/NBADataset.csv


mv: cannot stat '/content/NBADataset - 12-07-2020 till 19-09-2020.csv': No such file or directory


In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import scipy.sparse
import os

# 1. Load the data.
# The CSV has been seen under more than one name/location in this notebook:
# the root-level path this cell originally read, and the source and target of
# the rename cell above. Try them in that order (original path first, so
# behaviour is unchanged whenever that file exists) and fail with a clear
# message instead of a bare pandas error when none is present.
_DATA_CANDIDATES = (
    '/NBADataset - 12-07-2020 till 19-09-2020.csv',
    '/content/NBADataset.csv',
    '/content/NBADataset - 12-07-2020 till 19-09-2020.csv',
)
_data_path = next((p for p in _DATA_CANDIDATES if os.path.exists(p)), None)
if _data_path is None:
    raise FileNotFoundError(
        "NBA dataset CSV not found; looked in: " + ", ".join(_DATA_CANDIDATES)
    )
df = pd.read_csv(_data_path)

# The rest of the notebook reads exactly these two columns; check for them
# here so a wrong file is reported at load time rather than further down.
_missing_cols = {'text', 'polarity'} - set(df.columns)
if _missing_cols:
    raise KeyError(f"dataset is missing required column(s): {sorted(_missing_cols)}")

# 2. Stop-word list and text-cleaning helper.
# The stop words are stored without apostrophes ("arent", "dont", "hell" for
# "he'll", ...) because clean_text strips every non-letter character before
# the stop-word filter runs.
stop_words = set("a about above after again against all am an and any are arent as at be because been before being below between both but by couldnt did didnt do does doesnt doing dont down during each few for from further had hadnt has hasnt have havent having he hed hell hes her here heres hers herself him himself his how hows i id ill im ive if in into is isnt it its itself lets me more most mustnt my myself no nor not of off on once only or other ought our ours ourselves out over own same shant she shed shell shes should shouldnt so some such than that thats the their theirs them themselves then there theres these they theyd theyll theyre theyve this those through to too under until up very was wasnt we wed well were weve were werent what whats when whens where wheres which while who whos whom why whys with wont would wouldnt you youd youll youre youve your yours yourself yourselves".split())

# Compiled once. In a raw string a SINGLE backslash introduces a regex class
# (\S, \w, \s). A doubled backslash matches a literal backslash instead, which
# made the URL/mention patterns match nothing and made the last pattern delete
# all whitespace, gluing each tweet into one token.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def clean_text(text):
    """Normalise a tweet for TF-IDF: lowercase, strip noise, drop stop words.

    Steps, in order: lowercase; remove URLs (anything starting with "http" up
    to the next whitespace); remove @mentions; remove every character that is
    not a lowercase ASCII letter or whitespace; split on whitespace and drop
    tokens found in ``stop_words``.

    Args:
        text: the raw text. Non-string values are converted with ``str``;
            ``None`` and float NaN (a missing cell) yield an empty string
            rather than the literal tokens "none"/"nan".

    Returns:
        The remaining tokens joined by single spaces (possibly "").
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return ' '.join(t for t in text.split() if t not in stop_words)

# Store the cleaned version of every tweet in a new column; the raw 'text'
# column is left untouched.
df['clean_text'] = df['text'].apply(clean_text)

# 3. Derive a sentiment label from a polarity score.
def to_sentiment(p):
    """Map a polarity score to a sentiment label.

    Returns 'positive' when ``p`` is greater than zero, 'negative' when it is
    less than zero, and 'neutral' otherwise (zero, or a value such as NaN for
    which both comparisons are false).
    """
    if p > 0:
        label = 'positive'
    elif p < 0:
        label = 'negative'
    else:
        label = 'neutral'
    return label
# Label every row from its polarity score; this column is both the
# classification target and the stratification key used in the split below.
df['sentiment'] = df['polarity'].apply(to_sentiment)

# 4. Split into train and test sets.
# 20% of the rows are held out for testing, random_state fixes the shuffle so
# the split is repeatable, and the labels are passed as `stratify`.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['sentiment'],
    test_size=0.2, random_state=42, stratify=df['sentiment']
)

# 5. Sentiment model: TF-IDF + MultinomialNB.
# The vectorizer (unigrams and bigrams, at most 10,000 features) is a pipeline
# step, so it is fitted on X_train only, together with the classifier.
sent_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', MultinomialNB())
])
sent_pipeline.fit(X_train, y_train)

# 6. Evaluate on the held-out split, then save the fitted pipeline.
y_pred = sent_pipeline.predict(X_test)
# Printed headings: "sentiment classification report" / "confusion matrix".
print("=== 情感分类报告 ===")
print(classification_report(y_test, y_pred))
print("=== 混淆矩阵 ===")
print(confusion_matrix(y_test, y_pred))
# The whole pipeline (vectorizer + classifier) is written to the current
# working directory as one file.
joblib.dump(sent_pipeline, 'nba_sentiment_model.pkl')
print("情感模型已保存：nba_sentiment_model.pkl")

# 7. Build the TF-IDF index used for retrieval (cosine similarity).
# This is a second vectorizer, separate from the one inside sent_pipeline, and
# it is fitted on every row of df rather than on the training split only.
tfidf_vec = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf_matrix = tfidf_vec.fit_transform(df['clean_text'])
# Save the fitted vectorizer and the sparse document matrix to the current
# working directory.
joblib.dump(tfidf_vec, 'tfidf_vectorizer.pkl')
scipy.sparse.save_npz('tfidf_matrix.npz', tfidf_matrix)
print("TF-IDF 向量化器与矩阵已保存")

# 8. Retrieval helper.
def retrieve_tfidf(query, top_k=5):
    """Return the tweets most similar to ``query`` by TF-IDF cosine similarity.

    The query is cleaned with ``clean_text``, vectorised with the fitted
    ``tfidf_vec`` and compared against every row of ``tfidf_matrix``.

    Args:
        query: free-text search string.
        top_k: maximum number of results to return; values <= 0 yield [].

    Returns:
        A list of ``(raw_text, similarity)`` tuples in decreasing order of
        similarity. Rows whose similarity is not greater than zero are
        omitted, so the list may be shorter than ``top_k`` and is empty when
        the query shares no term with the indexed vocabulary.
    """
    if top_k <= 0:
        return []
    q_vec = tfidf_vec.transform([clean_text(query)])
    sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
    ranked = np.argsort(sims)[::-1][:top_k]
    # Index the column once instead of materialising a full row per hit.
    raw_texts = df['text']
    # A similarity of 0 means no overlap at all; returning such rows would
    # just surface whichever documents happen to sort last.
    return [(raw_texts.iloc[i], float(sims[i])) for i in ranked if sims[i] > 0]

# Retrieval demo for the query 'lakers vs warriors'.
# "\n" has to be a single-backslash escape to emit a blank line; the doubled
# backslash printed the two characters "\n" literally.
print("\n=== 检索示例：'lakers vs warriors' ===")
for text, sim in retrieve_tfidf("lakers vs warriors"):
    # str() guards against a non-string cell (e.g. a missing value) in 'text'.
    print(f"Sim={sim:.3f} | {str(text)[:100]}...")


=== 情感分类报告 ===
              precision    recall  f1-score   support

    negative       1.00      0.35      0.51      2302
     neutral       0.66      1.00      0.80      9649
    positive       1.00      0.55      0.71      7670

    accuracy                           0.75     19621
   macro avg       0.89      0.63      0.67     19621
weighted avg       0.83      0.75      0.73     19621

=== 混淆矩阵 ===
[[ 795 1507    0]
 [   0 9649    0]
 [   0 3464 4206]]
情感模型已保存：nba_sentiment_model.pkl
TF-IDF 向量化器与矩阵已保存
\n=== 检索示例：'lakers vs warriors' ===
Sim=0.000 | @NBA The @DetroitPistons could have had @spidadmitchell but passed on him for some lame player #detr...
Sim=0.000 | Alberts Saturday Slate  Rockets -5 🔨 Djokovich win US Open +125🔨 Central Arkansas-5 🔨 Cent Arkansas ...
Sim=0.000 | #Nba #HoustonRockets #jamesharden #NBAPlayoffs  #OneMission Make it 3-2 win. Go rockets. H-Town fore...
Sim=0.000 | I fully support everyone's right to protest against the issues they feel strongly about. T