<a href="https://colab.research.google.com/github/kurosakiichig/SW-mid/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!ls -l


total 35248
-rw-r--r-- 1 root root 33597927 Apr 25 03:46 'NBADataset - 12-07-2020 till 19-09-2020.csv'
-rw-r--r-- 1 root root  1216623 Apr 25 04:36  ngram_matrix.npz
-rw-r--r-- 1 root root   373789 Apr 25 04:36  ngram_vectorizer.pkl
drwxr-xr-x 1 root root     4096 Apr 23 13:39  sample_data
-rw-r--r-- 1 root root   886297 Apr 25 04:36  sentiment_tfidf_nb.pkl


In [14]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import scipy.sparse

# 1. Load and preprocess data
df = pd.read_csv('/content/NBADataset - 12-07-2020 till 19-09-2020.csv')
STOP = set(
    "a about above after again against all am an and any are arent as at be "
    "because been before being below between both but by couldnt did didnt do "
    "does doesnt doing dont down during each few for from further had hadnt has hasnt "
    "have havent having he hed hell hes her here heres hers herself him himself his how "
    "hows i id ill im ive if in into is isnt it its itself lets me more most mustnt "
    "my myself no nor not of off on once only or other ought our ours ourselves out over "
    "own same shant she shed shell shes should shouldnt so some such than that thats the "
    "their theirs them themselves then there theres these they theyd theyll theyre theyve "
    "this those through to too under until up very was wasnt we wed well were weve were werent "
    "what whats when whens where wheres which while who whos whom why whys with wont would wouldnt "
    "you youd youll youre youve your yours yourself yourselves"
)

def clean(text):
    t = re.sub(r'https?://\S+|@\w+|[^a-zA-Z\s]', '', str(text)).lower()
    return ' '.join(w for w in t.split() if w not in STOP)

# Clean text
df['clean_text'] = df['text'].apply(clean)

# Convert polarity into 3 sentiment classes
df['sentiment'] = df['polarity'].apply(
    lambda p: 'positive' if p > 0 else 'negative' if p < 0 else 'neutral'
)

# 2. Sentiment analysis: TF-IDF + Naive Bayes
X = df['clean_text']
y = df['sentiment']
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model_sent = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('nb', MultinomialNB())
])
model_sent.fit(X_tr, y_tr)
print("=== Sentiment Analysis (TF-IDF + Naive Bayes) ===")
print(classification_report(y_te, model_sent.predict(X_te)))
print(confusion_matrix(y_te, model_sent.predict(X_te)))
joblib.dump(model_sent, 'sentiment_tfidf_nb.pkl')

# 3. Text Retrieval: Pure N-gram Matching
vec_ng = CountVectorizer(max_features=10000, ngram_range=(2,3))
gram_mat = vec_ng.fit_transform(df['clean_text'])
joblib.dump(vec_ng, 'ngram_vectorizer.pkl')
scipy.sparse.save_npz('ngram_matrix.npz', gram_mat)

def retrieve(query, top_k=5):
    q = clean(query)
    qv = vec_ng.transform([q])
    sims = cosine_similarity(qv, gram_mat).flatten()
    idx = np.argsort(sims)[::-1][:top_k]
    results = []
    for i in idx:
        results.append((df.iloc[i]['text'], float(sims[i])))
    return results

print("\n=== Retrieval Example (N-gram + Cosine Similarity) ===")
for txt, sc in retrieve("lakers vs warriors", top_k=5):
    print(f"{sc:.3f} | {txt[:80]}...")

# 4. Input Examples: Sentiment Classification & Retrieval
sample_texts = [
    "The Lakers are unstoppable tonight!",
    "I can’t believe how bad that refereeing was.",
    "Neutral comment about the game.",
    "Amazing comeback by the Heat last night!",
    "The defense was solid, but the offense was lacking.",
]
print("\n=== Sentiment Classification Examples ===")
for t in sample_texts:
    print(f"Text: {t} -> Sentiment: {model_sent.predict([clean(t)])[0]}")

sample_queries = [
    "Warriors championship game analysis",
    "Raptors playoff performance",
    "Lakers versus Suns highlights",
    "Heat season preview",
]
print("\n=== Retrieval Example Queries ===")
for q in sample_queries:
    print(f"Query: {q}")
    for txt, sc in retrieve(q, top_k=3):
        print(f"  {sc:.3f} | {txt[:60]}...")
    print()


=== Sentiment Analysis (TF-IDF + Naive Bayes) ===
              precision    recall  f1-score   support

    negative       0.70      0.62      0.66      2302
     neutral       0.91      0.84      0.87      9649
    positive       0.79      0.89      0.83      7670

    accuracy                           0.83     19621
   macro avg       0.80      0.78      0.79     19621
weighted avg       0.84      0.83      0.83     19621

[[1421  251  630]
 [ 310 8126 1213]
 [ 287  576 6807]]

=== Retrieval Example (N-gram + Cosine Similarity) ===
1.000 | Lakers vs Thunder lice now! Busby’s West #busbys #busbyswest #lakers #oklahomaci...
0.943 | #NBAPlayoffs Calendario  #LosAngelesLakers  J1 8/18 Lakers vs Blazers 8pm J2 8/2...
0.894 | @barstoolsports 2007 Phoenix Suns vs. Spurs series.   Lakers vs Kings Lakers vs ...
0.707 | Lakers vs Jazz 6pm Busby’s West #lakers #utahjazz #losangeleslakers #busbys #bus...
0.707 | Enjoying this game Lakers vs Raptors #lakers #TorontoRaptors...

=== Sentiment Cla