# 🧠 NLP Midterm Study Notebook
This guided notebook will help you review key concepts from preprocessing, embeddings, supervised learning, and LDA.

## 📌 Section 1: Preprocessing
Review how text is cleaned before modeling.

In [None]:
# Task: Clean the text: lowercase it, strip out URLs and @usernames, and drop non-alphabetic characters
import re

def preprocess(text):
    """Normalize raw, tweet-like text before modeling.

    Steps, in order: lowercase the text, replace URLs with ``<URL>``,
    replace ``@mentions`` with ``<USER>``, drop every character that is not
    a lowercase letter or whitespace, and shorten any run of three or more
    identical characters to two.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with the ``<URL>`` / ``<USER>`` placeholders
        left intact.
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", "<URL>", text)
    text = re.sub(r"@\w+", "<USER>", text)
    # The character filter must skip the placeholders inserted above;
    # otherwise "<", ">" and the uppercase letters are stripped and the
    # tokens vanish. The text is already lowercase, so an uppercase
    # "<URL>" / "<USER>" here can only be one of our own placeholders.
    text = re.sub(
        r"(<URL>|<USER>)|[^a-z\s]",
        lambda match: match.group(1) or "",
        text,
    )
    # Collapse runs of 3+ identical characters to two ("sooo" -> "soo").
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    return text

# Quick demo: run the cleaner on a tweet-like string and show the result
sample_text = (
    "Sooo excited!!! Check this out: "
    "https://example.com @user123"
)
print(preprocess(sample_text))

## 📌 Section 2: Embeddings
Review sparse (TF-IDF) vs. dense (Word2Vec) embeddings.

In [None]:
# Task: Compare TF-IDF and Word2Vec embeddings
from sklearn.feature_extraction.text import TfidfVectorizer

# Tiny three-sentence corpus reused by the examples below
corpus = [
    "I love machine learning",
    "Text data is very interesting",
    "I love working with NLP",
]

# Sparse representation: fit the vectorizer and transform the corpus in one step
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(corpus)
print("TF-IDF shape:", X_tfidf.shape)

# Word2Vec (toy example)
# This trains a tiny gensim model on the corpus above; in practice, train on a much larger corpus or load pretrained embeddings

from gensim.models import Word2Vec
# Dense representation: lowercase and whitespace-tokenize each sentence,
# then train a small Word2Vec model on the result
tokens = [sentence.lower().split() for sentence in corpus]
w2v_model = Word2Vec(
    sentences=tokens,
    vector_size=50,
    window=3,
    min_count=1,
    workers=1,
)
print("Word2Vec vector for 'love':", w2v_model.wv["love"])

## 📌 Section 3: Supervised Learning
Train models using vectorized text.

In [None]:
# Task: Train a simple classifier on TF-IDF features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# One label per corpus sentence: 1 = positive, 0 = neutral
y = [1, 0, 1]

# Hold out roughly a third of the TF-IDF rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit the classifier on the training rows, then predict the held-out rows
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

## 📌 Section 4: Topic Modeling with LDA
Extract topics from documents.

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

# Lowercase + whitespace-tokenize every document, then map tokens to ids
tokenized_docs = [text.lower().split() for text in corpus]
dictionary = Dictionary(tokenized_docs)

# Bag-of-words encoding of each tokenized document
corpus_bow = [dictionary.doc2bow(words) for words in tokenized_docs]

# Fit a two-topic LDA model with a fixed seed and print each topic
lda_model = LdaModel(
    corpus=corpus_bow,
    id2word=dictionary,
    num_topics=2,
    random_state=0,
)
topics = lda_model.print_topics()
for topic in topics:
    print(topic)