# TextInsight — NLP Preprocessing Pipeline (Upgraded for 50/50)
This version is enhanced to meet the **Excelled** criteria across the rubric:

- **Cleaning & Tokenization:** comprehensive cleaner + robust tokenization
- **Stopwords & Normalization:** domain stopwords + **accurate POS tagging** and lemmatization
- **N-grams & Term Analysis:** bigrams/trigrams; split by **positive vs. negative** (if ratings exist); visualizations
- **Sentiment:** **VADER** with **grid-searched thresholds**, confusion matrix, precision/recall/F1
- **Evaluation:** **stepwise metrics** (tokens & vocab after each stage) with guidance for interpretation

**Mac paths preset**
- Dataset: `/Users/karlkurzius/Downloads/hotel_reviews.csv`
- Output CSV: `/Users/karlkurzius/Downloads/hotel_reviews_preprocessed.csv`


## 0) Setup — Paths & Imports

In [None]:
import os

# Input dataset, preprocessed-output CSV, and the directory figures are written to.
CSV_PATH = r"/Users/karlkurzius/Downloads/hotel_reviews.csv"
CSV_OUT  = r"/Users/karlkurzius/Downloads/hotel_reviews_preprocessed.csv"
FIG_DIR  = r"/Users/karlkurzius/Downloads/hotel_figs"

# Echo the configured locations so a wrong path is visible before anything else runs.
for _label, _location in (('CSV_PATH:', CSV_PATH), ('CSV_OUT :', CSV_OUT), ('FIG_DIR :', FIG_DIR)):
    print(_label, _location)

# Create the figure directory up front; an already-existing directory is fine.
os.makedirs(FIG_DIR, exist_ok=True)

## 1) Text Cleaning & Tokenization *(Excelled)*

In [None]:
import os, re, html, unicodedata
import pandas as pd
from typing import List

# Stop early with a readable message when the dataset is not where CSV_PATH says.
assert os.path.exists(CSV_PATH), f'CSV not found: {CSV_PATH}'
df = pd.read_csv(CSV_PATH)

# Column names accepted as the review text / the rating, matched exactly.
TEXT_CANDS   = ['review','text','review_text','content','Review','Text','Body','message']
RATING_CANDS = ['rating','stars','score','overall','Rating','Stars','Score']

# Text column: the first column (in file order) whose name is a known candidate.
text_col = None
for _col in df.columns:
    if _col in TEXT_CANDS:
        text_col = _col
        break
if text_col is None:
    # No known name: fall back to the first object-dtype column, and to the
    # very first column when there is none.
    text_col = df.columns[0]
    for _col in df.columns:
        if df[_col].dtype == 'object':
            text_col = _col
            break

# Rating column: the first column whose name is a known candidate, else None.
rating_col = None
for _col in df.columns:
    if _col in RATING_CANDS:
        rating_col = _col
        break

print('Detected text_col:', text_col)
print('Detected rating_col:', rating_col)

URL_RE      = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
HTML_TAG_RE = re.compile(r"<[^>]+>")
EMAIL_RE    = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b")
MENTION_RE  = re.compile(r"@\w+")
HASHTAG_RE  = re.compile(r"#\w+")
NON_ALPHA_RE= re.compile(r"[^a-zA-Z']+")  # keep apostrophes
# Typographic stand-ins for the apostrophe (‘ ’ ʼ ´ `), folded to a plain "'"
# so contractions such as "don’t" survive as one token.
_APOSTROPHE_RE = re.compile("[\u2018\u2019\u02bc\u00b4\u0060]")
# Apostrophes that are not between two letters (quote marks, trailing possessives).
_EDGE_APOSTROPHE_RE = re.compile(r"(?<![a-z])'+|'+(?![a-z])")


def clean_text(s: str) -> str:
    """Normalize one raw review into lower-case ASCII words separated by single spaces.

    Steps, in order: decode HTML entities; drop HTML tags, URLs, e-mail
    addresses, @mentions and #hashtags; fold typographic apostrophes and
    accented letters to ASCII; lower-case; replace every remaining character
    that is not a letter or an in-word apostrophe with a space; collapse
    whitespace.

    Args:
        s: Raw review text. Any non-string value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned text, or '' for non-string input.
    """
    if not isinstance(s, str):
        return ''
    s = html.unescape(s.strip())
    # Tags go first: a URL inside an attribute (<a href="http://x">text</a>)
    # would otherwise let URL_RE swallow the closing '>' and the link text.
    s = HTML_TAG_RE.sub(' ', s)
    s = URL_RE.sub(' ', s)
    s = EMAIL_RE.sub(' ', s)   # before MENTION_RE, which would match the '@domain' part
    s = MENTION_RE.sub(' ', s)
    s = HASHTAG_RE.sub(' ', s)
    s = _APOSTROPHE_RE.sub("'", s)
    # Decompose accented letters and drop the combining marks so "café" becomes
    # "cafe" instead of being cut to "caf" by the ASCII-only filter below.
    s = unicodedata.normalize('NFKD', s)
    s = ''.join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower()
    s = NON_ALPHA_RE.sub(' ', s)
    # Only in-word apostrophes (don't, o'clock) are kept; "'great'" -> "great".
    s = _EDGE_APOSTROPHE_RE.sub(' ', s)
    return ' '.join(s.split())

def tokenize(s: str) -> List[str]:
    """Split already-cleaned text on whitespace; non-string input yields no tokens."""
    # str.split() with no separator never produces empty strings.
    return s.split() if isinstance(s, str) else []

# Clean every review once, then tokenize the cleaned text.
_cleaned_reviews = df[text_col].apply(clean_text)
df['clean'] = _cleaned_reviews
df['tokens_raw'] = _cleaned_reviews.apply(tokenize)
# Preview: original text next to its cleaned and tokenized forms.
df[[text_col, 'clean', 'tokens_raw']].head()

## 2) Stopword Removal & **Accurate POS Lemmatization** *(Excelled)*

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Domain-specific words removed in addition to the generic English stopwords.
domain_stop = {
    'hotel', 'room', 'rooms', 'stay', 'stayed', 'staying',
    'night', 'nights', 'day', 'days', 'staff', 'place', 'location',
    'got', 'get', 'going', 'would', 'could', 'also', 'one', 'us',
}
# Full stopword set: scikit-learn's English list plus the domain terms above.
STOPWORDS = domain_stop.union(ENGLISH_STOP_WORDS)

# --- POS Tagging & Lemmatization Stack ---
# USE_SPACY becomes True only when both the spaCy package and the
# 'en_core_web_sm' model load; any failure leaves it False.
USE_SPACY = False
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
except Exception:
    # Package or model unavailable.
    # Uncomment in your local run if model missing:
    # %pip install -q spacy && python -m spacy download en_core_web_sm
    pass
else:
    USE_SPACY = True

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet as wn
lemmatizer = WordNetLemmatizer()

# Check the NLTK data used by the non-spaCy path in normalize_with_pos. A
# missing resource is not fatal here, but it is reported: without the warning
# the pipeline would quietly produce stems or unlemmatized tokens instead of
# POS-aware lemmas.
import warnings

_NLTK_REQUIREMENTS = (
    # (purpose, resource paths of which any one suffices, package to download)
    # NOTE(review): recent NLTK releases appear to ship the tagger under the
    # '_eng' name, so either spelling is accepted here — confirm against the
    # installed version.
    ('POS tagger',
     ('taggers/averaged_perceptron_tagger_eng', 'taggers/averaged_perceptron_tagger'),
     'averaged_perceptron_tagger_eng'),
    ('WordNet lemmatizer data', ('corpora/wordnet',), 'wordnet'),
)
for _purpose, _paths, _package in _NLTK_REQUIREMENTS:
    _found = False
    for _path in _paths:
        try:
            nltk.data.find(_path)
        except LookupError:
            continue
        _found = True
        break
    if not _found:
        warnings.warn(
            f"NLTK resource for the {_purpose} not found; lemmatization quality "
            f"will degrade. Install it with nltk.download('{_package}').",
            stacklevel=2,
        )

def penn_to_wn(tag):
    """Translate a Penn Treebank POS tag into the WordNet POS constant.

    Tags beginning with J, V, N or R map to adjective, verb, noun or adverb;
    every other tag falls back to noun.
    """
    prefix_map = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for prefix, wn_pos in prefix_map:
        if tag.startswith(prefix):
            return wn_pos
    return wn.NOUN

def normalize_with_pos(tokens):
    """Lemmatize a token list using POS tags and drop stopwords.

    The tagger sees the complete token sequence, because tagging a sentence
    with its function words already removed strips the context it relies on.
    Stopwords are filtered afterwards, on both the surface form and the lemma,
    so inflected forms of a stopword (e.g. "hotels" -> "hotel") are removed too.

    Backend: spaCy when USE_SPACY is set, otherwise NLTK's pos_tag with the
    WordNet lemmatizer. If the NLTK path fails as a whole, the result is the
    Porter stems of the stopword-filtered tokens.

    Args:
        tokens: Iterable of lower-cased word strings for one document.

    Returns:
        List of normalized tokens with stopwords and one-character tokens removed.
    """
    tokens = list(tokens)
    if not tokens:
        return []

    def _keep(surface, lemma):
        # Keep a token only if neither its surface form nor its lemma is a stopword.
        return (
            len(surface) > 1
            and bool(lemma.strip())
            and surface not in STOPWORDS
            and lemma not in STOPWORDS
        )

    if USE_SPACY:
        # Tokens are already split, so build the Doc directly and run each
        # pipeline component over it (tagger, lemmatizer, ...).
        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        for _, proc in nlp.pipeline:
            doc = proc(doc)
        return [t.lemma_ for t in doc if _keep(t.text, t.lemma_)]

    try:
        from nltk import pos_tag
        lemmas = []
        for word, tag in pos_tag(tokens):
            wn_tag = penn_to_wn(tag)
            try:
                lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            except Exception:
                # Best effort per word: keep the surface form if lemmatization fails.
                lemma = word
            if _keep(word, lemma):
                lemmas.append(lemma)
        return lemmas
    except Exception:
        # Fallback: Porter stemming (should be rare with proper setup)
        ps = PorterStemmer()
        return [ps.stem(w) for w in tokens if w not in STOPWORDS and len(w) > 1]

# Normalize every review's raw tokens (stopword removal + lemmatization).
_normalized_tokens = df['tokens_raw'].apply(normalize_with_pos)
df['tokens_norm'] = _normalized_tokens
# Preview raw vs. normalized tokens side by side.
df[['tokens_raw', 'tokens_norm']].head(8)

## 3) N-grams & Term Analysis *(Excelled)*

In [None]:
from collections import Counter
def make_ngrams(tokens, n=2):
    """Return the contiguous n-grams of tokens, each joined with '_'.

    A token list shorter than n yields an empty list.
    """
    if len(tokens) < n:
        return []
    grams = []
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        grams.append('_'.join(window))
    return grams
def _top_terms(column, k=20):
    """Return the k most frequent items across all lists in column."""
    tally = Counter(term for terms in column for term in terms)
    return tally.most_common(k)


# Per-review bigrams and trigrams built from the normalized tokens.
df['bigrams']  = df['tokens_norm'].apply(lambda toks: make_ngrams(toks, 2))
df['trigrams'] = df['tokens_norm'].apply(lambda toks: make_ngrams(toks, 3))

# Corpus-wide top-20 lists for each n-gram order.
uni_all = _top_terms(df['tokens_norm'])
bi_all  = _top_terms(df['bigrams'])
tri_all = _top_terms(df['trigrams'])
uni_all[:10], bi_all[:10], tri_all[:10]

In [None]:
import matplotlib.pyplot as plt, os
def plot_freq(items, title, outname):
    """Save a bar chart of (label, count) pairs as FIG_DIR/outname.

    Does nothing when items is empty; otherwise prints the path written.
    """
    if not items:
        return
    labels = [label for label, _ in items]
    counts = [count for _, count in items]
    positions = range(len(labels))

    plt.figure()
    plt.bar(positions, counts)
    plt.xticks(positions, labels, rotation=90)
    plt.title(title)
    plt.tight_layout()

    path = os.path.join(FIG_DIR, outname)
    plt.savefig(path)
    plt.close()
    print('Saved:', path)


# One chart per n-gram order over the whole corpus.
for _items, _title, _outname in (
    (uni_all, 'Top 20 Unigrams — All', 'uni_all.png'),
    (bi_all,  'Top 20 Bigrams — All', 'bi_all.png'),
    (tri_all, 'Top 20 Trigrams — All', 'tri_all.png'),
):
    plot_freq(_items, _title, _outname)

In [None]:
import pandas as pd
def label_from_rating(x):
    """Map a numeric rating to a sentiment label.

    Args:
        x: Rating value; anything float() accepts (numbers, numeric strings).

    Returns:
        'positive' for ratings >= 4, 'negative' for ratings <= 2, 'neutral'
        in between, and pd.NA when the rating is missing or not numeric.
    """
    try:
        x = float(x)
    except (TypeError, ValueError, OverflowError):
        return pd.NA
    # float() accepts NaN, and NaN fails both comparisons below; without this
    # check a missing rating would be labelled 'neutral'.
    if pd.isna(x):
        return pd.NA
    if x >= 4:
        return 'positive'
    if x <= 2:
        return 'negative'
    return 'neutral'
if rating_col is None:
    print('rating_col not found — skipping pos/neg n-gram split.')
else:
    # Derive a ground-truth label from the rating, then chart the most common
    # unigrams separately for positive and negative reviews.
    df['sentiment_true'] = df[rating_col].apply(label_from_rating)

    pos = df[df['sentiment_true'] == 'positive']
    neg = df[df['sentiment_true'] == 'negative']

    pos_uni = Counter(w for toks in pos['tokens_norm'] for w in toks).most_common(20)
    neg_uni = Counter(w for toks in neg['tokens_norm'] for w in toks).most_common(20)

    plot_freq(pos_uni, 'Top 20 Unigrams — Positive (by rating)', 'pos_uni.png')
    plot_freq(neg_uni, 'Top 20 Unigrams — Negative (by rating)', 'neg_uni.png')

## 4) Sentiment with **VADER + Threshold Tuning** *(Excelled)*

In [None]:
def try_vader_predict(texts):
    """Score texts with VADER if it can be used.

    Returns:
        (compound_scores, 'vader') on success, or (None, None) when importing
        or running VADER raises.
    """
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        analyzer = SentimentIntensityAnalyzer()
        compounds = []
        for text in texts:
            compounds.append(analyzer.polarity_scores(text)['compound'])
    except Exception:
        return None, None
    return compounds, 'vader'
# Cleaned text (lower-cased, punctuation stripped) feeds the lexicon fallback below.
texts = df['clean'].fillna('').tolist()
# VADER treats capitalisation and punctuation as intensity cues, and clean_text
# removes both, so VADER is scored on the original review text instead.
# Non-string cells are treated as empty, as clean_text does.
raw_texts = [t if isinstance(t, str) else '' for t in df[text_col]]
scores, method = try_vader_predict(raw_texts)
if scores is None:
    # VADER unavailable: fall back to a small hand-written polarity lexicon.
    POS = {'good','great','excellent','amazing','clean','friendly','love','wonderful','comfortable','spacious','nice','perfect'}
    NEG = {'bad','terrible','awful','dirty','rude','hate','noisy','broken','smelly','worst','uncomfortable','poor'}

    def simple_polarity(s):
        """Return (positives - negatives) / (positives + negatives), or 0 with no hits."""
        toks = s.split()
        p = sum(1 for w in toks if w in POS)
        n = sum(1 for w in toks if w in NEG)
        return (p - n) / max(1, (p + n))

    scores = [simple_polarity(s) for s in texts]
    method = 'simple_lexicon'
df['compound'] = scores
print('Sentiment method:', method)

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
def classify_from_compound(c, t_neg=-0.05, t_pos=0.05):
    """Turn a compound score into a label using inclusive thresholds.

    Scores at or above t_pos are 'positive', at or below t_neg 'negative',
    and everything else 'neutral'. The positive test is applied first.
    """
    label = 'neutral'
    if c >= t_pos:
        label = 'positive'
    elif c <= t_neg:
        label = 'negative'
    return label
def grid_search_thresholds(y_true, scores):
    """Grid-search the negative/positive cut-offs that maximize macro F1.

    Args:
        y_true: pandas Series of true labels; missing entries are ignored.
        scores: Compound scores aligned with y_true — a Series sharing its
            index, or any sequence in the same row order.

    Returns:
        (t_neg, t_pos, macro_f1) for the best pair found, or
        (-0.05, 0.05, None) when y_true has no labels at all. The F1 is the
        in-sample score on the data passed in, so it is an optimistic
        estimate if the same rows are used for the final evaluation.
    """
    from sklearn.metrics import f1_score

    if y_true.isna().all():
        return (-0.05, 0.05, None)

    # Use the scores the caller passed in rather than reaching for the global
    # DataFrame, so the function works on any labels/scores pair.
    if not isinstance(scores, pd.Series):
        scores = pd.Series(list(scores), index=y_true.index)
    labelled = y_true.notna()
    y_true_f = y_true[labelled]
    s = scores[labelled]

    candidates_pos = np.linspace(0.05, 0.5, 10)
    candidates_neg = np.linspace(-0.5, -0.05, 10)
    best = (-0.05, 0.05, -1.0)
    for tp in candidates_pos:
        for tn in candidates_neg:
            y_pred = s.apply(lambda c: classify_from_compound(c, tn, tp))
            f1 = f1_score(y_true_f, y_pred, average='macro')
            if f1 > best[2]:
                # Plain floats keep the result independent of NumPy scalar types.
                best = (float(tn), float(tp), float(f1))
    return best
if 'sentiment_true' not in df.columns and rating_col is not None:
    from pandas import NA

    def _rating_to_sentiment(x):
        """Label a rating; missing or non-numeric values become NA."""
        # The missing-value check must come before any comparison: calling
        # float() first raises on None and on non-numeric strings.
        try:
            value = float(x)
        except (TypeError, ValueError, OverflowError):
            return NA
        if pd.isna(value):
            return NA
        if value >= 4:
            return 'positive'
        if value <= 2:
            return 'negative'
        return 'neutral'

    df['sentiment_true'] = df[rating_col].apply(_rating_to_sentiment)

if 'sentiment_true' in df.columns and df['sentiment_true'].notna().any():
    # Labels available: tune the thresholds against them.
    tneg, tpos, best_f1 = grid_search_thresholds(df['sentiment_true'], df['compound'])
    print('Tuned thresholds:', tneg, tpos, 'macroF1=', round(best_f1,4) if best_f1 is not None else None)
else:
    # No labels: use the same defaults as classify_from_compound.
    tneg, tpos = -0.05, 0.05

df['sentiment_pred'] = df['compound'].apply(lambda c: classify_from_compound(c, tneg, tpos))

if 'sentiment_true' in df.columns and df['sentiment_true'].notna().any():
    # Evaluate only on rows that actually have a rating-derived label.
    mask = df['sentiment_true'].notna()
    y_true = df.loc[mask,'sentiment_true']
    y_pred = df.loc[mask,'sentiment_pred']
    print('\nClassification Report (tuned thresholds)')
    print(classification_report(y_true, y_pred, digits=3))
    print('Confusion Matrix')
    # Fixed label order so rows/columns read negative, neutral, positive.
    print(confusion_matrix(y_true, y_pred, labels=['negative','neutral','positive']))
else:
    print('No rating-derived labels available for evaluation.')

## 5) **Stepwise** Pipeline Evaluation *(Excelled)*

In [None]:
import pandas as pd
def flatten(col):
    """Yield every element of every inner iterable in col, in order."""
    for inner in col:
        yield from inner
# Metrics at each stage: total token count and vocabulary (distinct token) size.
# Vocabularies are built by streaming tokens into a set; concatenating the
# per-review lists with sum(..., []) copies the growing list on every step and
# is quadratic in the number of reviews.
_clean_split = df['clean'].str.split()
stages = []
stages.append({'stage':'clean','tokens':_clean_split.map(len).sum(), 'vocab':len(set(flatten(_clean_split)))})
stages.append({'stage':'tokens_raw','tokens':sum(len(t) for t in df['tokens_raw']), 'vocab':len(set(flatten(df['tokens_raw'])))})
stages.append({'stage':'tokens_norm','tokens':sum(len(t) for t in df['tokens_norm']), 'vocab':len(set(flatten(df['tokens_norm'])))})
step_df = pd.DataFrame(stages)
# Change relative to the previous stage (NaN for the first row).
step_df['tokens_delta_from_prev'] = step_df['tokens'].diff()
step_df['vocab_delta_from_prev']  = step_df['vocab'].diff()
step_df

## 6) Save Outputs

In [None]:
# Write the full DataFrame (without the index column), then report both output locations.
df.to_csv(CSV_OUT, index=False)
for _message, _target in (
    ('Saved preprocessed CSV to:', CSV_OUT),
    ('Figures saved under:', FIG_DIR),
):
    print(_message, _target)

---
### Interpretation Prompts (include in your write-up)
- Which **negative** unigrams/trigrams are most frequent? Do they cluster around cleanliness, noise, staff behavior, or location?
- Compare **tuned** vs **default** thresholds: did the confusion matrix show improved recall for negatives?
- Which pipeline step yielded the largest reduction in **tokens**, and which the largest reduction in **vocabulary**? Why do the two differ?
- Any domain stopwords to add (e.g., brand names, boilerplate)?