In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from collections import Counter

from rapidfuzz import fuzz  # Optional for fuzzy matching; install if needed

In [3]:
# Load DataFrame
# A forward slash avoids the invalid '\d' escape sequence that 'data\data.csv'
# triggers (SyntaxWarning on import) and resolves on Windows, Linux and macOS.
df = pd.read_csv('data/data.csv')  # Replace with your file path
comments_col = 'comments'  # Your column name

# Initialize tools
stop_words = set(stopwords.words('english'))  # set gives O(1) membership checks per token
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one raw comment into a space-separated string of lemmas.

    Anything ``pd.isna`` reports as missing becomes ''. Otherwise the value is
    converted to ``str``, lowercased, stripped of every character that is
    neither a word character nor whitespace, and tokenized. Tokens that are in
    ``stop_words`` or have two or fewer characters are dropped; the remaining
    tokens are lemmatized and joined with single spaces.
    """
    if pd.isna(text):
        return ''

    # Lowercase first, then delete punctuation in a single regex pass
    cleaned = re.sub(r'[^\w\s]', '', str(text).lower())

    kept = []
    for word in word_tokenize(cleaned):
        # Skip stop words and very short tokens before paying for lemmatization
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Run every raw comment through preprocess_text and store the result as a new column
cleaned_column = df[comments_col].apply(preprocess_text)
df['processed_comments'] = cleaned_column
print(f"Preprocessed sample: {df['processed_comments'].iloc[0]}")

  df = pd.read_csv('data\data.csv')  # Replace with your file path


Preprocessed sample: great quality material complaint worth every penny test


In [4]:

# Function to extract bigrams from processed text
def extract_bigrams(text):
    """Return the list of adjacent token pairs (bigrams) found in ``text``.

    ``text`` is tokenized with ``word_tokenize``; each bigram is a 2-tuple of
    consecutive tokens. Text with fewer than two tokens yields an empty list.
    """
    words = word_tokenize(text)
    pair_iter = ngrams(words, 2)
    return [pair for pair in pair_iter]

# Attach the per-comment bigram list to each row
df['bigrams'] = df['processed_comments'].apply(extract_bigrams)

# Pool every row's bigrams into one flat list, then tally them (optional, for basic stats)
all_bigrams = []
for row_bigrams in df['bigrams']:
    all_bigrams.extend(row_bigrams)
bigram_freq = Counter(all_bigrams)
print("Top 5 frequent bigrams:", bigram_freq.most_common(5))

# TF-IDF over the preprocessed comments, fitted separately for single words and word pairs
corpus = df['processed_comments']

# Unigram model: at most 1000 features
vectorizer_uni = TfidfVectorizer(ngram_range=(1, 1), max_features=1000, stop_words='english')
tfidf_uni = vectorizer_uni.fit_transform(corpus)
feature_names_uni = vectorizer_uni.get_feature_names_out()

# Bigram model: at most 500 features
vectorizer_bi = TfidfVectorizer(ngram_range=(2, 2), max_features=500, stop_words='english')
tfidf_bi = vectorizer_bi.fit_transform(corpus)
feature_names_bi = vectorizer_bi.get_feature_names_out()

# Get top key terms/phrases (combine uni + bi, score > threshold)
def get_top_terms(tfidf_matrix, feature_names, top_n=50, threshold=0.1):
    """Rank features by their mean score down the rows of ``tfidf_matrix``.

    Args:
        tfidf_matrix: 2-D score matrix whose columns line up with
            ``feature_names``. May be a scipy sparse matrix, a scipy sparse
            array, or a dense NumPy array.
        feature_names: Sequence of labels indexed by column position.
        top_n: Maximum number of highest-scoring columns to consider.
            Values of 0 or less yield an empty list.
        threshold: A feature is kept only if its mean score is strictly
            greater than this value.

    Returns:
        List of ``(feature_name, mean_score)`` tuples, highest score first.
    """
    import numpy as np  # local import: the notebook's import cell does not bring numpy into scope

    # Guard explicitly: slicing with [-0:] would select *every* column, not none.
    if top_n <= 0:
        return []

    # mean(axis=0) returns np.matrix for sparse matrices and ndarray for dense
    # input or sparse arrays; asarray + ravel flattens all of them to 1-D.
    scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

    # Descending order, truncated to the top_n best columns
    ranked = scores.argsort()[::-1][:top_n]
    return [(feature_names[i], scores[i]) for i in ranked if scores[i] > threshold]

# Score single words and word pairs separately, each with the default cut-offs
top_unigrams = get_top_terms(tfidf_uni, feature_names_uni)
top_bigrams = get_top_terms(tfidf_bi, feature_names_bi)

# Keep only the term text: unigrams first, then bigrams
key_terms = [label for label, _score in (*top_unigrams, *top_bigrams)]
print("Sample key terms/phrases:", key_terms[:10])

# Optional: Save to DataFrame for UI
# Match whole tokens only: processed comments are space-joined tokens, so padding
# both sides with a space stops 'test' from matching inside 'latest' or 'contest'.
df['key_phrases'] = df['processed_comments'].apply(
    lambda x: [phrase for phrase in key_terms if f' {phrase} ' in f' {x} ']
)

Top 5 frequent bigrams: [(('test', 'plus'), 16), (('plus', 'color'), 16), (('color', 'option'), 16), (('option', 'fantastic'), 16), (('billing', 'error'), 11)]
Sample key terms/phrases: ['test']


In [6]:
def filter_by_term(df, term, column=comments_col, fuzzy_threshold=80, use_fuzzy=False):
    """Return the rows of ``df`` whose ``column`` text mentions ``term``.

    Args:
        df: DataFrame to filter; it is not modified.
        term: Text to look for. Matched literally (not as a regular
            expression) and case-insensitively.
        column: Name of the text column to search.
        fuzzy_threshold: Minimum ``fuzz.partial_ratio`` score (0-100) for a
            fuzzy hit. Only consulted when ``use_fuzzy`` is True.
        use_fuzzy: When True, rows reaching ``fuzzy_threshold`` are returned
            in addition to the literal matches.

    Returns:
        The matching subset of ``df``. Missing values in ``column`` never match.
    """
    term = str(term)

    # regex=False: a term such as 'c++' or '(test' must be searched for as
    # plain text instead of raising or over-matching as a regex pattern.
    mask = df[column].str.contains(term, case=False, na=False, regex=False)

    if use_fuzzy:
        needle = term.lower()
        # Lowercase both sides so the fuzzy path is case-insensitive like the literal path
        fuzzy_mask = df[column].apply(
            lambda value: bool(
                pd.notna(value)
                and fuzz.partial_ratio(str(value).lower(), needle) >= fuzzy_threshold
            )
        )
        mask = mask | fuzzy_mask.astype(bool)

    return df[mask]

# Test
# key_terms can be empty when no term clears the TF-IDF threshold, so guard
# the [0] lookup instead of letting the cell die with an IndexError.
if key_terms:
    sample_term = key_terms[0]
    filtered_df = filter_by_term(df, sample_term)
    print(f"Rows matching '{sample_term}': {len(filtered_df)}")
else:
    print("No key terms passed the TF-IDF threshold; nothing to filter on.")

Rows matching 'test': 90


In [8]:
# Preview the first rows, including the derived processed_comments, bigrams and key_phrases columns
df.head()

Unnamed: 0,id,date,rating,comments,processed_comments,bigrams,key_phrases
0,1,2024-06-05,5,"Great quality materials. No complaints here, w...",great quality material complaint worth every p...,"[(great, quality), (quality, material), (mater...",[test]
1,2,2024-09-14,5,Amazing build quality! It feels premium and wo...,amazing build quality feel premium work flawle...,"[(amazing, build), (build, quality), (quality,...",[test]
2,3,2024-07-20,3,,,[],[]
3,4,2024-08-09,5,Outstanding value. Five stars all the way. (Te...,outstanding value five star way test plus colo...,"[(outstanding, value), (value, five), (five, s...",[test]
4,5,2024-10-19,2,Took forever to get a response from support. F...,took forever get response support frustrating ...,"[(took, forever), (forever, get), (get, respon...",[test]
