# Text Feature Preprocessing for EDA

In [1]:
# Import necessary libraries
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

In [2]:
# Download the NLTK resources used in this notebook.
# word_tokenize (used by preprocess_text) needs the Punkt tokenizer models;
# they are published as "punkt" and, for newer NLTK releases, "punkt_tab".
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("vader_lexicon", quiet=True)
nltk.download("wordnet", quiet=True)

True

In [3]:
# Column-name groups reused throughout the notebook
# Raw text columns read from the listings file and melted into long format
text_cols = [
    "item_name",
    "description",
    "hashtags",
]
# Per-row text statistics computed in the "Text Statistics" section
stats = [
    "char_count",
    "word_count",
    "avg_word_length",
]
# N-gram columns produced in the "N-grams" section
n_grams = [
    "unigrams",
    "bigrams",
    "trigrams",
]

In [4]:
# Read only the text columns plus the target (sold_price), capped at the first 10,000 rows
df = pd.read_csv(
    "../data/raw/sold_listings.csv",
    usecols=[*text_cols, "sold_price"],
    nrows=10000,
)

In [5]:
# In the 'hashtags' column, swap the literal placeholder string "missing" for an empty string
df["hashtags"] = df["hashtags"].replace({"missing": ""})

In [6]:
# Lower (25%) and upper (75%) quartiles of sold_price, used later to keep only the price extremes
q_low, q_high = df["sold_price"].quantile([0.25, 0.75]).tolist()

In [7]:
# Long format: one row per (listing, text column) pair, with the column name in
# 'text_feature' and its content in 'text' (for visualization purposes)
df_long = pd.melt(
    df,
    id_vars=["sold_price"],
    value_vars=text_cols,
    var_name="text_feature",
    value_name="text",
)

## Text Statistics

In [8]:
# Function to calculate average word length


def avg_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Args:
        text: String to measure. Non-string values (for example a NaN cell)
            are treated as containing no words.

    Returns:
        float: Average number of characters per word, or ``0.0`` when
        ``text`` has no words.
    """
    if not isinstance(text, str):
        return 0.0
    word_lengths = [len(word) for word in text.split()]
    if not word_lengths:
        return 0.0
    # float() keeps the return type uniform instead of mixing np.float64 and int
    return float(np.mean(word_lengths))

In [9]:
# Create a new DataFrame to store text statistics for each text feature.
# .copy() makes it independent of df_long, so adding the statistic columns
# to it does not raise pandas' SettingWithCopyWarning.
df_stats = df_long[["text_feature", "sold_price"]].copy()

In [10]:
# Per-row statistics of the raw text: characters, whitespace-separated words, mean word length
df_stats["char_count"] = df_long["text"].map(len)
df_stats["word_count"] = df_long["text"].map(lambda value: len(str.split(value)))
df_stats["avg_word_length"] = df_long["text"].map(avg_word_length)

In [11]:
# Preview the first five rows of the text statistics
df_stats.head(n=5)

Unnamed: 0,text_feature,sold_price,char_count,word_count,avg_word_length
0,item_name,60,42,5,7.6
1,item_name,327,45,8,4.75
2,item_name,135,29,5,5.0
3,item_name,56,59,9,5.666667
4,item_name,60,30,5,5.2


In [12]:
# Long format: one row per (text row, statistic), with the statistic name in
# 'stat' and its number in the default 'value' column
df_stats_long = pd.melt(
    df_stats,
    id_vars=["text_feature", "sold_price"],
    value_vars=stats,
    var_name="stat",
)

In [13]:
# Preview the first five rows of the reshaped text statistics
df_stats_long.head(n=5)

Unnamed: 0,text_feature,sold_price,stat,value
0,item_name,60,char_count,42.0
1,item_name,327,char_count,45.0
2,item_name,135,char_count,29.0
3,item_name,56,char_count,59.0
4,item_name,60,char_count,30.0


In [14]:
# Write the long-format text statistics to CSV, leaving out the DataFrame index
df_stats_long.to_csv(
    path_or_buf="../data/preprocessed/preprocessed_text_stats_10k.csv",
    index=False,
)

## N-grams

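Before extracting n-grams, we preprocess the text features using the `preprocess_text` function, which performs the following steps:
- Converts text to lowercase and tokenizes it
- Lemmatizes words
- Removes stop words and non-alphabetic tokens (e.g. punctuation and numbers)
- Returns the placeholder `missing` when the input is empty or nothing remains after cleaning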

In [15]:
# Shared resources for preprocess_text: a WordNet lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def preprocess_text(text, placeholder="missing"):
    """Lowercase, tokenize, filter and lemmatize ``text``.

    Tokens are kept only if they are purely alphabetic and not in
    ``stop_words``; each kept token is lemmatized and the results are joined
    with single spaces.

    Args:
        text: Raw text to clean.
        placeholder: Value returned when ``text`` is not a string, is blank,
            or has no tokens left after filtering.

    Returns:
        The cleaned, space-joined text, or ``placeholder``.
    """
    # Guard clause: nothing to clean for non-strings and blank strings
    if not (isinstance(text, str) and text.strip()):
        return placeholder

    kept_lemmas = []
    for token in word_tokenize(text.lower()):
        if token in stop_words or not token.isalpha():
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    # An empty join result falls back to the placeholder so the output is never empty
    return " ".join(kept_lemmas) or placeholder

In [16]:
# Clean every melted text value (item_name, description and hashtags rows alike)
df_long["cleaned_text"] = df_long["text"].map(preprocess_text)

In [17]:
# Start the n-grams frame as an independent copy of the identifying columns
df_ngrams = df_long.loc[:, ["text_feature", "sold_price"]].copy()

In [18]:
# Unigrams are the whitespace-split cleaned tokens; bigrams and trigrams are
# built from those token lists and stored as space-joined strings
df_ngrams["unigrams"] = df_long["cleaned_text"].str.split()
df_ngrams["bigrams"] = df_ngrams["unigrams"].map(
    lambda tokens: list(map(" ".join, ngrams(tokens, 2)))
)
df_ngrams["trigrams"] = df_ngrams["unigrams"].map(
    lambda tokens: list(map(" ".join, ngrams(tokens, 3)))
)

In [19]:
# Preview the first five rows of the n-grams
df_ngrams.head(n=5)

Unnamed: 0,text_feature,sold_price,unigrams,bigrams,trigrams
0,item_name,60,"[dangerous, warning, ifsixwasnine, long, sleeve]","[dangerous warning, warning ifsixwasnine, ifsi...","[dangerous warning ifsixwasnine, warning ifsix..."
1,item_name,327,"[black, rugged, twill, original, briefcase, new]","[black rugged, rugged twill, twill original, o...","[black rugged twill, rugged twill original, tw..."
2,item_name,135,"[bape, wgm, garment, dyed]","[bape wgm, wgm garment, garment dyed]","[bape wgm garment, wgm garment dyed]"
3,item_name,56,"[tommy, hilfiger, embroidery, logo, colour, bl...","[tommy hilfiger, hilfiger embroidery, embroide...","[tommy hilfiger embroidery, hilfiger embroider..."
4,item_name,60,"[minted, new, york, runclub, hoodie]","[minted new, new york, york runclub, runclub h...","[minted new york, new york runclub, york runcl..."


In [20]:
# Keep only listings priced strictly above the upper quartile or strictly below the lower quartile
df_ngrams = df_ngrams.loc[
    df_ngrams["sold_price"].gt(q_high) | df_ngrams["sold_price"].lt(q_low)
]

In [21]:
# Long format: first one row per (text row, n-gram type) holding the list of n-grams...
df_ngrams_long = pd.melt(
    df_ngrams,
    id_vars=["text_feature", "sold_price"],
    value_vars=n_grams,
    var_name="ngram",
)
# ...then one row per individual n-gram
df_ngrams_long = df_ngrams_long.explode("value")

In [22]:
# Discard rows that contain any missing value
df_ngrams_long = df_ngrams_long.dropna()

In [23]:
# Preview the first five rows of the reshaped n-grams data
df_ngrams_long.head(n=5)

Unnamed: 0,text_feature,sold_price,ngram,value
0,item_name,327,unigrams,black
0,item_name,327,unigrams,rugged
0,item_name,327,unigrams,twill
0,item_name,327,unigrams,original
0,item_name,327,unigrams,briefcase


In [24]:
# Write the long-format n-grams to CSV, leaving out the DataFrame index
df_ngrams_long.to_csv(
    path_or_buf="../data/preprocessed/preprocessed_text_ngrams_10k.csv",
    index=False,
)

## Sentiment Analysis

In [25]:
# Initialize the SentimentIntensityAnalyzer once; analyze_sentiment reads this
# module-level `sid` instance on every call
sid = SentimentIntensityAnalyzer()

In [26]:
# Function for calculating sentiment score using VADER


def analyze_sentiment(description):
    """Return the VADER "compound" polarity score for ``description``.

    Uses the module-level ``sid`` analyzer.
    """
    return sid.polarity_scores(description)["compound"]

In [27]:
# Filter data for the description text feature only.
# A single .loc selection followed by .copy() yields an independent frame, so
# the column assignment and in-place drop below do not raise pandas'
# SettingWithCopyWarning (which the chained df[mask][cols] form does).
df_sentiment = df_long.loc[
    df_long["text_feature"] == "description", ["cleaned_text", "sold_price"]
].copy()

# Apply sentiment analysis to the cleaned descriptions
# NOTE(review): cleaned_text has been lowercased and stripped of stop words and
# non-alphabetic tokens by preprocess_text. VADER is documented as using
# capitalization, punctuation and negation cues, so scoring the raw 'text'
# column may be more faithful -- confirm that scoring cleaned text is intended.
df_sentiment["description_sentiment"] = df_sentiment["cleaned_text"].apply(
    analyze_sentiment
)

# Drop the cleaned text column (no longer needed)
df_sentiment.drop(columns=["cleaned_text"], inplace=True)

In [28]:
# Preview the first five rows of the sentiment analysis results
df_sentiment.head(n=5)

Unnamed: 0,sold_price,description_sentiment
10000,60,0.6124
10001,327,0.0
10002,135,0.0
10003,56,0.7579
10004,60,-0.296


In [29]:
# Write the sentiment scores to CSV, leaving out the DataFrame index
df_sentiment.to_csv(
    path_or_buf="../data/preprocessed/preprocessed_text_sentiment_10k.csv",
    index=False,
)