# ML Text Classification

### Problem Statement


### Data
   * Dataset of product names ("Query" column) and the category label IDs ("Class_Label" column)

### Applications 
   * Search and Customer Insights domains

### Solution

   * Built a model that returns a reasonable recall number (micro & macro) on a test dataset with multiple class labels
   * An unseen test set was used to evaluate the model’s performance. That is not meant to be scoring function but instead an approach to calibrate the task.


In [1]:
import pandas as pd
import zipfile
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import numpy as np

In [2]:
##-- Unzip the file
with zipfile.ZipFile('data_train.tsv.zip', 'r') as z:
    z.extractall()

##-- Load data
df = pd.read_csv('data_train.tsv', sep='\t',names=["Query","Class_Label"]).dropna()

##-- Filter non-alphanumeric characters, numbers, symbols
df["Query_Filtered"] = df["Query"].apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', str(x)))

##-- If all rows start with 'ID_', then strip 'ID_' from Class_Label
if all(df['Class_Label'].str.startswith('ID_')):
    df['Class_Label'] = df['Class_Label'].str[3:]
    
df.drop('Query', axis=1, inplace=True)

df = df.sort_values(["Class_Label","Query_Filtered"]).reset_index()
df["Query_Filtered"] = df["Query_Filtered"].astype(str)
df["Class_Label"] = df["Class_Label"].astype(int)

df.head(20)

Unnamed: 0,index,Class_Label,Query_Filtered
0,2754172,101371589,
1,3670779,101371589,
2,2834048,101371589,
3,3673809,101371589,
4,4543176,101371589,
5,5274746,101371589,
6,1267291,101371589,
7,1279518,101371589,
8,3819408,101371589,
9,3549328,101371589,


In [3]:
def data_quality_check(df):
    ##-- Check for NaN values
    print("\nNaN values in each column:\n", df.isna().sum())
    
    ##-- Check for duplicate rows
    print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")
    
    ##-- Check for queries with unusually high length
    unusual_length_queries = df[df['Query_Filtered'].str.len() > 100]
    print(f"\nNumber of unusually long queries: {len(unusual_length_queries)}")
    return

data_quality_check(df)


NaN values in each column:
 index             0
Class_Label       0
Query_Filtered    0
dtype: int64

Number of duplicate rows: 0

Number of unusually long queries: 7799


In [4]:
df['Merged_Label'] = pd.qcut(df['Class_Label'], q=[x for x in np.arange(0, 1.05, 0.05)],
                                 labels=False, precision=3, duplicates='drop')
print("new labels: %s\nold labels: %s" % (len(df['Merged_Label'].unique()),len(df['Class_Label'].unique())) )

new labels: 20
old labels: 485


In [5]:
##-- Split Data
X,y = df['Query_Filtered'],df['Merged_Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
##-- Vectorize train and test data
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [7]:
X_train_vec = count_vectorizer.fit_transform(X_train)
X_test_vec = count_vectorizer.transform(X_test)

In [8]:
##-- Initialize LDA Model
lda = LatentDirichletAllocation(n_components=10, random_state=42)

In [9]:
##-- Fit LDA Model on train data
lda.fit(X_train_vec)

LatentDirichletAllocation(random_state=42)

In [10]:
##-- Review sample of features
feature_names = count_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx+1}")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))

Topic #1
inch fan gas wheel metal blade oil portable air exhaust
Topic #2
light safety led tape mm bulb lamp bar glasses heater
Topic #3
air filter battery plastic spray gal saw vacuum chain gallon
Topic #4
water screw head gloves paint flat cap resistant steel black
Topic #5
valve switch box pressure door ball lock cover gauge control
Topic #6
pipe hose steel wire stainless cable pvc brass inch adapter
Topic #7
nut square drain work breaker faucet toilet magnetic handle floor
Topic #8
wall motor hp ft amp cord mount lb plug rubber
Topic #9
kit drill set bit tool wrench socket drive belt impact
Topic #10
pump hand ring series vac glass adhesive dispenser fuse wheels


In [11]:
##-- Log Likelihood --> Higher is better
print("Log Likelihood: ", lda.score(X_train_vec))
##-- Perplexity --> Lower is better. Perplexity is exp(-1. * log-likelihood per word)
print("Perplexity: ", lda.perplexity(X_train_vec))

Log Likelihood:  -120398023.19584143
Perplexity:  3161.60032491207


In [12]:
##-- Transform and fit topics for train and test data
X_train_topics = lda.fit_transform(X_train_vec)

In [13]:
X_test_topics = lda.transform(X_test_vec) 

In [14]:
##-- Fit RFC classifier on the topic-space representation
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, min_samples_leaf=1, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [15]:
##-- Instantiate grid search
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5), n_jobs=-1, verbose=2, scoring='accuracy')

In [1]:
##-- Cross-validate: Fit grid search to data
grid_search.fit(X_train_topics, y_train)

In [None]:
##-- Determine optimal parameters
best_gs_params = grid_search.best_params_
print(f"Best parameters: {best_gs_params}")

In [None]:
##-- Determine best estimator
best_gs_estim = grid_search.best_estimator_
print(f"Best estimator: {best_gs_estim}")

In [None]:
##-- Classification metrics
y_pred_gs = best_gs_estim.predict(X_test_topics)
print(classification_report(y_test, y_pred_gs))

In [None]:
##-- Visualization --> 
# import pyLDAvis
# import pyLDAvis.lda_model
# pyLDAvis.enable_notebook()

# pyLDAvis.lda_model.prepare(lda, X_train_vec, count_vectorizer)

## Optional

In [None]:
# def visualize_sublabels(df):
#     ##-- Assume df already has sub-label columns as you provided
#     sublabel_cols = [f'SubLabel_{i}' for i in range(1, 10)]
#     ##-- sublabel_cols = df.columns.where(df.columns.str.startswith("SubLabel")).dropna()

#     ##-- Melt DataFrame to make it suitable for seaborn
#     melted_df = pd.melt(df, value_vars=sublabel_cols, var_name='SubLabel', value_name='Value')

#     ##-- Create countplot
#     plt.figure(figsize=(15, 8))
#     sns.countplot(data=melted_df, x='Value', hue='SubLabel')
#     plt.title('SubLabel Distributions')
#     plt.xlabel('Value')
#     plt.ylabel('Count')
#     plt.show()
#     return

# visualize_sublabels(df)