In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from wordcloud import WordCloud
# from spellchecker import SpellChecker
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch every NLTK resource the preprocessing below relies on, so the notebook
# also runs after "Restart & Run All" on a machine without cached NLTK data.
# NOTE(review): 'punkt_tab' is the tokenizer resource newer NLTK releases load
# for word_tokenize; on older releases the id is presumably unknown and that
# one download is just reported as failed -- confirm against the pinned version.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# English stopwords as a set for O(1) membership tests in transform_text.
stop_words = set(stopwords.words('english'))
# spell = SpellChecker()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Rimsha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
def transform_text(text):
    """Normalise a raw text field into a cleaned, lemmatized string.

    Steps, in order: lowercase, drop URLs, drop e-mail addresses, drop
    @mentions and #hashtags, strip punctuation/symbols/emoji, collapse
    whitespace, tokenize, remove English stopwords, lemmatize.

    The URL / e-mail / mention / hashtag removals run *before* punctuation
    is stripped: they are recognised by '@', '#', '.' and '/', so running
    them afterwards (as the previous version did) could never match.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. None or the NaN that pandas uses
        for missing cells) are treated as empty text.

    Returns
    -------
    str
        Space-joined lemmatized tokens; '' when nothing remains.
    """
    # Missing cells have no text to clean; returning '' keeps .apply() usable.
    if not isinstance(text, str):
        return ''
    # Lowercasing
    text = text.lower()
    # Removing urls
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Removing email addresses (before mentions, which would otherwise eat
    # the '@domain' part and leave the local part behind)
    text = re.sub(r'\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b', '', text)
    # Removing mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Removing special characters, punctuations, emojis and symbols.
    # The last range is U+2600-U+26FF (miscellaneous symbols); the previous
    # \U00026000-\U00026FFF pointed into a CJK ideograph block instead.
    text = re.sub(
        r'[^\w\s]'
        r'|[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
        r'\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF'
        r'\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF'
        r'\u2600-\u26FF]',
        '', text)
    # Removing newlines, leading/trailing whitespace and repeated spaces
    # (str.split() with no argument splits on any whitespace run)
    text = ' '.join(text.split())
    # Removing stopwords
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # spelling correction
#     corrected_tokens = [spell.correction(word) for word in filtered_tokens]
    # lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return ' '.join(lemmatized_tokens)

def multioutput_accuracy(y_true, y_pred):
    """Average the per-output accuracy over all output columns.

    Parameters
    ----------
    y_true, y_pred : 2-D arrays indexable as ``arr[:, i]``
        Ground-truth and predicted labels, one column per output.

    Returns
    -------
    float
        Unweighted mean of ``accuracy_score`` computed column by column.
    """
    n_outputs = y_true.shape[1]

    # One accuracy value per output column, in column order.
    per_output = [
        accuracy_score(y_true[:, col], y_pred[:, col])
        for col in range(n_outputs)
    ]

    # Every output contributes equally to the overall figure.
    return sum(per_output) / n_outputs

class CategoriesClassifier:
    """Hold train/test/validation frames and run a text-classification workflow.

    The three frames are copied on construction, so preprocessing never
    touches the caller's DataFrames. Methods that take a ``data`` argument
    expect one of the split names 'train', 'test' or 'validation'.
    """

    # Target columns, in the order they are stacked into y_train / y_val.
    TARGET_LEVELS = ('lvl1', 'lvl2', 'lvl3')
    # Code given to labels that do not occur in the training split.
    UNKNOWN_LABEL = -1

    def __init__(self, df_train, df_test, df_validation):
        """Store private copies of the three splits, keyed by split name."""
        self.df_train = df_train.copy()
        self.df_test = df_test.copy()
        self.df_validation = df_validation.copy()
        self.data_type = {
            'train': self.df_train,
            'test': self.df_test,
            'validation': self.df_validation,
        }

    def _get_frame(self, data):
        """Return the stored frame for split name ``data``.

        Raises
        ------
        ValueError
            If ``data`` is not a known split name (previously this surfaced
            later as an opaque error on ``None``).
        """
        try:
            return self.data_type[data]
        except KeyError:
            raise ValueError(
                f"Unknown data split {data!r}; expected one of {sorted(self.data_type)}"
            ) from None

    @staticmethod
    def _make_vectorizer():
        """Build the TF-IDF vectorizer configuration shared by all methods."""
        return TfidfVectorizer(sublinear_tf=True, min_df=5,
                               ngram_range=(1, 2),
                               stop_words='english')

    def preprocess_text(self, cols):
        """Apply ``transform_text`` to the given columns of every split.

        Parameters
        ----------
        cols : str or iterable of str
            Column name(s) to clean; each column is overwritten with the
            cleaned text in all three stored frames.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(cols, str):
            cols = [cols]
        for frame in self.data_type.values():
            for col in cols:
                frame[col] = frame[col].apply(transform_text)

    def visualize_top_categories(self, data, level, top_n=10):
        """Plot a horizontal bar chart of the ``top_n`` most frequent values of ``level``."""
        df = self._get_frame(data)
        top_categories = df[level].value_counts().nlargest(top_n)
        plt.figure(figsize=(12, 6))
        sns.barplot(x=top_categories.values, y=top_categories.index, palette='viridis')
        plt.title(f'Top {top_n} Categories in {level}')
        plt.xlabel('Number of Products')
        plt.ylabel('Category')
        plt.show()

    def visualize_wordcloud(self, data):
        """Render a word cloud built from all 'Description' values of a split."""
        df = self._get_frame(data)
        all_descriptions = ' '.join(df['Description'])
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Product Descriptions\n')
        plt.show()

    def visualize_description_length(self, data):
        """Plot a histogram of 'Description' character lengths for a split.

        Side effect: adds/overwrites a 'Description_Length' column on the
        stored frame of that split.
        """
        df = self._get_frame(data)
        df['Description_Length'] = df['Description'].apply(len)
        plt.figure(figsize=(10, 5))
        sns.histplot(df['Description_Length'], bins=20, kde=True)
        plt.xlabel('Description Length')
        plt.ylabel('Count')
        plt.title('Description Length Distribution')
        plt.show()

    def apply_tfidf(self, data, col):
        """Fit a fresh TF-IDF vectorizer on ``col`` of a split and return the dense matrix.

        The vectorizer is fitted on the requested split only and is not
        stored; use ``feature_extraction`` for features that share one
        vocabulary across splits.
        """
        df = self._get_frame(data)
        return self._make_vectorizer().fit_transform(df[col]).toarray()

    def _fit_target_encoding(self):
        """Build label -> integer maps from the training split, one per target level."""
        self.target_encoding = {
            level: {label: code for code, label in enumerate(self.df_train[level].unique())}
            for level in self.TARGET_LEVELS
        }

    def _encode_targets(self, df):
        """Encode the target columns of ``df`` with the fitted training maps.

        Returns a list of 1-D int64 arrays, one per entry of TARGET_LEVELS.
        Labels missing from the training maps become UNKNOWN_LABEL instead of
        NaN: ``Series.map`` yields NaN for unmapped values, which would turn
        the column into floats and make the targets unusable for scoring.
        """
        return [
            df[level]
            .map(self.target_encoding[level])
            .fillna(self.UNKNOWN_LABEL)
            .astype('int64')
            .to_numpy()
            for level in self.TARGET_LEVELS
        ]

    def feature_extraction(self):
        """Create TF-IDF features and integer-encoded targets.

        Sets ``target_encoding``, ``tfidf_vectorizer``, ``X_train``,
        ``X_val``, ``X_test``, the per-level ``y_train_lvl*`` / ``y_val_lvl*``
        arrays, and the stacked ``y_train`` / ``y_val`` matrices.
        """
        # Target encoding
        self._fit_target_encoding()

        # Create a TF-IDF vectorizer and fit it on the training data
        self.tfidf_vectorizer = self._make_vectorizer()
        self.X_train = self.tfidf_vectorizer.fit_transform(self.df_train['Description']).toarray()

        # Use the same vectorizer to transform validation and test data
        self.X_val = self.tfidf_vectorizer.transform(self.df_validation['Description']).toarray()
        self.X_test = self.tfidf_vectorizer.transform(self.df_test['Description']).toarray()

        # Target encoding for y values
        train_columns = self._encode_targets(self.df_train)
        val_columns = self._encode_targets(self.df_validation)
        self.y_train_lvl1, self.y_train_lvl2, self.y_train_lvl3 = train_columns
        self.y_val_lvl1, self.y_val_lvl2, self.y_val_lvl3 = val_columns

        # Stack them horizontally to create y_train and y_val
        self.y_train = np.column_stack(train_columns)
        self.y_val = np.column_stack(val_columns)

    def train_evaluate_ml_models(self, models, param_grids, cv=3, n_jobs=-1):
        """Grid-search each model as a multi-output classifier and score it.

        Requires ``feature_extraction`` to have been called first, since it
        reads ``X_train``, ``X_val``, ``y_train`` and ``y_val``.

        Parameters
        ----------
        models : dict
            Model name -> unfitted estimator; each is wrapped in
            ``MultiOutputClassifier``.
        param_grids : dict
            Model name -> parameter grid for ``GridSearchCV``. Must contain
            an entry for every key of ``models``.
        cv : int, default 3
            Cross-validation setting passed to ``GridSearchCV``.
        n_jobs : int, default -1
            Parallelism setting passed to ``GridSearchCV``.

        Returns
        -------
        tuple of (dict, dict)
            ``best_models`` maps model name to the best fitted estimator;
            ``accuracies`` maps model name to a dict with 'train_accuracy'
            and 'val_accuracy'.
        """
        best_models = {}
        accuracies = {}

        for model_name, model in models.items():
            print(f"Training and evaluating {model_name}...")
            param_grid = param_grids[model_name]
            multi_output_model = MultiOutputClassifier(model)
            grid_search = GridSearchCV(multi_output_model, param_grid, cv=cv, n_jobs=n_jobs)
            grid_search.fit(self.X_train, self.y_train)

            # Save the best model
            best_models[model_name] = grid_search.best_estimator_

            # Predict on train and validation sets
            y_train_pred = grid_search.predict(self.X_train)
            y_val_pred = grid_search.predict(self.X_val)

            # Calculate accuracy for train and validation sets
            train_accuracy = multioutput_accuracy(self.y_train, y_train_pred)
            val_accuracy = multioutput_accuracy(self.y_val, y_val_pred)

            accuracies[model_name] = {'train_accuracy': train_accuracy, 'val_accuracy': val_accuracy}

            print(f"Best parameters for {model_name}: {grid_search.best_params_}")
            print(f"Train accuracy for {model_name}: {train_accuracy}")
            print(f"Validation accuracy for {model_name}: {val_accuracy}")

        return best_models, accuracies
#     # Rest of the functions remain the same...

# # Usage:
# # Create the classifier instance
# classifier = CategoriesClassifier(df_train, df_test, df_validation)

# # Preprocess the 'Description' column in every split
# classifier.preprocess_text(['Description'])

# # Vectorize the descriptions and encode the targets
# classifier.feature_extraction()

# # Train and evaluate the multi-output models
# best_models, accuracies = classifier.train_evaluate_ml_models(models, param_grids)

In [10]:
# Read the three JSON-lines splits in a fixed order: train, test, validation.
df_train, df_test, df_validation = (
    pd.read_json(split_path, lines=True)
    for split_path in (
        'Data/train_data.json',
        'Data/test_data.json',
        'Data/validation_data.json',
    )
)

# The classifier works on its own copies of the frames; clean every
# free-text column of those copies.
cls = CategoriesClassifier(df_train, df_test, df_validation)
cls.preprocess_text(['Description', 'Name', 'CategoryText'])
# cls.visualize_top_categories('train', 'lvl1')
# cls.visualize_top_categories('train', 'lvl2')
# cls.visualize_top_categories('train', 'lvl3')
# cls.visualize_wordcloud('train')
# cls.visualize_description_length('train')

In [11]:
# Fit TF-IDF on the training descriptions and encode the lvl1-lvl3 targets;
# the results are stored as attributes on `cls` (X_train, X_val, X_test,
# y_train, y_val).
cls.feature_extraction()

In [None]:
# Seed for the stochastic estimators so repeated runs give the same results.
RANDOM_STATE = 42

# Models to evaluate (each is wrapped in MultiOutputClassifier during training)
models = {
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'KNeighbors': KNeighborsClassifier()
}

# Parameter grids for grid search; the 'estimator__' prefix addresses the
# model wrapped inside MultiOutputClassifier.
param_grids = {
    'RandomForest': {'estimator__n_estimators': [50, 100, 200]},
    'SVM': {'estimator__C': [0.1, 1, 10]},
    'KNeighbors': {'estimator__n_neighbors': [3, 5, 7]}
}

best_models, accuracies = cls.train_evaluate_ml_models(models, param_grids)

Training and evaluating RandomForest...


In [44]:
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize
# import unicodedata

# from spellchecker import SpellChecker

# def correct_spelling(text):
#     # Correct spelling errors using a spell checker
#     spell = SpellChecker()
#     tokens = word_tokenize(text)
#     corrected_tokens = [spell.correction(word) for word in tokens]
#     return ' '.join(corrected_tokens)

# def remove_html_tags(text):
#     # Remove HTML tags from the text
#     cleaner = re.compile('<.*?>')
#     cleaned_text = re.sub(cleaner, '', text)
#     return cleaned_text

# def remove_stopwords(text):
#     # Remove stopwords using spaCy (which has a more comprehensive stopwords list)
#     doc = nlp(text)
#     tokens = [token.text for token in doc if not token.is_stop]
#     return ' '.join(tokens)


# import string

# def remove_punctuation(text):
#     # Remove punctuation
#     translator = str.maketrans('', '', string.punctuation)
#     return text.translate(translator)



In [59]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import hamming_loss, jaccard_score, f1_score

# # Load the data into a DataFrame
# data = [...]  # Replace with your data
# df = df_train.copy()

# # Feature Engineering
# X = df['Description']  # Feature: Description
# y = df[['lvl1', 'lvl2', 'lvl3']]  # Targets: lvl1, lvl2, lvl3

# # Convert Description to TF-IDF features
# tfidf_vectorizer = TfidfVectorizer()
# X_tfidf = tfidf_vectorizer.fit_transform(X)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# # Model Selection and Training
# models = []
# for i in range(y.shape[1]):
#     model = LogisticRegression()
#     model.fit(X_train, y_train.iloc[:, i])
#     models.append(model)

# # Model Evaluation
# y_pred = []
# for model in models:
#     y_pred.append(model.predict(X_test))

# y_pred = pd.DataFrame(y_pred).T

# print('Hamming Loss:', hamming_loss(y_test, y_pred))
# print('Jaccard Score:', jaccard_score(y_test, y_pred, average='samples'))
# print('F1 Score:', f1_score(y_test, y_pred, average='samples'))
