In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from wordcloud import WordCloud
# from spellchecker import SpellChecker

# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment: 'punkt' for word_tokenize, 'stopwords' for the stop-word list,
# and 'wordnet' + 'omw-1.4' for WordNetLemmatizer. nltk.download skips
# packages that are already installed and up to date.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' rather than
# 'punkt' for word_tokenize -- confirm against the installed version.
for _resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_resource)

# Built once as a set so stop-word membership checks are O(1) per token.
stop_words = set(stopwords.words('english'))
# spell = SpellChecker()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Rimsha\AppData\Roaming\nltk_data...


In [53]:
# Patterns are compiled once at definition time rather than on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
# TLD class is [A-Za-z]; the previous `[A-Z|a-z]` also accepted a literal '|'.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
# Anything that is not a word character or whitespace, plus explicit emoji /
# symbol blocks. The last range is U+2600-U+26FF (Miscellaneous Symbols); the
# previous U+26000-U+26FFF range pointed at CJK ideographs instead.
_SYMBOL_RE = re.compile(
    r'[^\w\s\d]|[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
    r'\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF'
    r'\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\u2600-\u26FF]'
)


def transform_text(text):
    """Normalise one free-text value into a cleaned, lemmatised string.

    Steps, in order: lowercase; drop URLs; drop e-mail addresses; drop
    @mentions and #hashtags; strip punctuation/symbols/emojis; collapse
    whitespace (including newlines); tokenise; remove English stop words;
    lemmatise; re-join with single spaces.

    The e-mail and mention/hashtag removals must run *before* punctuation
    stripping: once '@', '#' and '.' are gone those patterns can never match.
    E-mails are removed before mentions so that 'a@b.com' is not half-eaten
    by the '@\\w+' pattern.

    Parameters
    ----------
    text : str or missing scalar
        Raw text. Missing values (``None``/``NaN``) are treated as the empty
        string; other non-string scalars are converted with ``str()``.

    Returns
    -------
    str
        Space-separated lemmatised tokens (possibly empty).

    Notes
    -----
    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance
    and on NLTK's ``word_tokenize``.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on `.lower()`.
        text = '' if pd.isna(text) else str(text)

    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('', text)
    text = _MENTION_HASHTAG_RE.sub('', text)
    text = _SYMBOL_RE.sub('', text)
    # str.split() with no argument splits on any whitespace run, so this also
    # turns newlines into single spaces.
    text = ' '.join(text.split())

    tokens = word_tokenize(text)
    # (Spell-checking step is currently disabled; see the commented-out
    # SpellChecker import.)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmas)

class CategoriesClassifier:
    """Container for the train/test/validation frames with cleaning and EDA helpers.

    The three input frames are copied on construction, so every in-place
    column update made by the methods below affects only this object's
    copies. ``data_type`` maps the split names ``'train'``, ``'test'`` and
    ``'validation'`` to those copies.
    """

    def __init__(self, df_train, df_test, df_validation):
        """Store private copies of the three splits and index them by name."""
        self.df_train = df_train.copy()
        self.df_test = df_test.copy()
        self.df_validation = df_validation.copy()
        self.data_type = {
            'train': self.df_train,
            'test': self.df_test,
            'validation': self.df_validation,
        }

    def _get_frame(self, data):
        """Return the frame registered under split name ``data``.

        Raises
        ------
        KeyError
            If ``data`` is not one of the keys of ``self.data_type``. Raised
            here so callers get a clear message instead of a later failure
            on ``None``.
        """
        try:
            return self.data_type[data]
        except KeyError:
            raise KeyError(
                f"Unknown split {data!r}; expected one of {sorted(self.data_type)}"
            ) from None

    def preprocess_text(self, cols, transform=None):
        """Apply a text-cleaning function to the given columns of every split.

        Parameters
        ----------
        cols : iterable of str
            Column names to clean; each must exist in all three frames.
        transform : callable, optional
            Function applied element-wise to each column. Defaults to the
            module-level ``transform_text``. (The method previously applied
            an undefined global named ``preprocess_text``, which raised
            ``NameError`` on a fresh kernel.)
        """
        if transform is None:
            # Resolved at call time so the class can be defined before the function.
            transform = transform_text
        for frame in self.data_type.values():
            for col in cols:
                frame[col] = frame[col].apply(transform)

    def visualize_top_categories(self, data, level, top_n=10):
        """Plot a horizontal bar chart of the ``top_n`` most frequent values of ``level``.

        Parameters
        ----------
        data : str
            Split name: ``'train'``, ``'test'`` or ``'validation'``.
        level : str
            Column whose value counts are plotted (e.g. ``'lvl1'``).
        top_n : int, default 10
            Number of categories to show.
        """
        df = self._get_frame(data)
        top_categories = df[level].value_counts().nlargest(top_n)
        plt.figure(figsize=(12, 6))
        sns.barplot(x=top_categories.values, y=top_categories.index, palette='viridis')
        plt.title(f'Top {top_n} Categories in {level}')
        plt.xlabel('Number of Products')
        plt.ylabel('Category')
        plt.show()

    def visualize_wordcloud(self, data):
        """Render a word cloud built from the ``Description`` column of one split.

        Parameters
        ----------
        data : str
            Split name: ``'train'``, ``'test'`` or ``'validation'``.
        """
        df = self._get_frame(data)
        all_descriptions = ' '.join(df['Description'])
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Product Descriptions\n')
        plt.show()

    def visualize_description_length(self, data):
        """Plot a histogram (with KDE) of ``Description`` character lengths for one split.

        Side effect: adds/overwrites a ``Description_Length`` column on the
        stored frame for that split.

        Parameters
        ----------
        data : str
            Split name: ``'train'``, ``'test'`` or ``'validation'``.
        """
        df = self._get_frame(data)
        df['Description_Length'] = df['Description'].apply(len)
        plt.figure(figsize=(10, 5))
        sns.histplot(df['Description_Length'], bins=20, kde=True)
        plt.xlabel('Description Length')
        plt.ylabel('Count')
        plt.title('Description Length Distribution')
        plt.show()

In [54]:
# Read the three JSON-lines splits (one record per line) in a single pass.
df_train, df_test, df_validation = (
    pd.read_json(json_path, lines=True)
    for json_path in (
        'Data/train_data.json',
        'Data/test_data.json',
        'Data/validation_data.json',
    )
)

# Wrap the splits (the classifier keeps its own copies) and clean the free-text columns.
cls = CategoriesClassifier(df_train=df_train, df_test=df_test, df_validation=df_validation)
cls.preprocess_text(cols=['Description', 'Name', 'CategoryText'])
# cls.visualize_top_categories('train', 'lvl1')
# cls.visualize_top_categories('train', 'lvl2')
# cls.visualize_top_categories('train', 'lvl3')
# cls.visualize_wordcloud('train')
# cls.visualize_description_length('train')

In [45]:
# Display the classifier's own copy of the training split.
cls.df_train

Unnamed: 0,ID,Name,Description,CategoryText,URL,lvl1,lvl2,lvl3
0,549,sterling silver angel charm,little angel charm heavenly,product,http://www.thecharmworks.com/product/CW-UA/Ste...,64000000_Personal Accessories,64010000_Personal Accessories,64010100_Jewellery
1,5664,hp pavilion 23xi 5840 cm 23 ip monitor,share photo video game everyone room experienc...,product,http://store.hp.com/UKStore/Merch/Product.aspx...,65000000_Computing,65010000_Computers/Video Games,65010700_Computer/Video Game Peripherals
2,3307,east carolina pirate lady personalized basketb...,feel like bona fide member east carolina pirat...,east carolina pirate east carolina pirate lady...,http://eastcarolina.teamfanshop.com/COLLEGE_Ea...,67000000_Clothing,67010000_Clothing,67010800_Upper Body Wear/Tops
3,4609,tekonsha 90195 p3 electric brake control 14 tr...,receive free shipping item enter coupon code f...,vehicle part vehicle part accessory,http://www.anythingtruck.com/product/755-90195...,77000000_Automotive,77010000_Automotive Accessories and Maintenance,77011200_Automotive Maintenance/Repair
4,7822,rnxv wifly module wire antenna,description rnxv module roving network certifi...,home wireless wifi rnxv wifly module wire antenna,http://www.karlssonrobotics.com/cart/rn-xv-wif...,78000000_Electrical Supplies,78050000_Electronic Communication Components,78050100_Electronic Communication Components
...,...,...,...,...,...,...,...,...
10007,10329,men washington capital reebok striped scarf,get game season washington capital striped sca...,,http://shop.nhl.com/Men_Mothers_Day,67000000_Clothing,67010000_Clothing,67010100_Clothing Accessories
10008,5191,new york yankee lady stripe cami tank navy blue,warm weather day want stay cool enjoy new york...,mlb new york yankee new york yankee tshirts ne...,http://yahoosports.teamfanshop.com/MLB_Basebal...,67000000_Clothing,67010000_Clothing,67010800_Upper Body Wear/Tops
10009,5390,men uconn husky navy blue arch tshirt,celebrate fandom uconn husky arch tshirt featu...,uconn husky uconn husky tshirts,http://shop.uconnhuskies.com/COLLEGE_UCONN_Hus...,67000000_Clothing,67010000_Clothing,67010800_Upper Body Wear/Tops
10010,860,sony ericsson xperia arc lt18i,product feature 3g wifi hdmi14 ghz processor51...,mobile,http://www.smartprix.com/mobiles/sony_ericsson...,66000000_Communications,66010000_Communications,66010300_Mobile Communication Devices/Services


In [44]:
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize
# import unicodedata

# from spellchecker import SpellChecker

# def correct_spelling(text):
#     # Correct spelling errors using a spell checker
#     spell = SpellChecker()
#     tokens = word_tokenize(text)
#     corrected_tokens = [spell.correction(word) for word in tokens]
#     return ' '.join(corrected_tokens)

# def remove_html_tags(text):
#     # Remove HTML tags from the text
#     cleaner = re.compile('<.*?>')
#     cleaned_text = re.sub(cleaner, '', text)
#     return cleaned_text

# def remove_stopwords(text):
#     # Remove stopwords using spaCy (which has a more comprehensive stopwords list)
#     doc = nlp(text)
#     tokens = [token.text for token in doc if not token.is_stop]
#     return ' '.join(tokens)


# import string

# def remove_punctuation(text):
#     # Remove punctuation
#     translator = str.maketrans('', '', string.punctuation)
#     return text.translate(translator)



In [59]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import hamming_loss, jaccard_score, f1_score

# # Load the data into a DataFrame
# data = [...]  # Replace with your data
# df = df_train.copy()

# # Feature Engineering
# X = df['Description']  # Feature: Description
# y = df[['lvl1', 'lvl2', 'lvl3']]  # Targets: lvl1, lvl2, lvl3

# # Convert Description to TF-IDF features
# tfidf_vectorizer = TfidfVectorizer()
# X_tfidf = tfidf_vectorizer.fit_transform(X)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# # Model Selection and Training
# models = []
# for i in range(y.shape[1]):
#     model = LogisticRegression()
#     model.fit(X_train, y_train.iloc[:, i])
#     models.append(model)

# # Model Evaluation
# y_pred = []
# for model in models:
#     y_pred.append(model.predict(X_test))

# y_pred = pd.DataFrame(y_pred).T

# print('Hamming Loss:', hamming_loss(y_test, y_pred))
# print('Jaccard Score:', jaccard_score(y_test, y_pred, average='samples'))
# print('F1 Score:', f1_score(y_test, y_pred, average='samples'))
