In [None]:
!pip install stanza

In [None]:
!pip install hazm

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive inside the Colab VM; the CSV paths used in the following
# cells all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Load the train and test splits from the mounted Drive folder.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the CSVs carry a saved index — confirm whether it should be kept.
TRAIN_CSV = '/content/drive/MyDrive/Colab Notebooks/DL/11.project3/title_categorizer/data/torob_train.csv'
TEST_CSV = '/content/drive/MyDrive/Colab Notebooks/DL/11.project3/title_categorizer/data/torob_test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Last expression of the cell: render the test split as the cell output.
test_data

Unnamed: 0,name1,name2
0,بگ پارچه ای دو رنگ چاپ,
1,بادگیر کلاهدار ورزشی فرانسه France windrunner,
2,ساعت دیواری هندسی,
3,ساک ورزشی Nike مدل N20627,
4,ساعت فلزی,
...,...,...
1196,ست گرمکن مردانه Versace | 74GAA3.FS063.G89,
1197,پالتو مردانه مشکی کد 18225 ا Black men's coat ...,Black men's coat code 18225
1198,بند مدل Braided Solo Loop M مناسب برای اپل واچ...,Braided Solo Loop M strap suitable for Apple W...
1199,ساک ورزشی CrossGear مدل SA-9956 ا Sports bag C...,Sports bag CrossGear model SA-9956


In [4]:
# Clean and lemmatize the Persian product titles (Stanza lemmas, Hazm stopwords removed) for better categorization
import re
import pandas as pd
import stanza
from hazm import stopwords_list
from tqdm import tqdm

# Download the Persian Stanza models, then build the pipeline whose lemma
# output is consumed by preprocess_texts.
STANZA_LANG = 'fa'
STANZA_PROCESSORS = 'tokenize,mwt,pos,lemma'

stanza.download(STANZA_LANG)
nlp = stanza.Pipeline(lang=STANZA_LANG, processors=STANZA_PROCESSORS)

# Hazm's stopword list, stored as a set for fast membership tests.
persian_stopwords = set(stopwords_list())

def preprocess_texts(texts, pipeline=None, stopwords=None):
    """Lemmatize each text and drop stopword lemmas.

    Args:
        texts: Iterable of strings to process.
        pipeline: Callable that takes one string and returns an object exposing
            ``.sentences``; each sentence exposes ``.words`` and each word a
            ``.lemma``. Defaults to the module-level ``nlp`` pipeline.
        stopwords: Collection of lemmas to discard. Defaults to the
            module-level ``persian_stopwords`` set.

    Returns:
        A list with exactly one string per input text (same order): the kept
        lemmas joined by single spaces. Blank inputs yield ``''``.
    """
    # Resolve the defaults at call time so the function keeps working with the
    # notebook-level objects when called as preprocess_texts(texts).
    if pipeline is None:
        pipeline = nlp
    if stopwords is None:
        stopwords = persian_stopwords

    cleaned_texts = []
    for text in texts:
        # A title can be blank once digits/punctuation were stripped upstream;
        # there is nothing to lemmatize, so skip the pipeline call entirely.
        if not text.strip():
            cleaned_texts.append('')
            continue

        doc = pipeline(text)
        # Skip missing/empty lemmas: a None entry would make str.join raise
        # TypeError, and an empty one would only add stray spaces.
        kept = [
            word.lemma
            for sentence in doc.sentences
            for word in sentence.words
            if word.lemma and word.lemma not in stopwords
        ]
        cleaned_texts.append(' '.join(kept))
    return cleaned_texts


# Regex substitutions applied, in this order, to every raw title.
# NOTE(review): the last pattern also strips the zero-width non-joiner (U+200C)
# used inside Persian compound words, since it is neither \w nor \s — confirm
# that gluing those parts together is intended.
_TITLE_SUBSTITUTIONS = (
    (re.compile(r'\S+@\S+'), 'EMAIL'),        # e-mail addresses -> placeholder
    (re.compile(r'http\S+|www\S+'), 'URL'),   # links -> placeholder
    (re.compile(r'\d+'), ''),                 # drop digit runs
    (re.compile(r'[^\w\s]'), ''),             # drop punctuation/symbols
)


def _clean_title(value):
    """Apply the regex clean-up to one raw title; missing values become ''."""
    if pd.isnull(value):
        return ''
    # str() guards against non-string cells (e.g. a title parsed as a number).
    text = str(value)
    for pattern, replacement in _TITLE_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text


def clean_names_batch(df, batch_size=500):
    """Clean and lemmatize the product titles of ``df`` in batches.

    For every row, ``name1`` and (when present) ``name2`` are regex-cleaned and
    joined with a single space; each batch of combined titles is then passed to
    ``preprocess_texts``.

    Args:
        df: DataFrame with ``name1`` and ``name2`` columns. Null values in
            either column are tolerated.
        batch_size: Number of rows handed to ``preprocess_texts`` per call.

    Returns:
        A list of cleaned strings, one per row of ``df``, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently produce an empty result.
        raise ValueError('batch_size must be a positive integer')

    name1_col = df['name1']
    name2_col = df['name2']

    all_clean = []
    for start in tqdm(range(0, len(df), batch_size)):
        stop = start + batch_size
        texts = []
        # Zip the two columns positionally instead of iterrows(): same rows,
        # without building a Series per row.
        for name1, name2 in zip(name1_col.iloc[start:stop], name2_col.iloc[start:stop]):
            combined = _clean_title(name1)
            if pd.notnull(name2):
                combined = combined + ' ' + _clean_title(name2)
            texts.append(combined)

        all_clean.extend(preprocess_texts(texts))

    return all_clean

# Add the cleaned/lemmatized title as a `clean_name` column, train split first
# and then test split.
for frame in (train_data, test_data):
    frame['clean_name'] = clean_names_batch(frame)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: fa (Persian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-fa/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/fa/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: fa (Persian):
| Processor | Package        |
------------------------------
| tokenize  | perdt          |
| mwt       | perdt          |
| pos       | perdt_charlm   |
| lemma     | perdt_nocharlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!
100%|██████████| 18/18 [05:12<00:00, 17.37s/it]
100%|██████████| 3/3 [00:40<00:00, 13.36s/it]


In [5]:
# Write both frames (including the `clean_name` column) back to Drive,
# without the DataFrame index.
CLEAN_TRAIN_CSV = '/content/drive/MyDrive/Colab Notebooks/DL/11.project3/title_categorizer/data/torob_clean_train_3.csv'
CLEAN_TEST_CSV = '/content/drive/MyDrive/Colab Notebooks/DL/11.project3/title_categorizer/data/torob_clean_test_3.csv'

train_data.to_csv(CLEAN_TRAIN_CSV, index=False)
test_data.to_csv(CLEAN_TEST_CSV, index=False)