In [None]:
import gc
import csv
import nltk
import pickle
import pandas as pd
import contractions
import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Download the NLTK resources used by the cleaning functions below.
# 'punkt_tab' and 'omw-1.4' are included because newer NLTK releases need them
# for word_tokenize and the WordNet lemmatizer; on older releases the extra
# downloads are harmless.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared, module-level helpers reused by every cleaning call.
stop = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()
label_binarizer = LabelBinarizer()
# The identity tokenizer and lowercase=False make the vectorizer take each
# document as-is (the cleaning functions below produce token lists).
count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

In [None]:
def lemmatize_text(text):
    """Return a new list with every token of `text` lemmatized.

    `text` is an iterable of word tokens; each one is passed through the
    shared `wordnet_lemmatizer` and the results are collected in order.
    """
    return list(map(wordnet_lemmatizer.lemmatize, text))

In [None]:
def standardize_text(df, column):
    """Turn the free-text `column` of `df` into lists of cleaned tokens.

    Steps, in order: expand contractions, replace every non-letter with a
    space, lowercase, tokenize, lemmatize, then drop tokens that are two
    characters or shorter or that appear in the `stop` word list.

    The column is overwritten in place on `df`, and `df` is returned so the
    call can be chained.
    """
    # Set membership is O(1); `stop` is a list, scanned once per token otherwise.
    stop_words = set(stop)

    # Contractions must be expanded while the apostrophes are still present;
    # after the non-letter replacement "don't" would already be "don t".
    text = df[column].apply(contractions.fix)
    # regex=True is required explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the text untouched.
    text = text.str.replace(r"[^A-Za-z]", " ", regex=True)
    text = text.str.lower()
    tokens = text.apply(word_tokenize).apply(lemmatize_text)
    df[column] = tokens.apply(
        lambda words: [w for w in words if len(w) > 2 and w not in stop_words]
    )
    return df

In [None]:
def standardize_brand(df, column):
    """Normalize the brand `column` of `df` to a compact lowercase ASCII key.

    Each value is transliterated with `unidecode`, stripped of every character
    that is not an ASCII letter or digit, and lowercased. The column is
    overwritten in place on `df`, and `df` is returned.
    """
    brand = df[column].apply(unidecode.unidecode)
    # regex=True is required explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, so nothing would be removed.
    brand = brand.str.replace(r"[^A-Za-z0-9]", "", regex=True)
    df[column] = brand.str.lower()
    return df

In [None]:
def merge(row):
    """Combine one row's token lists and brand into a single feature list.

    The tokens of 'TITLE', 'DESCRIPTION' and 'BULLET_POINTS' are de-duplicated
    through a set (so their order is not guaranteed), and the 'BRAND' value is
    always appended as the last element, even if it equals an existing token.
    """
    unique_tokens = set().union(row['TITLE'], row['DESCRIPTION'], row['BULLET_POINTS'])
    return [*unique_tokens, row['BRAND']]

In [None]:
def data_clean(data):
    """Clean every column of `data` and return one merged feature list per row.

    The 'BRAND' column goes through `standardize_brand`; every other column is
    treated as free text and goes through `standardize_text`. The cleaned
    columns of each row are then combined by `merge`.

    Returns an array with one list of tokens per input row. The frame passed
    in is left untouched.
    """
    # Work on a private copy: the standardize_* helpers assign columns in
    # place, which would otherwise modify the caller's frame (or raise
    # SettingWithCopyWarning when `data` is a slice of a larger frame).
    data = data.copy()
    for column in data.columns:
        if column == 'BRAND':
            data = standardize_brand(data, column)
        else:
            data = standardize_text(data, column)
    # Only the merged feature is returned, so the source columns do not need
    # to be dropped first.
    return data.apply(merge, axis=1).values

Process Train Data

In [None]:
# Load the training set. Quoting is disabled and backslash is the escape
# character; empty fields are kept as empty strings (na_filter=False) so the
# string-based cleaning steps never see NaN.
# on_bad_lines="skip" replaces error_bad_lines=False, which was removed in
# pandas 2.0; malformed rows are still skipped.
data = pd.read_csv(
    "/content/gdrive/MyDrive/Contest/dataset/train.csv",
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
    on_bad_lines="skip",
    na_filter=False,
)
# Separate the target from the feature columns.
label = data['BROWSE_NODE_ID'].values
data = data.drop(columns='BROWSE_NODE_ID')

Process the data in chunks: it is too big to be processed in one go because of RAM/memory/resource constraints. The same approach can be applied whenever working with the data.
It also minimizes the effort and time lost in case of a failure.
This can also be done while reading the data itself, by passing the `nrows` and `skiprows` parameters to the `read_csv` function.

In [None]:
# Clean the training data chunk by chunk and pickle each chunk as data<i>.pkl.
# Chunk starts are derived from the number of rows, so no rows are silently
# dropped and no empty slice is processed when the dataset size changes.
# NOTE(review): the sampling cell further down reads data0.pkl .. data14.pkl,
# i.e. it expects exactly 15 chunks — keep the two in sync.
chunk_size = 200000
for i, s in enumerate(range(0, data.shape[0], chunk_size)):
    e = min(s + chunk_size, data.shape[0])
    data_cleaned = data_clean(data[s:e])
    label_temp = label[s:e]
    data_temp = pd.DataFrame({'FEATURE': data_cleaned, 'BROWSE_NODE_ID': label_temp})
    file_name = 'data' + str(i) + '.pkl'
    data_temp.to_pickle(file_name)
    # Release the chunk before building the next one to keep peak memory low.
    del data_cleaned, label_temp, data_temp
    gc.collect()

Create small dataset with n rows of each label/class

In [None]:
# State for building the reduced dataset.
n = 5                                      # cap on rows kept per label
path = '/content/gdrive/MyDrive/Contest/'  # folder the chunk pickles are read from
classes = dict()                           # label -> number of rows kept so far
final_data, final_label = [], []           # kept features and their labels, in parallel

In [None]:
# Walk through the 15 chunk pickles and keep a row whenever its label has not
# been seen yet or has been kept fewer than `n` times.
# NOTE(review): the cleaning loop saves the chunks under a bare file name
# (current working directory) while they are read from `path` here — confirm
# the files are actually located in `path`.
for i in range(15):
    file_name = 'data' + str(i) + '.pkl'
    print(path + file_name)
    data = pd.read_pickle(path + file_name)
    for row in data.values:
        feature, node_id = row[0], row[1]
        if node_id not in classes or classes[node_id] < n:
            classes[node_id] = classes.get(node_id, 0) + 1
            final_data.append(feature)
            final_label.append(node_id)
    # Free the chunk before loading the next one.
    del data
    gc.collect()

In [None]:
# Assemble the sampled rows into a two-column frame (FEATURE, LABEL) and
# persist it as the reduced dataset.
data_processed = pd.DataFrame(
    {'FEATURE': final_data, 'LABEL': final_label},
    columns=['FEATURE', 'LABEL'],
)
data_processed.to_pickle('data_processed_lite.pkl')

Process test data

In [None]:
# Load the test set with the same parsing options as the training set.
# on_bad_lines="skip" replaces error_bad_lines=False, which was removed in
# pandas 2.0; malformed rows are still skipped.
data = pd.read_csv(
    "/content/gdrive/MyDrive/Contest/dataset/test.csv",
    escapechar="\\",
    quoting=csv.QUOTE_NONE,
    on_bad_lines="skip",
    na_filter=False,
)
# Keep the product ids aside; they are identifiers, not features.
ID = data['PRODUCT_ID'].values
data = data.drop(columns='PRODUCT_ID')

In [None]:
# Run the whole test frame through data_clean in a single call (the training
# data is cleaned in 200000-row slices instead); the result is one merged
# feature list per row.
data_cleaned = data_clean(data)

In [None]:
# Persist the test ids and cleaned features. The files are opened with
# context managers so the handles are flushed and closed deterministically.
with open("ID_test_pickle", "wb") as id_file:
    pickle.dump(ID, id_file)
with open("data_test_pickle", "wb") as features_file:
    pickle.dump(data_cleaned, features_file)