In [None]:
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

# nltk packages
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
import unidecode
import csv

# Seed passed as random_state to train_test_split so the train/validation split is repeatable.
RANDOM_SEED = 69

In [None]:
# Load the raw training data. QUOTE_NONE tells the parser to treat quote
# characters as ordinary text; a backslash is the escape character.
train = pd.read_csv(
    'train.csv',
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

In [None]:
# Load the raw test data with the same parser settings as the training file.
test = pd.read_csv(
    'test.csv',
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

In [None]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the raw test frame.
test.info()

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Summary statistics for every column, non-numeric ones included (include='all').
train.describe(include='all')

In [None]:
# cleaning part

# Separator characters; clean_text replaces each match with a single space.
# Written as a raw string because '\[', '\]' and '\|' are not valid Python
# string escapes: the non-raw literal compiles to the same pattern but emits
# an invalid-escape-sequence warning on current Python versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_'
# is deleted by clean_text.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopwords from the NLTK corpus, kept as a set because clean_text
# checks membership once per token.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw text field for bag-of-words vectorisation.

    Steps, in order: lowercase the text, replace every character matched by
    ``REPLACE_BY_SPACE_RE`` with a space, delete every character matched by
    ``BAD_SYMBOLS_RE``, then drop tokens found in ``STOPWORDS`` and join the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw cell value. ``None`` and NaN are treated as missing; any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself; a string can
        # never take this branch, so the check is skipped for str input.
        if text != text:
            return ""
        # Numeric-looking cells may arrive as numbers; .lower() needs a str.
        text = str(text)
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split() with no argument also collapses runs of whitespace left behind
    # by the two substitutions above.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [None]:
# Run clean_text over every free-text column; the label column is carried
# across unchanged. Key order matches the column order of the saved file.
text_fields = ('TITLE', 'BULLET_POINTS', 'BRAND', 'DESCRIPTION')
clean_train = {field: train[field].apply(clean_text) for field in text_fields}
clean_train['BROWSE_NODE_ID'] = train['BROWSE_NODE_ID']

In [None]:
clean_train_df = pd.DataFrame(clean_train)
# index=False: the row index carries no information here, and writing it would
# come back as a spurious "Unnamed: 0" column when the file is read again.
clean_train_df.to_csv('clean_train.csv', index=False)

In [None]:
# Column dtypes and non-null counts of the cleaned training frame.
clean_train_df.info()

In [None]:
# Reload the cleaned training data from disk, using the same parser settings
# as for the raw CSV files.
clean_train_df = pd.read_csv(
    'clean_train.csv',
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

In [None]:
# From here on `train` is the cleaned frame reloaded from clean_train.csv, not
# the raw frame read from train.csv. Both names refer to the same object.
train = clean_train_df

In [None]:
#train = train[pd.notnull(train["TITLE"]) & pd.notnull(train["DESCRIPTION"]) & pd.notnull(train["BULLET_POINTS"]) & pd.notnull(train["BRAND"])]
# test = test[pd.notnull(test["TITLE"]) & pd.notnull(test["DESCRIPTION"]) & pd.notnull(test["BULLET_POINTS"]) & pd.notnull(test["BRAND"])]

In [None]:
# Replace missing values with empty strings, in place, in both frames so the
# string concatenation that builds the model input never meets NaN.
train.fillna('', inplace=True)
test.fillna('', inplace=True)

In [None]:
# Free-text columns, concatenated in this order into one document per product.
TEXT_COLUMNS = ["TITLE", "DESCRIPTION", "BULLET_POINTS", "BRAND"]


def join_text_columns(frame, cleaner=None):
    """Build one space-separated text document per row of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every column listed in ``TEXT_COLUMNS``.
    cleaner : callable, optional
        Applied to each cell (after conversion to ``str``) before joining.
        ``None`` joins the cells unchanged.

    Returns
    -------
    pandas.Series
        Same index as ``frame``; each value is the joined text of that row.
    """
    joined = None
    for column in TEXT_COLUMNS:
        # Missing cells become "" and every cell is coerced to str, so the
        # concatenation cannot fail on NaN or on a numeric-looking column.
        part = frame[column].fillna('').map(str)
        if cleaner is not None:
            part = part.map(cleaner)
        joined = part if joined is None else joined + " " + part
    return joined


# `train` already holds cleaned text at this point, so it is joined as-is.
X = join_text_columns(train)
y = train["BROWSE_NODE_ID"]

# The test frame is still raw text; run it through clean_text so the model
# scores text normalised the same way as the text it is fitted on.
X_test = join_text_columns(test, cleaner=clean_text)

validation_size = 0.3
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=validation_size, random_state=RANDOM_SEED
)

# Report the sizes of the actual splits; re-deriving them from the fraction
# can be off by one because of rounding.
print("Training Size:- ", len(X_train))
print("Validation Size:- ", len(X_validation))
print("Test Size:- ", X_test.shape[0])

In [None]:
# First experiment: token counts -> TF-IDF weights -> XGBoost with the
# 'gpu_hist' tree method on GPU 0.
# NOTE(review): `nb` is rebound to a LinearSVC pipeline in a later cell, so the
# predictions made further down do not come from this model.
xgb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier(tree_method='gpu_hist', gpu_id=0)),
]
nb = Pipeline(xgb_steps)
print(X_train.shape)
# Fit on the first 15000 rows of the training split only.
nb.fit(X_train[:15000], y_train[:15000])

In [None]:
from numba import jit, cuda

In [None]:
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC
# Second experiment: token counts -> TF-IDF weights -> linear SVM.
# random_state pins LinearSVC's internal data shuffling so that re-running the
# notebook reproduces the same fitted model.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LinearSVC(random_state=RANDOM_SEED)),
              ])
print(X_train.shape)
def cuda_Train():
    """Fit the module-level ``nb`` pipeline on the first 100000 training rows.

    Despite its name, this function carries no numba/CUDA decorator; it only
    calls ``Pipeline.fit`` on slices of ``X_train`` and ``y_train``.
    """
    nb.fit(X_train[:100000], y_train[:100000])

cuda_Train()

In [None]:
# Score on the first 10000 rows of the validation split only.
y_pred = nb.predict(X_validation[:10000])
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print('Accuracy:-',100*accuracy_score(y_validation[:10000], y_pred))

In [None]:
# Predict a label for every row of the test set.
y_ans = nb.predict(X_test)

In [None]:
# Start the submission frame from the test set's product identifiers.
for_sub = pd.DataFrame({'PRODUCT_ID': test['PRODUCT_ID']})

In [None]:
# Column dtypes and non-null counts of the submission frame so far.
for_sub.info()

In [None]:
# Attach the predicted labels; y_ans holds one prediction per test row, in row order.
for_sub['BROWSE_NODE_ID']=y_ans

In [None]:
# Write the submission file without the row index column.
for_sub.to_csv('submission.csv',index=False)

In [None]:
# Preview the first five submission rows (head() defaults to n=5).
for_sub.head()