### Imported required libraries

In [325]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns 
import csv

import warnings
warnings.filterwarnings('ignore')

### NLP tool-kit

In [326]:
import dask.dataframe as dd
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

### loading the datasets

In [327]:
data = pd.read_csv("train.csv",escapechar="\\",quoting=csv.QUOTE_NONE,nrows=10000)

In [328]:
test = pd.read_csv("test.csv",escapechar="\\",quoting=csv.QUOTE_NONE)

In [329]:
data.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [330]:
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [331]:
test['PRODUCT_ID'] #taken as a key value as per competition

0              1
1              2
2              3
3              4
4              5
           ...  
110770    110771
110771    110772
110772    110773
110773    110774
110774    110775
Name: PRODUCT_ID, Length: 110775, dtype: int64

### Finding missing values if there any in both the datasets

In [332]:
data.isnull().sum()

TITLE                0
DESCRIPTION       2913
BULLET_POINTS      704
BRAND              279
BROWSE_NODE_ID       0
dtype: int64

In [333]:
test.isnull().sum()

PRODUCT_ID           0
TITLE                7
DESCRIPTION      10469
BULLET_POINTS     6786
BRAND             2430
dtype: int64

### Filling the NAN values with zero

In [334]:
train_data=data.fillna(0)
test=test.fillna(0)

### combining the particular columns for text preprocessing  

In [335]:
cols = ['TITLE','DESCRIPTION','BULLET_POINTS']
train_data["text"] =train_data["TITLE"].astype(str) + train_data["DESCRIPTION"].astype(str) + train_data["BULLET_POINTS"].astype(str)
train_data.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID,text
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0,"Pete The Cat Bedtime Blues Doll, 14.5 InchPete..."
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ..."
2,The Ultimate Self-Sufficiency Handbook: A Comp...,0,Skyhorse Publishing,imusti,2,The Ultimate Self-Sufficiency Handbook: A Comp...
3,Amway Nutrilite Kids Chewable Iron Tablets (100),0,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3,Amway Nutrilite Kids Chewable Iron Tablets (10...
4,Teacher Planner Company A4 6 Lesson Academic T...,0,0,0,4,Teacher Planner Company A4 6 Lesson Academic T...


### same as per training data

In [336]:
cols = ['TITLE','DESCRIPTION','BULLET_POINTS']
test["text"] =test["TITLE"].astype(str) + test["DESCRIPTION"].astype(str) + test["BULLET_POINTS"].astype(str)
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,text
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command,"Command 3M Small Kitchen Hooks, White, Decorat..."
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands,"NFL Detroit Lions Portable Party Fridge, 15.8 ..."
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic,Panasonic Single Line KX-TS880MX Corded Phone ...
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero,Zero Baby Girl's 100% Cotton Innerwear Bloomer...


In [337]:
data = train_data.drop(cols, axis=1)
test = test.drop(cols, axis=1)

In [338]:
data.head()

Unnamed: 0,BRAND,BROWSE_NODE_ID,text
0,MerryMakers,0,"Pete The Cat Bedtime Blues Doll, 14.5 InchPete..."
1,The New Yorker,1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ..."
2,imusti,2,The Ultimate Self-Sufficiency Handbook: A Comp...
3,Amway,3,Amway Nutrilite Kids Chewable Iron Tablets (10...
4,0,4,Teacher Planner Company A4 6 Lesson Academic T...


In [339]:
test.head()

Unnamed: 0,PRODUCT_ID,BRAND,text
0,1,Command,"Command 3M Small Kitchen Hooks, White, Decorat..."
1,2,O'Neal,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...
2,3,Boelter Brands,"NFL Detroit Lions Portable Party Fridge, 15.8 ..."
3,4,Panasonic,Panasonic Single Line KX-TS880MX Corded Phone ...
4,5,Zero,Zero Baby Girl's 100% Cotton Innerwear Bloomer...


### cleaning HTML tags , punctuatuion and non alphabetic characters from the text

In [340]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings

def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [341]:
data['text'] = data['text'].str.lower()
data['text'] = data['text'].apply(cleanHtml)
data['text'] = data['text'].apply(cleanPunc)
data['text'] = data['text'].apply(keepAlpha)

test['text'] = test['text'].str.lower()
test['text'] = test['text'].apply(cleanHtml)
test['text'] = test['text'].apply(cleanPunc)
test['text'] = test['text'].apply(keepAlpha)

In [342]:
data.head()

Unnamed: 0,BRAND,BROWSE_NODE_ID,text
0,MerryMakers,0,pete the cat bedtime blues doll inchpete t...
1,The New Yorker,1,the new yorker nyhm refrigerator magnet x ...
2,imusti,2,the ultimate self sufficiency handbook a comp...
3,Amway,3,amway nutrilite kids chewable iron tablets ...
4,0,4,teacher planner company a lesson academic t...


In [343]:
test.head()

Unnamed: 0,PRODUCT_ID,BRAND,text
0,1,Command,command m small kitchen hooks white decorate ...
1,2,O'Neal,oneal jump hardware jag unisex adult glove bla...
2,3,Boelter Brands,nfl detroit lions portable party fridge qu...
3,4,Panasonic,panasonic single line kx ts mx corded phone wh...
4,5,Zero,zero baby girls cotton innerwear bloomer dra...


In [344]:
import re
punc = r'[^\w\s]|^[a-zA-z]+|[a-zA-z]+"" "'
for ele in train_data:
	if ele in punc:
		data = data.replace(ele,"")

In [345]:
import re
punc = r'[^\w\s]|^[a-zA-z]+|[a-zA-z]+"" "'
for ele in train_data:
	if ele in punc:
		test = test.replace(ele,"")

In [346]:
import string
from string import punctuation
stop_words = stopwords.words("english")
punctuation = punctuation + '\n' + '—' + '“' + ',' + '”' + '‘' + '-' + '’'

### process of transforming a text into canonical (standard ) form  

In [347]:
def normalize_tokens(list_of_tokens):
    return map(lambda x: x.lower(),list_of_tokens)

In [348]:
def contractions_expansion(list_of_tokens):
    return map(contracted_word_expansion,list_of_tokens)

### applied work tokenize for saperating a piece of text into smaller units 

In [349]:
data['text'] = data['text'].apply(lambda x: word_tokenize(x))
test['text'] = test['text'].apply(lambda x: word_tokenize(x))

data['text'] = data['text'].apply(lambda x: normalize_tokens(x))
test['text'] = test['text'].apply(lambda x: normalize_tokens(x))

In [350]:
data['text'] = data['text'].apply(lambda x: list(x))
test['text'] = test['text'].apply(lambda x: list(x))

###  universal regular expression for removal of special and unneccessary blocks

In [351]:
regex = r'^@[a-zA-z0-9]|^#[a-zA-Z0-9]|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+'

In [352]:
def waste_word_or_not(token):
    return re.search(regex,token)

In [353]:
def filter_waste_words(list_of_tokens):
    return filterfalse(waste_word_or_not,list_of_tokens)

In [354]:
def split(list_of_tokens):
    return map(lambda x: re.split(regex,x)[0],list_of_tokens)

In [355]:
data['text'] = data['text'].apply(lambda x: filter_waste_words(x))
test['text'] = test['text'].apply(lambda x: filter_waste_words(x))

In [356]:
data['text'] = data['text'].apply(lambda x: list(x))
test['text'] = test['text'].apply(lambda x: list(x))

### removing the most common words from the text

In [357]:
en_stop_words = list(set(stopwords.words('english')).union(set(STOP_WORDS)))

In [358]:
def is_stopword(token):
    return not(token in en_stop_words or re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',token))

In [359]:
def stopwords_removal(list_of_tokens):
    return filter(is_stopword,list_of_tokens)

In [360]:
data['text'] = data['text'].apply(lambda x: stopwords_removal(x))
test['text'] = test['text'].apply(lambda x: stopwords_removal(x))

data['text'] = data['text'].apply(lambda x: list(x))
test['text'] = test['text'].apply(lambda x: list(x))

In [361]:
data.head()

Unnamed: 0,BRAND,BROWSE_NODE_ID,text
0,MerryMakers,0,"[pete, cat, bedtime, blues, doll, inchpete, ca..."
1,The New Yorker,1,"[new, yorker, nyhm, refrigerator, magnet, new,..."
2,imusti,2,"[ultimate, self, sufficiency, handbook, comple..."
3,Amway,3,"[amway, nutrilite, kids, chewable, iron, table..."
4,0,4,"[teacher, planner, company, lesson, academic, ..."


In [362]:
test.head()

Unnamed: 0,PRODUCT_ID,BRAND,text
0,1,Command,"[command, small, kitchen, hooks, white, decora..."
1,2,O'Neal,"[oneal, jump, hardware, jag, unisex, adult, gl..."
2,3,Boelter Brands,"[nfl, detroit, lions, portable, party, fridge,..."
3,4,Panasonic,"[panasonic, single, line, kx, ts, mx, corded, ..."
4,5,Zero,"[zero, baby, girls, cotton, innerwear, bloomer..."


### Once the tuple is given to the Counter, it will be converted to a hashtable object wherein the elements will become keys and the values will be the count of the elements from the tuple given

In [None]:
from collections import Counter

In [364]:
data['text'] = data['text'].apply(lambda x:[k for k, v in Counter(x).most_common(5)])
test['text'] = test['text'].apply(lambda x:[k for k, v in Counter(x).most_common(5)])

In [365]:
data = data.drop(columns='BRAND',axis=1)
test = test.drop(columns='BRAND',axis=1)

In [366]:
def listtostring(s):

  text = ", "
  return (text.join(s))

In [367]:
data['text'] = data['text'].apply(lambda x:listtostring(x))
test['text'] = test['text'].apply(lambda x:listtostring(x))

In [368]:
data.head()

Unnamed: 0,BROWSE_NODE_ID,text
0,0,"cat, pete, bedtime, blues, doll"
1,1,"new, yorker, magnet, inch, handsome"
2,2,"ultimate, self, sufficiency, handbook, complete"
3,3,"tablets, nutrilite, kids, chewable, iron"
4,4,"teacher, planner, company, lesson, academic"


In [369]:
test.head()

Unnamed: 0,PRODUCT_ID,text
0,1,"hooks, easy, command, damage, free"
1,2,"synthetic, leather, palm, double, layer"
2,3,"home, fridge, nfl, party, brands"
3,4,"features, display, redial, corded, phone"
4,5,"panties, baby, wash, girls, wear"


In [370]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

### applied Tf-IDF to transform text into a meaninful representation of numbers 

In [371]:
vectorizer=TfidfVectorizer(strip_accents='unicode',analyzer='word',ngram_range=(1,3),norm='l2')

In [372]:
vectorizer.fit(data['text'])
vectorizer.fit(test['text'])

TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')

### Feature engineering 

In [373]:
x_train = vectorizer.transform(data['text'])
y_train = data.drop(labels=['text'],axis=1)

In [374]:
x_train

<10000x356470 sparse matrix of type '<class 'numpy.float64'>'
	with 62475 stored elements in Compressed Sparse Row format>

In [375]:
y_train

Unnamed: 0,BROWSE_NODE_ID
0,0
1,1
2,2
3,3
4,4
...,...
9995,5
9996,1213
9997,1045
9998,2603


### multiclass logistic regression // 

In [376]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

### where SAG supports the non smooth panelty option and N_jobs suggests that each process runs in a different core

In [377]:
LogReg_pipeline = Pipeline([('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),])

### model training

In [378]:
LogReg_pipeline.fit(x_train, y_train)

Pipeline(steps=[('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(solver='sag'),
                                     n_jobs=-1))])

In [379]:
vectorizer.fit(test['text'])

x_test = vectorizer.transform(test['text'])

In [380]:
x_test

<110775x356470 sparse matrix of type '<class 'numpy.float64'>'
	with 1328027 stored elements in Compressed Sparse Row format>

In [383]:
y_test = test.drop(labels=['text'],axis=1)

### predicting the score 

In [384]:
prediction = LogReg_pipeline.predict(x_test)

In [385]:
prediction

array([1045, 1045, 1045, ..., 1045,    4,    5], dtype=int64)

In [388]:
test_label = pd.DataFrame(list(prediction))

In [389]:
test_label

Unnamed: 0,0
0,1045
1,1045
2,1045
3,1045
4,1687
...,...
110770,1045
110771,1045
110772,1045
110773,4


In [390]:
product_id = test['PRODUCT_ID']

### defining the key and values as per the hackathon

In [392]:
submission = pd.DataFrame()
submission['PRODUCT_ID'] = test['PRODUCT_ID']
submission['BROWSE_NODE_ID'] = test_label
submission

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1045
1,2,1045
2,3,1045
3,4,1045
4,5,1687
...,...,...
110770,110771,1045
110771,110772,1045
110772,110773,1045
110773,110774,4
