**NOTE:**
This notebook was used to try out various models for the product categorization challenge as part of the **DL Hackathon conducted by Analytics Club of IITM in association with Tech-Soc IITM**.

**NOTE:**

Various kinds of models, including Transformers (inference time 5 mins), BERT (inference time 12 mins) and LSTM-based models, were tried out. The deep-learning-based models gave a slight rise in accuracy score at the cost of a huge inference time, whereas the LinearSVC (linear SVM) model with a TfidfVectorizer reached an accuracy very close to the above-mentioned models with a minimal inference time of 23 secs for predicting about 8k test datapoints.

In [None]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import re

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"
# /content/gdrive/My Drive/Kaggle is the path where kaggle.json is present in the Google Drive

In [None]:
! kaggle competitions download -c techsoc-analytics-21-22 

In [None]:
#unzipping the zip files and deleting the zip files
!unzip \test.csv.zip  && rm test.csv.zip
!unzip \train.csv.zip  && rm train.csv.zip

In [None]:
df=pd.read_csv('/content/train.csv')
sample_submission=pd.read_csv('/content/sample_submission.csv')

In [None]:
df

In [None]:
df['target_ind'].unique().shape #we have to categorize using the captions into these 500 categories

# LSTM Model

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
import plotly.graph_objs as go

In [None]:
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')

# Lowercase both text columns, then strip every character that is not a letter or a
# space (in the regex, ^ inside [] means "not").
# The result is assigned back to the column: calling `.replace(..., inplace=True)` on
# `df[col]` mutates a column selection, which pandas treats as chained assignment and
# which does not update `df` once Copy-on-Write is active.
df['content'] = df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
df['title'] = df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

# Raw strings so that \[ \] \| reach the regex engine unchanged instead of being
# parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators turned into a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # everything outside this set is deleted
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalize one piece of text for tokenization.

        text: a string (may contain HTML markup)

        return: the text with markup removed, lowercased, separator symbols replaced
                by spaces, remaining disallowed symbols deleted and English stopwords
                dropped; words are re-joined with single spaces
    """
    text = BeautifulSoup(text, "lxml").text # parse as HTML and keep only the text content
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    # split() also collapses runs of whitespace, so the result is single-space separated
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
df['content'] = df['content'].apply(clean_text)
df['title'] = df['title'].apply(clean_text)

In [None]:
# Join title and content with a space; without it the last word of the title and the
# first word of the content are fused into one (unknown) token.
df['X'] = df['title'] + ' ' + df['content']

In [None]:
# Vocabulary cap handed to the Tokenizer as num_words (most frequent words are kept).
MAX_NB_WORDS = 70000
# Length every title+content sequence is padded/truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 250
# Size of each word vector in the Embedding layer of the LSTM model.
EMBEDDING_DIM = 100
# `filters` lists the characters the Tokenizer strips before splitting on whitespace.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
# Build the word -> index vocabulary from the combined title+content column.
tokenizer.fit_on_texts(df['X'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
X = tokenizer.texts_to_sequences(df['X'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
Y = pd.get_dummies(df['target_ind']).values
print('Shape of label tensor:', Y.shape)

In [None]:
X_train=X
Y_train=Y
print(X_train.shape,Y_train.shape)

In [None]:
# Embedding -> dropout -> single LSTM -> softmax over the target categories.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per one-hot column of Y, so the layer always matches the labels
# actually present in the training data (500 for this competition).
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print() only
# adds a stray "None" line to the output.
model.summary()

In [None]:
# Replaces the freshly built `model` with a previously saved one from Google Drive.
# NOTE(review): the checkpoint is only written by the torch.save cell further down, so
# on a fresh run this file must already exist on Drive. `model` is presumably the Keras
# model (it is used with .fit/.predict afterwards), while torch.load/torch.save are
# pickle-based PyTorch helpers — confirm this round-trips, or use Keras' own save/load.
model=torch.load('/content/gdrive/MyDrive/Pre-Inter IIT/LSTMmodel.pth') 

In [None]:
#val=38.83
epochs = 1
batch_size = 1024

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.15,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
torch.save(model, '/content/gdrive/MyDrive/Pre-Inter IIT/LSTMmodel.pth')

In [None]:
test_df=pd.read_csv('/content/test.csv')

In [None]:
# Same normalization as the training frame: lowercase, then strip every character that
# is not a letter or a space (in the regex, ^ inside [] means "not").
# Assigned back to the column instead of `.replace(..., inplace=True)` on `test_df[col]`,
# which is chained in-place mutation and does not update the frame under Copy-on-Write.
test_df['content'] = test_df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
test_df['title'] = test_df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

test_df['content'] = test_df['content'].apply(clean_text)
test_df['title'] = test_df['title'].apply(clean_text)

In [None]:
# Join title and content with a space; without it the last word of the title and the
# first word of the content are fused into one (unknown) token.
test_df['X'] = test_df['title'] + ' ' + test_df['content']

In [None]:
X = tokenizer.texts_to_sequences(test_df['X'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
preds=model.predict(X)

In [None]:
preds=np.argmax(preds,axis=1)

In [None]:
preds

In [None]:
test_df['target_ind']=preds

In [None]:
test_df=test_df.drop(['content', 'title', 'X'],axis=1)

In [None]:
test_df.to_csv('/content/submission.csv',index=False)

# Bidirectional LSTM

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [None]:
# Lowercase both text columns, then strip every character that is not a letter or a
# space (in the regex, ^ inside [] means "not").
# Assigned back to the column instead of `.replace(..., inplace=True)` on `df[col]`,
# which is chained in-place mutation and does not update `df` under Copy-on-Write.
df['content'] = df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
df['title'] = df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

# Raw strings so that \[ \] \| reach the regex engine unchanged instead of being
# parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators turned into a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # everything outside this set is deleted

def clean_text(text):
    """
        Normalize one piece of text for tokenization (no HTML or stopword handling).

        text: a string

        return: the lowercased text with separator symbols replaced by spaces and all
                remaining characters outside [0-9a-z #+_] deleted
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    return text
    
df['content'] = df['content'].apply(clean_text)
df['title'] = df['title'].apply(clean_text)

In [None]:
train_df=df

In [None]:
max_features=10000 # vocabulary cap passed to the Tokenizer as num_words
maxlen=1 # length passed to pad_sequences. NOTE(review): with maxlen=1 every title/content is cut down to a single token id — looks like a debugging value, confirm the intended sequence length

In [None]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step
tok.fit_on_texts(list(train_df['content'])+list(train_df['title'])) #fit to cleaned text

In [None]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1

In [None]:
text_df = tok.texts_to_sequences(list(train_df['content'])) #this is how we create sequences
text_df = tf.keras.preprocessing.sequence.pad_sequences(text_df, maxlen=maxlen) #let's execute pad step

title_df = tok.texts_to_sequences(list(train_df['title'])) #this is how we create sequences
title_df = tf.keras.preprocessing.sequence.pad_sequences(title_df, maxlen=maxlen)

In [None]:
# NOTE(review): title_df and text_df are padded integer token-id arrays produced by
# pad_sequences, so `+` adds the ids element-wise and the sums no longer refer to real
# vocabulary entries. Concatenating along axis 1 (np.concatenate([title_df, text_df], axis=1))
# was presumably intended; that would also require the model's input_length to become
# 2*maxlen — confirm before changing.
train_df = title_df + text_df

In [None]:
#One-Hot Encoding for the Target Categories
from sklearn.preprocessing import OneHotEncoder
categorical_cols=['target_ind']
# Keep the encoder's default sparse output and densify explicitly below: the `sparse=`
# keyword was renamed to `sparse_output=` in newer scikit-learn, so passing either
# spelling ties the cell to one library version.
encoder = OneHotEncoder(handle_unknown='ignore').fit(df[categorical_cols])
# `get_feature_names` was replaced by `get_feature_names_out`; use whichever exists.
_feature_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_cols = list(_feature_names(categorical_cols))
# Dense (n_rows, n_categories) array of 0/1 floats, one column per distinct target_ind.
target_output = encoder.transform(df[categorical_cols]).toarray()

Y=target_output

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1)

In [None]:
X_train=X_train[:2000]
y_train=y_train[:2000]

In [None]:
embedding_dim = 50

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(500, activation=tf.nn.softmax)
])

In [None]:
# Compile the bidirectional LSTM for multi-class classification.
model.compile(optimizer='adam',
              loss='categorical_crossentropy', # targets in Y are one-hot rows from the OneHotEncoder
              metrics=['accuracy'])

In [None]:
model.fit(np.array(X_train), np.array(y_train), epochs=1)

In [None]:
# The LSTM section drops `title`/`content` from test_df before writing its submission,
# so reload the raw test file when the text columns are no longer there.
if 'content' not in test_df.columns:
    test_df = pd.read_csv('/content/test.csv')

# Lowercase, then strip every character that is not a letter or a space (in the regex,
# ^ inside [] means "not"). Assigned back to the column instead of using
# `.replace(..., inplace=True)` on `test_df[col]`, which is chained in-place mutation
# and does not update the frame under Copy-on-Write.
test_df['content'] = test_df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
test_df['title'] = test_df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

test_df['content'] = test_df['content'].apply(clean_text)
test_df['title'] = test_df['title'].apply(clean_text)

text_df = tok.texts_to_sequences(list(test_df['content'])) #this is how we create sequences
text_df = tf.keras.preprocessing.sequence.pad_sequences(text_df, maxlen=maxlen) #let's execute pad step

title_df = tok.texts_to_sequences(list(test_df['title'])) #this is how we create sequences
title_df = tf.keras.preprocessing.sequence.pad_sequences(title_df, maxlen=maxlen)

test_df = title_df + text_df
X_test=test_df

In [None]:
test_df=pd.read_csv('/content/test.csv')
test_df.drop('title',axis=1,inplace=True)
test_df.drop('content',axis=1,inplace=True)

In [None]:
test_df['target_ind']=0

In [None]:
model.predict(X_test[0]).shape

In [None]:
np.argmax(model.predict(X_test[9]),axis=1)

In [None]:
np.argmax(model.predict(X_test[12]),axis=1)

In [None]:
# Predict the whole test matrix in one batched call. The previous per-row loop passed
# 1-D rows to model.predict and stored the raw probability array in `target_ind`
# instead of the predicted class.
probs = model.predict(X_test)          # (n_test, n_classes) class probabilities
preds = np.argmax(probs, axis=1)       # index of the most probable one-hot column
# NOTE(review): this assumes the one-hot column index equals the target_ind label
# (labels 0..n_classes-1); otherwise map through encoder.categories_[0] — confirm.
test_df['target_ind'] = preds

In [None]:
test_df.to_csv('submission.csv',index=False)

# LinearSVC
This model gave the highest accuracy with the least inference time.

In [None]:
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')

# Lowercase both text columns, then strip every character that is not a letter or a
# space (in the regex, ^ inside [] means "not").
# Assigned back to the column instead of `.replace(..., inplace=True)` on `df[col]`,
# which is chained in-place mutation and does not update `df` under Copy-on-Write.
df['content'] = df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
df['title'] = df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

# Remove the 'amazoncom' token (what is left of "amazon.com" after punctuation removal).
df['title'] = df['title'].replace({'amazoncom': ''}, regex=True)
df['content'] = df['content'].replace({'amazoncom': ''}, regex=True)

# Raw strings so that \[ \] \| reach the regex engine unchanged instead of being
# parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators turned into a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # everything outside this set is deleted
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalize one piece of text before TF-IDF vectorization.

        text: a string (may contain HTML markup)

        return: the text with markup removed, lowercased, separator symbols replaced
                by spaces, remaining disallowed symbols deleted and English stopwords
                dropped; words are re-joined with single spaces
    """
    text = BeautifulSoup(text, "lxml").text # parse as HTML and keep only the text content
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    # split() also collapses runs of whitespace, so the result is single-space separated
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
df['content'] = df['content'].apply(clean_text)
df['title'] = df['title'].apply(clean_text)

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
df['X']=df['title']+' '+df['title']+' '+df['title']+' '+df['content']
#df['X']=df['content']
#df['X']=df['title']

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    """Return `text` with each whitespace-separated word replaced by its Porter stem."""
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return " ".join(stems)

# Stem the combined title/content training text in place of the raw words.
df["X"] = df["X"].apply(stem_words)

In [None]:
df.drop('title',axis=1,inplace=True)
df.drop('content',axis=1,inplace=True)

X_train=df['X']
y_train=df['target_ind']

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [None]:
# TF-IDF features for the LinearSVC model (fit on the training text only).
# Imported here because no earlier cell imports TfidfVectorizer; without it this cell
# raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words=stopwords)
X_train_cv = cv.fit_transform(X_train)
X_train_cv.shape

In [None]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_cv,y_train)

In [None]:
test_df=pd.read_csv('/content/test.csv')

In [None]:
# Same normalization as the training frame: lowercase, then strip every character that
# is not a letter or a space (in the regex, ^ inside [] means "not").
# Assigned back to the column instead of `.replace(..., inplace=True)` on `test_df[col]`,
# which is chained in-place mutation and does not update the frame under Copy-on-Write.
test_df['content'] = test_df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
test_df['title'] = test_df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

# The training text had the 'amazoncom' token removed; do the same here so the test
# features are built from identically preprocessed text.
test_df['title'] = test_df['title'].replace({'amazoncom': ''}, regex=True)
test_df['content'] = test_df['content'].replace({'amazoncom': ''}, regex=True)

test_df['content'] = test_df['content'].apply(clean_text)
test_df['title'] = test_df['title'].apply(clean_text)

test_df['X']=test_df['title']+' '+test_df['title']+' '+test_df['title']+' '+test_df['content']
#X_test=test_df['content']
#X_test=test_df['title']


from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
   """Return `text` with each whitespace-separated word replaced by its Porter stem."""
   stems = []
   for token in text.split():
      stems.append(stemmer.stem(token))
   return " ".join(stems)

# Stem the combined title/content test text, as was done for the training text.
test_df["X"] = test_df["X"].apply(stem_words)

X_test=test_df['X']

X_test_cv = cv.transform(X_test)

In [None]:
preds=clf.predict(X_test_cv)

In [None]:
test_df['target_ind']=preds
test_df.drop('title',axis=1,inplace=True)
test_df.drop('content',axis=1,inplace=True)

In [None]:
test_df.drop('X',axis=1,inplace=True)

In [None]:
test_df.to_csv('submission.csv',index=False)

In [None]:
import pickle

In [None]:
# Persist the fitted LinearSVC. The context manager flushes and closes the file even
# if pickling fails; the bare open() left the handle open.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# load the model from disk
# Only unpickle files written by this notebook: pickle.load executes arbitrary code
# embedded in the file. The context manager closes the handle afterwards.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Persist the fitted TfidfVectorizer next to the model. The context manager flushes
# and closes the file; the bare open() left the handle open.
filename = 'TfidfVectorizer.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

# MLP Classifier

In [None]:
# The LinearSVC section drops `title`/`content` from df, so reload the raw training
# file when the text columns are no longer there (otherwise this cell raises KeyError).
if 'content' not in df.columns:
    df = pd.read_csv('/content/train.csv')

# Lowercase, then strip everything except letters, digits, commas and spaces.
# Assigned back to the column instead of `.replace(..., inplace=True)` on `df[col]`,
# which is chained in-place mutation and does not update `df` under Copy-on-Write.
df['content'] = df['content'].str.lower().replace({r'[^a-zA-Z0-9, ]': ''}, regex=True)
                                                                                 # ^ means "not" in regex

#df['X']=df['title']+df['content']
df['X']=df['content']

df.drop('title',axis=1,inplace=True)
df.drop('content',axis=1,inplace=True)

X_train=df['X']
y_train=df['target_ind']

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(stop_words=stopwords)
X_train_cv = cv.fit_transform(X_train)

In [None]:
from sklearn.neural_network import MLPClassifier
mlp=MLPClassifier()
mlp.fit(X_train_cv[:],y_train[:])

In [None]:
#X_test=test_df['title']+test_df['content']
# The LinearSVC section drops `title`/`content` from test_df, so reload the raw test
# file when the text columns are no longer there (otherwise this cell raises KeyError).
if 'content' not in test_df.columns:
    test_df = pd.read_csv('/content/test.csv')

# Apply the same normalization the MLP training text received (lowercase; keep only
# letters, digits, commas and spaces) so train and test features are comparable.
X_test = test_df['content'].str.lower().replace({r'[^a-zA-Z0-9, ]': ''}, regex=True)
X_test_cv = cv.transform(X_test)
mlp_prediction=mlp.predict(X_test_cv)

In [None]:
test_df['target_ind']=mlp_prediction

In [None]:
test_df.drop('title',axis=1,inplace=True)
test_df.drop('content',axis=1,inplace=True)

In [None]:
test_df.to_csv('submission.csv',index=False)

# Using GloVe Embeddings
* Make separate LSTMs for predicting using content and title, and then ensemble them.

In [None]:
import spacy
# Load the English pipeline by its full package name: the "en" shortcut link was
# removed in spaCy v3, while "en_core_web_sm" works on both v2 and v3.
# Only the pipeline's tokenizer is used (see Vocabulary.tokenizer).
spacy_eng = spacy.load("en_core_web_sm")

class Vocabulary:
    """Word <-> integer-id mapping that is grown one sentence at a time.

    Ids 0, 1 and 2 are reserved for the 'PAD', 'SOS' and 'EOS' markers; every new
    word receives the next free id. Also tracks per-word counts, the number of
    sentences added and the token length of the longest sentence seen.
    """
    PAD_token = 0   # padding for short sentences
    SOS_token = 1   # start-of-sentence marker
    EOS_token = 2   # end-of-sentence marker

    def __init__(self, name):
        """Create a vocabulary called `name` that holds only the three reserved markers."""
        self.name = name
        # id -> word, and its inverse word -> id (same insertion order: PAD, SOS, EOS)
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.word2index = {marker: idx for idx, marker in self.index2word.items()}
        # occurrences per added word (the reserved markers are not counted)
        self.word2count = {}
        # number of distinct entries, reserved markers included
        self.num_words = len(self.index2word)
        self.num_sentences = 0
        self.longest_sentence = 0

    def add_word(self, word):
        """Register one occurrence of `word`, assigning it the next free id if it is new."""
        if word in self.word2index:
            # already known: only bump its counter
            self.word2count[word] += 1
            return
        new_id = self.num_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.num_words = new_id + 1

    def tokenizer(self, text):
        """Split `text` with the spaCy tokenizer and return the lowercased token strings."""
        lowered = []
        for token in spacy_eng.tokenizer(text):
            lowered.append(token.text.lower())
        return lowered

    def add_sentence(self, sentence):
        """Tokenize `sentence`, add every token, and update the sentence statistics."""
        tokens = self.tokenizer(sentence)
        for token in tokens:
            self.add_word(token)
        # remember the largest token count seen so far
        self.longest_sentence = max(self.longest_sentence, len(tokens))
        self.num_sentences += 1

    def to_word(self, index):
        """Return the word stored under id `index` (KeyError if the id is unknown)."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the id of `word` (KeyError if the word was never added)."""
        return self.word2index[word]

In [None]:
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')


# Earlier sections drop `title`/`content` from df, so reload the raw training file
# when the text columns are no longer there (otherwise this cell raises KeyError).
if 'content' not in df.columns:
    df = pd.read_csv('/content/train.csv')

# Lowercase both text columns, then strip every character that is not a letter or a
# space (in the regex, ^ inside [] means "not"). Assigned back to the column instead of
# `.replace(..., inplace=True)` on `df[col]`, which is chained in-place mutation and
# does not update `df` under Copy-on-Write.
df['content'] = df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
df['title'] = df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

# Raw strings so that \[ \] \| reach the regex engine unchanged instead of being
# parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators turned into a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')          # everything outside this set is deleted
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalize one piece of text before it is added to the vocabulary.

        text: a string (may contain HTML markup)

        return: the text with markup removed, lowercased, separator symbols replaced
                by spaces, remaining disallowed symbols deleted and English stopwords
                dropped; words are re-joined with single spaces
    """
    text = BeautifulSoup(text, "lxml").text # parse as HTML and keep only the text content
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    # split() also collapses runs of whitespace, so the result is single-space separated
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
df['content'] = df['content'].apply(clean_text)
df['title'] = df['title'].apply(clean_text)

In [None]:
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')

In [None]:
# creating the vocabulary 
vocab = Vocabulary('test')

# adding words to the vocabulary 
for sentence in df['content']:
    vocab.add_sentence(sentence) 

In [None]:
l=[]
for i in vocab.word2count.keys():
  if vocab.word2count[i]>1000:
    l.append(i)
l

In [None]:
# Encode every training `content` string as one row of vocabulary ids:
# SOS (1), then the ids of its tokens, then PAD (0) until 9396 token slots are filled,
# i.e. 9397 entries per row.
captions = []
for sentence in df['content']:
    token_ids = [vocab.to_index(word) for word in vocab.tokenizer(sentence)]
    # number of PAD entries needed to reach 9396 token slots (none if already that long)
    n_pad = max(0, 9396 - len(token_ids))
    captions.append([1] + token_ids + [0] * n_pad)

# converting list of word tokens to numpy array
captions = np.array(captions)
captions.shape

In [None]:
!kaggle datasets download -d anindya2906/glove6b 
!unzip "/content/glove6b.zip"

In [None]:
import numpy as np 
vocabs = vocab.word2index.keys()

def load_embeds(root_dir):
    """Read a GloVe-style text file into a dict of word -> vector.

    Each non-empty line is expected to be a word followed by its whitespace-separated
    float coefficients.

    root_dir: path of the embeddings text file (a file path, despite the name)

    return: dict mapping each word to a 1-D float32 numpy array
    """
    embeddings_index = dict()
    # Context manager closes the file even if a line fails to parse; the encoding is
    # explicit so reading does not depend on the platform default.
    with open(root_dir, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line: nothing to parse
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index
    
embeddings_index = load_embeds('../content/glove.6B.300d.txt')

In [None]:
def load_embed_weights(embeddings_index, embed_dim, vocab, vocab_size):
    """Build the embedding weight matrix for a vocabulary.

    embeddings_index: dict mapping word -> pretrained vector of length embed_dim
    embed_dim: length of every embedding vector
    vocab: iterable of words; the i-th word yielded fills row i of the matrix
    vocab_size: number of rows to allocate (rows not reached by `vocab` stay zero)

    return: torch tensor of shape (vocab_size, embed_dim); words missing from
            embeddings_index get a random normal vector (scale 0.6)
    """
    weights_matrix = np.zeros((vocab_size, embed_dim))

    for i, word in enumerate(vocab):
        vector = embeddings_index.get(word)
        if vector is None:
            # no pretrained vector for this word: fall back to a random one
            vector = np.random.normal(scale=0.6, size=(embed_dim, ))
        weights_matrix[i] = vector

    return torch.tensor(weights_matrix)


weights_matrix = load_embed_weights(embeddings_index, 300, vocabs, vocab.num_words)
weights_matrix.shape

In [None]:
captions.shape

In [None]:
# adding end tokens
# In every row, overwrite the first PAD (0) after the leading SOS with EOS (2).
# Rows without any PAD are left unchanged. The row width is taken from the row itself
# rather than from captions[1], which fails when there is only one row.
for row in captions:                       # each `row` is a view into `captions`
    pad_positions = np.flatnonzero(row[1:] == 0)
    if pad_positions.size:
        row[pad_positions[0] + 1] = 2      # +1 because the search skipped column 0

In [None]:
#One-Hot Encoding for the Target Categories
from sklearn.preprocessing import OneHotEncoder
categorical_cols=['target_ind']
# Keep the encoder's default sparse output and densify explicitly below: the `sparse=`
# keyword was renamed to `sparse_output=` in newer scikit-learn, so passing either
# spelling ties the cell to one library version.
encoder = OneHotEncoder(handle_unknown='ignore').fit(df[categorical_cols])
# `get_feature_names` was replaced by `get_feature_names_out`; use whichever exists.
_feature_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_cols = list(_feature_names(categorical_cols))
# Dense (n_rows, n_categories) array of 0/1 floats, one column per distinct target_ind.
target_output = encoder.transform(df[categorical_cols]).toarray()

In [None]:
# Initialise the lookup table from the GloVe-based `weights_matrix` computed earlier;
# a plain nn.Embedding(...) starts from random weights and ignores those vectors.
# from_pretrained keeps the weights frozen, which matches how the embedded inputs are
# detached before training. `.float()` converts the float64 matrix to float32.
embed = nn.Embedding.from_pretrained(weights_matrix.float())

In [None]:
x_train = np.array(captions[0:100])
y_train =  np.array(target_output[0:100])

In [None]:
del(captions)
del(target_output)

In [None]:
x_train.shape

In [None]:
# Embed the token ids and shape inputs/targets the way the DataLoader cell expects:
#   x_train: (n, seq_len, 300, 1) float32 — the trailing axis is squeezed off again
#            in the training loop
#   y_train: (n, n_classes, 1) float32
# Shapes are taken from the arrays themselves instead of hard-coding 9397 and 500,
# and the embedding output is used directly instead of round-tripping through numpy.
x_train = embed(torch.tensor(x_train)).detach()   # detach: inputs carry no gradient
x_train = x_train.unsqueeze(-1).type(torch.FloatTensor)
y_train = torch.tensor(y_train).unsqueeze(-1).type(torch.FloatTensor)

In [None]:
torch.cuda.empty_cache()

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split
trainset = [(x_train[i], y_train[i]) for i in range(len(x_train))]
batch_size = 32
trainloader = DataLoader(dataset=trainset, batch_size=batch_size, shuffle = True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class LSTM(nn.Module):
  """Single-layer LSTM classifier over already-embedded token sequences.

  embed_dim: size of each input vector (default 300)
  vocab_size, weights_matrix: accepted so the positional call
      LSTM(300, vocab_size, weights_matrix) works; they are not used because the
      notebook embeds the tokens before they reach the model
  hidden_dim: LSTM hidden size (default 512)
  num_classes: number of output scores (default 500)
  """

  def __init__(self, embed_dim=300, vocab_size=None, weights_matrix=None,
               hidden_dim=512, num_classes=500):

    super().__init__()
    self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first = True)
    self.fc = nn.Linear(hidden_dim, num_classes)
    self.relu = nn.ReLU()

  def forward(self,x):
    """Map x of shape (batch, seq_len, embed_dim) to scores of shape (batch, num_classes)."""
    # nn.LSTM returns (outputs, (h_n, c_n)); the earlier `self.lstm(x)(embedding_layer)`
    # tried to call that tuple with an undefined name.
    out, _ = self.lstm(x)
    # classify from the output at the last time step
    out = self.fc(self.relu(out[:,-1,:]))
    return out


In [None]:
# Train the PyTorch LSTM classifier on the embedded captions for a fixed number of epochs.
import numpy as np
# NOTE(review): best_loss is assigned but never read in this cell.
best_loss = 0 

# NOTE(review): `vocab_size` is not defined in this section; the only visible definition
# is in the Bidirectional LSTM section. Also check that the LSTM constructor accepts
# these three positional arguments.
model = LSTM(300,vocab_size,weights_matrix)

model.to(device)
# NOTE(review): mean-squared error against the one-hot targets; a cross-entropy loss is
# the usual choice for single-label classification — confirm this is intentional.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
epochs = 2
for epoch in range(epochs):
  trainloss = []  # per-batch losses of the current epoch
  for idx, (x,y) in enumerate(trainloader): 
    x = x.to(device)
    y = y.to(device)
    # squeeze(-1) drops the trailing singleton axis added when the tensors were prepared
    preds = model(x.squeeze(-1))
    optimizer.zero_grad()
    loss = criterion(preds, y.squeeze(-1))
    trainloss.append(loss.item())
    loss.backward()
    optimizer.step()
    # progress report every 250 batches: percentage of the epoch done and running mean loss
    if idx%250 == 0:
      print(f'epoch:{epoch+1}({idx*100/len(trainloader)}%)\t loss: {np.mean(trainloss)}')

In [None]:
model=torch.load('/content/gdrive/MyDrive/Pre-Inter IIT/model.pth',map_location=torch.device('cpu'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
del(x_train)
del(y_train)

In [None]:
torch.save(model, '/content/gdrive/MyDrive/Pre-Inter IIT/model.pth')

In [None]:
# Earlier sections drop `title`/`content` from test_df before writing their submission,
# so reload the raw test file when the text columns are no longer there.
if 'content' not in test_df.columns:
    test_df = pd.read_csv('/content/test.csv')

# Lowercase, then strip every character that is not a letter or a space (in the regex,
# ^ inside [] means "not"). Assigned back to the column instead of using
# `.replace(..., inplace=True)` on `test_df[col]`, which is chained in-place mutation
# and does not update the frame under Copy-on-Write.
test_df['content'] = test_df['content'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)
test_df['title'] = test_df['title'].str.lower().replace({r'[^a-zA-Z ]': ''}, regex=True)

test_df['content'] = test_df['content'].apply(clean_text)
test_df['title'] = test_df['title'].apply(clean_text)

In [None]:
# adding words to the vocabulary 
for sentence in test_df['content']:
    vocab.add_sentence(sentence) 

# converting the captions to tokens 
sent_idxs = [1] # sent_idxs is a list that will contain the indexes representing each word/token of the sentance
captions = [] # captions is a list that will contain the sent_idxs lists of all sentances
for idx, sentence in enumerate (test_df['content']): #Enumerate returns the index(idx) and the value inside that index(sentance)
    i = 0                                        # we use idx to just keep track of the iterations
    '''for word in sentence.split(' '):
        if i==0:
            sent_idxs.append(1)
        index = vocab.to_index(word)    
        if index != 3:    
            sent_idxs.append(index)
        elif index == 3 and i !=0:
            sent_idxs.append(2)
        i+=1'''
    #for word in sentence.split(' '):
    for word in vocab.tokenizer(sentence):
        sent_idxs.append(vocab.to_index(word))
        i+=1
    while i < (9396): #padding the sentances until length of the sentace becomes equal to the length of the sentance the lstm model is trained on
        sent_idxs.append(0)
        i+=1
    captions.append(sent_idxs)
    sent_idxs = [1]

# converting list of word tokens to numpy array
captions = np.array(captions)

# adding end tokens
# In every row, overwrite the first PAD (0) after the leading SOS with EOS (2).
# Rows without any PAD are left unchanged. The row width is taken from the row itself
# rather than from captions[1], which fails when there is only one row.
for row in captions:                       # each `row` is a view into `captions`
    pad_positions = np.flatnonzero(row[1:] == 0)
    if pad_positions.size:
        row[pad_positions[0] + 1] = 2      # +1 because the search skipped column 0

In [None]:
del(df)

In [None]:
test_df.drop('title',axis=1, inplace=True)
test_df.drop('content',axis=1, inplace=True)

In [None]:
test_df['target_ind']=0

In [None]:
# NOTE(review): this creates a brand-new embedding table for inference. Its weights are
# initialised independently of the `embed` that produced x_train, so the model sees
# different vectors for the same token ids than it was trained on. `vocabs` is a live
# view of vocab.word2index, which has grown by the test-set words added above, so the
# table size also differs from training time. Reusing the training-time embeddings
# (with a policy for unseen words) is presumably what is needed — confirm.
embed = nn.Embedding(num_embeddings = len(vocabs), embedding_dim = 300)

In [None]:
x=embed(captions)

In [None]:
captions

In [None]:
x_test = np.array(captions[8])
x_test = np.reshape(x_test, (1 , 1, 9397, 1))
x_test = torch.tensor(x_test)
X_test=embed(x_test)
del(x_test)
x_test=X_test.detach().numpy()
del(X_test)
x_test = np.reshape(x_test, (1 , 9397, 300, 1))
x_test = torch.tensor(x_test)
x_test = x_test.type(torch.FloatTensor)
model.to('cpu')
torch.cuda.empty_cache()
pred = model(x_test.squeeze(-1))
pred=pred.cpu()
pred=pred.detach().numpy()
target_ind=np.argmax(pred,axis=1)
target_ind

In [None]:
x_test = np.array(captions[0])
x_test = np.reshape(x_test, (1 , 1, 9397, 1))
x_test = torch.tensor(x_test)
X_test=embed(x_test)
del(x_test)
x_test=X_test.detach().numpy()
del(X_test)
x_test = np.reshape(x_test, (1 , 9397, 300, 1))
x_test = torch.tensor(x_test)
x_test = x_test.type(torch.FloatTensor)
model.to('cpu')
torch.cuda.empty_cache()
pred = model(x_test.squeeze(-1))
pred=pred.cpu()
pred=pred.detach().numpy()
target_ind=np.argmax(pred,axis=1)
target_ind

In [None]:
model.to(device)

In [None]:
# Predict a class for every test row, one sequence at a time to keep memory low.
# eval() + no_grad(): inference only, so no autograd graph is built per sample.
model.eval()
with torch.no_grad():
    for i in test_df.index:
        token_ids = torch.tensor(captions[i]).unsqueeze(0)   # (1, seq_len) ids
        embedded = embed(token_ids).type(torch.FloatTensor)  # (1, seq_len, 300)
        scores = model(embedded.to(device))                  # (1, n_classes)
        # Store a plain int; the previous code wrote a length-1 array into the cell.
        # NOTE(review): assumes the output column index equals the target_ind label
        # (labels 0..n_classes-1); otherwise map through encoder.categories_[0] — confirm.
        target_ind = int(scores.argmax(dim=1).item())
        test_df.loc[i, 'target_ind'] = target_ind
        print(i, " ", target_ind)

In [None]:
test_df.to_csv('/content/submission.csv',index=False)

In [None]:
test_df