In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')

In [2]:
#Pre-processing steps. 
stop_words = set(stopwords.words('english'))
def pre_process(msg):
    msg = str(msg)
    msg = msg.lower()
    msg = re.sub('[^a-zA-Z]',' ', msg)
    msg = nltk.word_tokenize(msg)
    msg = [wordnet.lemmatize(word) for word in msg if word not in stop_words]
    msg = ' '.join(msg)
    return msg

In [3]:
df = pd.read_excel('Training_1.xlsx')
df_new = df[['title','sentence','Quality','Features','Purchase/interaction experience (delivery/packaging, customer care etc)','Price']]
del df

In [4]:
df_new['Quality'] = pd.to_numeric(df_new['Quality'],errors = 'coerce')

In [5]:
#fill na values with zero 
df_new.fillna(value=0,inplace=True)

#a function to detect rows that don't have any classification labels
#For that we will take sum of all the classes, if the sum is zero, that row has no classes and can be removed
def na_class(df):
    idx_lst = []
    for i in range(0,len(df)):
        sum = df['Quality'].iloc[i] + df['Features'].iloc[i] + df['Purchase/interaction experience (delivery/packaging, customer care etc)'].iloc[i] + df['Price'].iloc[i]
        if sum == 0 :
            idx_lst.append(i)
    return idx_lst

In [6]:
# call the function
idx_lst = na_class(df_new)

In [7]:
#we can drop these rows
df_new.drop(idx_lst, inplace=True)

In [8]:
#function to create tokens 
def create_tokens(doc):
    doc = nlp(doc)
    tokens = [token.text for token in doc]
    tokens = list(dict.fromkeys(tokens))
    tokens = ' '.join(tokens)
    return tokens

# function to pre_process and get tokens for all the text files
def process(df):
    df_new1 = pd.DataFrame()
    df_new1['title'] = df['title'].apply(lambda x: pre_process(x))
    df_new1['sentence'] = df['sentence'].apply(lambda x: pre_process(x))
    text = df_new1['title'] + ' ' + df_new1['sentence']
    document = list()
    for i in range(0,len(text)):
        tokens = create_tokens(text.iloc[i])
        document.append(tokens)
    return document

In [9]:
X_data = process(df_new)

In [10]:
Y_data = df_new[['Quality','Features','Purchase/interaction experience (delivery/packaging, customer care etc)','Price']]

In [11]:
rev_name = pd.DataFrame(columns=['rev_name'], index = Y_data.index)
for x in Y_data.columns:
    idx = Y_data[Y_data[x]== 1 ].index
    rev_name['rev_name'].loc[idx] = x

In [13]:
X = pd.DataFrame(data = X_data, columns=['Text'])

In [14]:
final_df = pd.concat([X, rev_name], axis = 1 )

In [20]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11052 entries, 0 to 11173
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      9904 non-null   object
 1   rev_name  9904 non-null   object
dtypes: object(2)
memory usage: 259.0+ KB


#### Part 2 Doc2Vec

In [16]:
from sklearn.model_selection import train_test_split
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [17]:
train, test = train_test_split(final_df, test_size = 0.3, random_state=42)

In [19]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['Text'].split(), tags=[r.rev_name]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['Text'].split(), tags=[r.rev_name]), axis=1)

AttributeError: 'float' object has no attribute 'split'

In [28]:
final_df.to_csv('Processed_data.csv')