In [34]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn


In [35]:
# Load the exported keyword list; the file is read as latin-1 text.
exported_keywords = pd.read_csv("personal-loan-kwds.csv", encoding="latin-1")

In [36]:
# Preview the first five rows of the raw keyword export.
exported_keywords.head(5)

Unnamed: 0,Keywords
0,personalloans
1,personaloaneasy
2,home remodeling loans
3,home improvement loans
4,home renovation loans


In [37]:
# Step - a : Remove blank rows if any.
# Calling dropna(inplace=True) on the selected column only alters a temporary
# Series and leaves the DataFrame untouched, so the rows are dropped on the
# frame itself. The index is reset because later cells use enumerate()
# positions as .loc row labels and therefore need a gap-free 0..n-1 index.
exported_keywords = exported_keywords.dropna(subset=['Keywords']).reset_index(drop=True)

In [38]:
# Step - b : Lower-case every keyword so that 'dog' and 'DOG' compare as the
# same string (Python string comparison is case-sensitive).
exported_keywords['Keywords'] = exported_keywords['Keywords'].map(lambda text: text.lower())

In [39]:
# Step - c : Tokenization - split each keyword phrase into a list of word tokens.
import nltk
nltk.download('punkt')  # tokenizer data fetched for word_tokenize

exported_keywords['Keywords'] = list(map(word_tokenize, exported_keywords['Keywords']))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\FitzRoy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize what is left.

import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb,
# adjective, etc. Any tag not listed below falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant setup, hoisted out of the loops: the stop-word list is loaded
# once into a set (fast membership tests) and a single lemmatizer is reused
# for every row instead of being rebuilt per keyword.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

keywords_final = []
for entry in exported_keywords['Keywords']:
    # pos_tag yields (word, tag) pairs; the first letter of the tag selects
    # the WordNet part of speech through tag_map.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        # Keep only purely alphabetic tokens that are not stop words.
        if word not in stop_words and word.isalpha()
    ]
    # Stored as the string form of the list, which the next cell strips back
    # down to a plain space-separated phrase.
    keywords_final.append(str(Final_words))

# Assign the whole column at once (positionally) rather than writing row by
# row with .loc[enumerate-position], which only lines up when the index is
# exactly 0..n-1.
exported_keywords['Keywords_final'] = keywords_final

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\FitzRoy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\FitzRoy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FitzRoy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Preview the first five rows after lemmatization.
exported_keywords.head(5)

Unnamed: 0,Keywords,Keywords_final
0,[personalloans],['personalloans']
1,[personaloaneasy],['personaloaneasy']
2,"[home, remodeling, loans]","['home', 'remodeling', 'loan']"
3,"[home, improvement, loans]","['home', 'improvement', 'loan']"
4,"[home, renovation, loans]","['home', 'renovation', 'loan']"


In [42]:
import re

# Turn the stringified token list (e.g. "['home', 'loan']") into a plain
# phrase ("home loan") by deleting commas, square brackets and single quotes.
# The pattern is a raw string: in a normal string literal "\[" and "\]" are
# invalid escape sequences that newer Python versions warn about.
list_punctuation = re.compile(r"[,\[\]']")

# Build the whole column in one pass and assign it positionally, so the result
# does not depend on the DataFrame index being exactly 0..n-1.
exported_keywords['Keywords_final_str'] = [
    list_punctuation.sub('', entry) for entry in exported_keywords['Keywords_final']
]


In [43]:
# Preview the first five rows, now including the cleaned phrase column.
exported_keywords.head(5)

Unnamed: 0,Keywords,Keywords_final,Keywords_final_str
0,[personalloans],['personalloans'],personalloans
1,[personaloaneasy],['personaloaneasy'],personaloaneasy
2,"[home, remodeling, loans]","['home', 'remodeling', 'loan']",home remodeling loan
3,"[home, improvement, loans]","['home', 'improvement', 'loan']",home improvement loan
4,"[home, renovation, loans]","['home', 'renovation', 'loan']",home renovation loan


In [44]:
# Row count before de-duplication.
print(exported_keywords.shape[0])

1051


In [45]:
# Keep only the first occurrence of each cleaned keyword phrase.
exported_keywords = exported_keywords.drop_duplicates(subset=['Keywords_final_str'])

In [46]:
# Row count after de-duplication.
print(exported_keywords.shape[0])

754


In [47]:
# Discard every row that still has a missing value in any column.
exported_keywords = exported_keywords.dropna(axis=0, how='any')

In [48]:
# Row count after removing rows with missing values.
print(exported_keywords.shape[0])

754


In [49]:
# Build a one-column frame holding only the cleaned phrases, exposed under
# the column name "Keywords".
df_Keywords = (
    exported_keywords['Keywords_final_str']
    .to_frame()
    .rename(columns={"Keywords_final_str": "Keywords"})
)

In [50]:
# Preview the first five cleaned keyword phrases.
df_Keywords.head(5)

Unnamed: 0,Keywords
0,personalloans
1,personaloaneasy
2,home remodeling loan
3,home improvement loan
4,home renovation loan


In [57]:
# Pair every cleaned keyword phrase with the number of words it contains.
cols = ['Keywords','Tokens']
# str.split() with no argument splits on runs of whitespace and returns an
# empty list for an empty string, so a keyword left empty by the cleaning
# steps counts as 0 tokens. Counting separators and adding 1 reported 1 for
# that case, and its "[ \s]+" pattern used an invalid "\s" escape in a
# non-raw string.
lst = [[keywords, len(keywords.split())] for keywords in df_Keywords['Keywords']]
df1 = pd.DataFrame(lst, columns=cols)
df1

Unnamed: 0,Keywords,Tokens
0,personalloans,1
1,personaloaneasy,1
2,home remodeling loan,3
3,home improvement loan,3
4,home renovation loan,3
...,...,...
749,credit card loan wiki,4
750,credit card debt loan,4
751,credit card loan definition,4
752,credit card personal loan,4


In [58]:
# Write the keyword/token-count table to disk without the index column.
df1.to_csv('personal-loan-kwd-update.csv', index=False)