### Validate ONPdoc2vec.ONPd2v class

In [1]:
import numpy as np
import pandas as pd
import pickle

import string
import nltk
from nltk.corpus import stopwords

from ONPdoc2vec import ONPd2v

In [2]:
cached_stopwords=stopwords.words('english')

punc=string.punctuation
table=str.maketrans('','',string.punctuation)

def prep(text):
    #word tokenization
    tokens=nltk.word_tokenize(text)
    
    #converting to lower case
    tokens=[t.lower() for t in tokens]
    
    #removing punctuations
    tokens=[t.translate(table) for t in tokens]
    
    #filter tokens by length (minimum 3 chars)
    tokens=[t for t in tokens if len(t)>2]
    
    #remove stopwords
    tokens=[t for t in tokens if t not in cached_stopwords]
    
    return ",".join( tokens)

In [3]:
prep( "This is for testing the NLP mini project").split(",")

['testing', 'nlp', 'mini', 'project']

In [4]:
size = 100
pca_count = 10

filename = '../data/output/SS_Extracted_content_NER_all.xlsx'
column = 'refined_content'

print( "Processing " + filename + " " + column)
df = pd.read_excel(filename)

df["refined_content"] = df.refined_content.apply( prep)

df.at[0, 'refined_content']


Processing ../data/output/SS_Extracted_content_NER_all.xlsx refined_content




In [5]:
d2v_model = ONPd2v( df[column].values, size=size, pca_count=pca_count)

# Save the model
d2v_model.save_model( '../data/output/models/d2v_content.model')

Number of valid training row entries : 7795
Doc2vec model creation completed
PCA fitting completed with n_components = 10


In [6]:
all_columns_list = [ "content_" + str(i) for i in range(size)]
pca_columns_list = [ "content_pca_" + str(i) for i in range(pca_count)]

df_all = df[["Id"]]
for pca_column in pca_columns_list:
    df_all[ pca_column] = 0.0

for all_column in all_columns_list:
    df_all[ all_column] = 0.0

for index, row in df_all.iterrows():
    pca_values = d2v_model.infer_vector_pca( df.at[ index, column])
    for i in range(pca_count):
        pca_column = "content_pca_" + str(i)
        df_all.at[ index, pca_column] = pca_values[i]

    all_values = d2v_model.infer_vector( df.at[ index, column])
    for i in range(size):
        all_column = "content_" + str(i)
        df_all.at[ index, all_column] = all_values[i]
        
    if index % 100 == 0:
        print( str(index) + ", ", end='')
    
print()

df_all.to_excel('../data/output/2_d2v_content.xlsx', index=False)

print( "Completed d2v model and matrix for entire content")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 




Completed d2v model and matrix for entire content


### for Keywords

In [7]:
df = pd.read_excel('../data/output/2_keywords_list.xlsx')
d2v_model = ONPd2v( df['keywords'].values, size=size, pca_count=pca_count)
column = "keywords"

# Save the model
d2v_model.save_model( '../data/output/models/d2v_keywords.model')


Number of valid training row entries : 7795
Doc2vec model creation completed
PCA fitting completed with n_components = 10


In [8]:
all_columns_list = [ "keywords_" + str(i) for i in range(size)]
pca_columns_list = [ "keywords_pca_" + str(i) for i in range(pca_count)]

df_all = df[["Id"]]
for pca_column in pca_columns_list:
    df_all[ pca_column] = 0.0

for all_column in all_columns_list:
    df_all[ all_column] = 0.0

for index, row in df_all.iterrows():
    pca_values = d2v_model.infer_vector_pca( df.at[ index, column])
    for i in range(pca_count):
        pca_column = "keywords_pca_" + str(i)
        df_all.at[ index, pca_column] = pca_values[i]

    all_values = d2v_model.infer_vector( df.at[ index, column])
    for i in range(size):
        all_column = "keywords_" + str(i)
        df_all.at[ index, all_column] = all_values[i]
        
    if index % 100 == 0:
        print( str(index) + ", ", end='')
    
print()

df_all.to_excel('../data/output/2_d2v_keywords.xlsx', index=False)

print( "Completed d2v model and matrix for all keywords")


0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 




Completed d2v model and matrix for all keywords


#### NER List d2v model and output

In [9]:
column = "NER_list"
df = pd.read_excel('../data/output/SS_Extracted_content_NER_text.xlsx')
d2v_model = ONPd2v( df[column].values, size=size, pca_count=pca_count)

# Save the model
d2v_model.save_model( '../data/output/models/d2v_NER_list.model')

Number of valid training row entries : 7793
Doc2vec model creation completed
PCA fitting completed with n_components = 10


In [10]:
all_columns_list = [ "NER_list_" + str(i) for i in range(size)]
pca_columns_list = [ "NER_list_pca_" + str(i) for i in range(pca_count)]

df_all = df[["Id"]]
for pca_column in pca_columns_list:
    df_all[ pca_column] = 0.0

for all_column in all_columns_list:
    df_all[ all_column] = 0.0

for index, row in df_all.iterrows():
    pca_values = d2v_model.infer_vector_pca( df.at[ index, column])
    for i in range(pca_count):
        pca_column = "NER_list_pca_" + str(i)
        df_all.at[ index, pca_column] = pca_values[i]

    all_values = d2v_model.infer_vector( df.at[ index, column])
    for i in range(size):
        all_column = "NER_list_" + str(i)
        df_all.at[ index, all_column] = all_values[i]
        
    if index % 100 == 0:
        print( str(index) + ", ", end='')
    
print()

df_all.to_excel('../data/output/2_d2v_NER_list.xlsx', index=False)

print( "Completed d2v model and matrix for all NER list")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 




Completed d2v model and matrix for all NER list


### NER most common

In [11]:
column = "NER_most_common"
df = pd.read_excel('../data/output/SS_Extracted_content_NER_text.xlsx')
d2v_model = ONPd2v( df[column].values, size=size, pca_count=pca_count)

# Save the model
d2v_model.save_model( '../data/output/models/d2v_NER_most_common.model')

Number of valid training row entries : 7793
Doc2vec model creation completed
PCA fitting completed with n_components = 10


In [12]:
all_columns_list = [ "NER_most_common_" + str(i) for i in range(size)]
pca_columns_list = [ "NER_most_common_pca_" + str(i) for i in range(pca_count)]

df_all = df[["Id"]]
for pca_column in pca_columns_list:
    df_all[ pca_column] = 0.0

for all_column in all_columns_list:
    df_all[ all_column] = 0.0

for index, row in df_all.iterrows():
    pca_values = d2v_model.infer_vector_pca( df.at[ index, column])
    for i in range(pca_count):
        pca_column = "NER_most_common_pca_" + str(i)
        df_all.at[ index, pca_column] = pca_values[i]

    all_values = d2v_model.infer_vector( df.at[ index, column])
    for i in range(size):
        all_column = "NER_most_common_" + str(i)
        df_all.at[ index, all_column] = all_values[i]
        
    if index % 100 == 0:
        print( str(index) + ", ", end='')
    
print()

df_all.to_excel('../data/output/2_d2v_NER_most_common.xlsx', index=False)

print( "Completed d2v model and matrix for all NER most common")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 




Completed d2v model and matrix for all NER most common
