In [1]:
# Import libraries
import gensim.models as g
from collections import namedtuple
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sys 
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Configuration: path to the serialized pre-trained Doc2Vec model that the
# notebook loads further down.
doc2vec_dir = "model/enwiki_dbow/doc2vec.bin"

In [2]:
# Read in complaints
# Loads the full complaints export from the working directory into a
# DataFrame; the trailing expression displays (rows, columns) as the cell output.
complaints = pd.read_csv("Consumer_Complaints.csv")
complaints.shape

(881981, 18)

In [3]:
# Keep only the complaints that actually carry a free-text narrative;
# rows whose narrative column is null are discarded.
complaints = complaints.dropna(subset=["Consumer complaint narrative"])
complaints.shape

(187645, 18)

In [4]:
# Load the pre-trained Doc2Vec model from the path configured in `doc2vec_dir`.
# NOTE(review): model files are presumably deserialized via pickle under the
# hood — only load files from a trusted source; confirm against gensim docs.
pretrained_model = g.Doc2Vec.load(doc2vec_dir)

In [5]:
# Extract the narrative texts as a plain NumPy array.
# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same ndarray and works on every pandas version.
narratives = complaints["Consumer complaint narrative"].values

In [7]:
# Load previously saved document vectors from a comma-separated file.
# NOTE(review): the only visible producer of this file is the np.savetxt cell
# further down, and no cell in view computes `weights` — on a fresh kernel this
# depends on the CSV already existing on disk. Confirm where it comes from.
weights = np.genfromtxt('cfpb_weights.csv', delimiter=',')

In [None]:
# Pairwise cosine similarity between every pair of rows in `weights`.
# NOTE(review): the result is a dense n x n matrix. If `weights` has one row per
# narrative (presumably ~187k, per the shape printed above — TODO confirm), this
# will not fit in memory; consider chunking or a nearest-neighbour query instead.
similarity = cosine_similarity(weights)

In [None]:
# Persist `weights` as comma-separated text.
# NOTE(review): this writes to the same file that `weights` is loaded from
# earlier in the notebook, so on a top-to-bottom run it just rewrites the data
# it read. Presumably it belongs right after the (missing) cell that computes
# the vectors — verify before relying on Run All.
np.savetxt("cfpb_weights.csv", weights, delimiter=",")

In [35]:
def test(docs_test,docs_original,model):
    docvecs = []
    for test_doc in docs_test:
        docvec = model.infer_vector(test_doc)
        print(type(docvec))
        docvecs.append(docvec)
        sims = model.docvecs.most_similar(docvec)
        print('Test Document: «{}»\n'.format(test_doc))
        for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
            print(u'%s %s: «%s»\n' % (label, sims[index], docs_original[sims[index][0]]))
        print("\n")
    return docvecs

In [10]:
# Transform data (you can add more data preprocessing steps) 
def transform(docs_original):
    """Convert raw texts into tagged documents suitable for Doc2Vec training.

    Each text is lowercased, has every '.' removed and is split on whitespace.
    The resulting word list is paired with a single-element tag list holding
    the text's position in ``docs_original``.

    Parameters
    ----------
    docs_original : iterable of str
        Raw document texts.

    Returns
    -------
    list
        One ``AnalyzedDocument(words, tags)`` named tuple per input text, in
        input order.
    """
    AnalyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    return [
        AnalyzedDocument(raw.lower().replace('.', '').split(), [position])
        for position, raw in enumerate(docs_original)
    ]

In [None]:
# Split to train and test
# train_complaints, test_complaints = train_test_split(
#    complaints, test_size=0.1, random_state=42)

# Doc2Vec hyper parameters

# size = 10 # dimension of the hidden layer (default: 100)
# window = 5 # max distance between the predicted word and context words (default: 5)
# min_count = 5 # ignore all words with total frequency lower than this (default: 5)


# The ideal parameters

# vector_size = 300
# window_size = 15
# min_count = 1
# sampling_threshold = 1e-5
# negative_size = 5
# train_epoch = 100
# dm = 0 #0 = dbow; 1 = dmpv
# worker_count = 1 #number of parallel processes

# Transform narratives as input
# train_narratives = transform(train_complaints["Consumer complaint narrative"])

# Train model 
# model = g.doc2vec.Doc2Vec(train_narratives, size = size, window = window, min_count = min_count)

# Train ideal model 
# model = g.doc2vec.Doc2Vec(train_narratives, 
#                         size=vector_size, 
#                         window=window_size, 
#                         min_count=min_count, 
#                         sample=sampling_threshold, 
#                         workers=worker_count, 
#                         hs=0, 
#                         dm=dm, 
#                         negative=negative_size, 
#                         dbow_words=1,
#                         dm_concat=1, 
#                         iter=train_epoch)

# train_docs = np.array(list(train_complaints["Consumer complaint narrative"]))
# test_docs = np.array(list(test_complaints["Consumer complaint narrative"]))