In [4]:
# Import libraries

from gensim.models import doc2vec
from collections import namedtuple
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
complaints = pd.read_csv("Consumer_Complaints.csv")
complaints.shape

(881981, 18)

In [3]:
complaints = complaints[complaints["Consumer complaint narrative"].notnull()]
complaints.shape

(187645, 18)

In [13]:
train_complaints, test_complaints = train_test_split(
    complaints, test_size=0.1, random_state=42)

In [28]:
train_narratives = train_complaints["Consumer complaint narrative"]
test_narratives = test_complaints["Consumer complaint narrative"]

In [33]:
# Doc2Vec hyper parameters

size = 3 # dimension of the hidden layer (default: 100)
window = 5 # max distance between the predicted word and context words (default: 5)
min_count = 5 # ignore all words with total frequency lower than this (default: 5)

In [29]:
train_narratives = transform(train_complaints["Consumer complaint narrative"])

In [None]:
# Train model 
model = doc2vec.Doc2Vec(train_narratives, size = size, window = window, min_count = min_count)

In [32]:
# Test with new docs
test_docvecs = test(test_narratives[:3],train_narratives)

Test Document: «We had a loan modification done with Nationstar Mortgage to lower our payments, change to a fixed rate and avoid a foreclosure. But in the long run our payments went up and our mortgage was extended 10 years. Also, I did not receive a copy of the modification.»

MOST (142827, 0.8663327097892761): «AnalyzedDocument(words=['there', 'is', 'a', 'negative', 'mark', 'against', 'me', 'that', 'should', 'have', 'been', 'taken', 'off', 'years', 'ago!'], tags=[142827])»

MEDIAN (98008, 0.5247979164123535): «AnalyzedDocument(words=['i', 'have', 'had', 'on', 'my', 'credit', 'for', '2+', 'years', 'a', 'charge', 'of', 'xxxx', 'i', 'have', 'questioned', 'this', 'balance', 'numerous', 'times', 'and', 'my', 'response', 'from', 'xxxx', 'hospital', 'is', ';', 'payment', 'in', 'full', 'is', 'required', 'xxxx', 'hospital', 'accepted', 'a', 'negotiated', 'balance', 'from', 'the', 'xxxx', 'xxxx', 'via', 'xxxx', 'original', 'bill', 'was', 'xxxx,', 'xxxx', 'insurance', 'pd', 'xxxx', 'hospital', 

In [26]:
def test(docs_test,docs_original):
    docvecs = []
    for test_doc in docs_test:
        docvec = model.infer_vector(test_doc)
        docvecs.append(docvec)
        sims = model.docvecs.most_similar([docvec], topn=len(model.docvecs))
        print('Test Document: «{}»\n'.format(test_doc))
        for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
            print(u'%s %s: «%s»\n' % (label, sims[index], docs_original[sims[index][0]]))
        print("\n")
    return docvecs

In [7]:
# Transform data (you can add more data preprocessing steps) 
def transform(docs_original):
    docs = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for i, text in enumerate(docs_original):
        words = text.lower().replace('.','').split()
        tags = [i]
        docs.append(analyzedDocument(words, tags))
    return docs