In [1]:
# Import libraries
import gensim.models as g
from collections import namedtuple
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the complaints export into a DataFrame, then display its (rows, columns) shape
complaints = pd.read_csv(filepath_or_buffer="Consumer_Complaints.csv")
complaints.shape

(881981, 18)

In [4]:
# Keep only the complaints that actually carry a narrative text,
# then display the remaining (rows, columns) shape
complaints = complaints.dropna(subset=["Consumer complaint narrative"])
complaints.shape

(187645, 18)

In [10]:
# Pull the free-text narrative column out of each split.
# NOTE(review): train_complaints / test_complaints are produced by the
# train_test_split cell, which appears *below* this cell in the notebook --
# on a fresh kernel this cell has to run after the split.
train_narratives, test_narratives = (
    split["Consumer complaint narrative"]
    for split in (train_complaints, test_complaints)
)

In [16]:
# Hold out 10% of the complaints as the test set; the fixed random_state
# makes the split repeatable from run to run
train_complaints, test_complaints = train_test_split(
    complaints,
    random_state=42,
    test_size=0.1,
)

# Doc2Vec hyper parameters

# dimension of the hidden layer (default: 100)
size = 10
# max distance between the predicted word and context words (default: 5)
window = 5
# ignore all words with total frequency lower than this (default: 5)
min_count = 5

In [17]:
# Transform narratives as input.
# The tagged documents are built straight from the training frame's narrative
# column rather than from train_narratives itself: feeding train_narratives
# back into transform() would fail on a second run of this cell (it would then
# hold tagged documents, which have no .lower()), and reading from
# train_complaints guarantees the tags line up with the split used for lookup.
train_narratives = transform(train_complaints["Consumer complaint narrative"])

# Train model on the tagged documents with the hyper parameters chosen above
model = g.doc2vec.Doc2Vec(train_narratives, size = size, window = window, min_count = min_count)

In [26]:
# Compare the first 20 test narratives against the training narratives.
# .values replaces Series.as_matrix(), which is deprecated and no longer
# available in newer pandas releases; both yield the column as a NumPy array.
test(test_narratives[:20],train_complaints["Consumer complaint narrative"].values,model)

Test Document: «We had a loan modification done with Nationstar Mortgage to lower our payments, change to a fixed rate and avoid a foreclosure. But in the long run our payments went up and our mortgage was extended 10 years. Also, I did not receive a copy of the modification.»

MOST (159498, 0.9945135116577148): «Hello, account # XXXX from XXXX XXXX has been paid and closed. You have documented that the account is " paid as agreed ''. The account is appearing on the negative items of my credit report. Please remove the item from negative or delete the account. You are reporting false information ans as per the FCRA you are in violation. Thank You.»

MEDIAN (36837, 0.9925611019134521): «I saw on my report that they have an address for me of XXXX. I have never lived at or near that address.»

LEAST (128307, 0.9917168617248535): «I have an existing credit account with Guitar Center, which was originally serviced by XXXX XXXX. On XXXX, 2015 it was announced a new deal with Synchrony Financ

[array([-3.53971505, -0.08377027,  1.90242052,  0.74869645, -1.63076365,
        -0.46452883,  3.62382483,  0.21095544,  0.92393613, -2.47854757], dtype=float32),
 array([-4.23677778,  0.7786575 ,  3.12947512,  0.66193867, -6.52903938,
         1.42390764,  7.01831913, -0.81276387,  2.14432359, -4.15998316], dtype=float32),
 array([-3.79656792, -0.4334262 ,  1.84969962,  0.62124312, -0.56634796,
        -0.94541299,  3.25877905, -0.12842965,  1.08895588, -2.72023654], dtype=float32),
 array([-4.39343548,  1.16520011,  2.30519342,  0.01686008, -3.51363993,
         0.72063643,  5.42878246, -0.76579976,  2.54990649, -4.16196585], dtype=float32),
 array([ -4.3832469 ,   3.28696442,   4.45151758,   0.46202648,
        -10.86248398,   1.67573154,   8.95717716,  -1.95643759,
          0.93823028,  -5.32272148], dtype=float32),
 array([-4.26425648,  0.36065298,  1.88422191,  0.22455837, -2.6833415 ,
         0.26085585,  4.31509638, -0.4060556 ,  1.79797089, -3.44431472], dtype=float32),
 arr

In [25]:
def test(docs_test,docs_original,model):
    docvecs = []
    for test_doc in docs_test:
        docvec = model.infer_vector(test_doc)
        docvecs.append(docvec)
        sims = model.docvecs.most_similar([docvec])
        print('Test Document: «{}»\n'.format(test_doc))
        for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
            print(u'%s %s: «%s»\n' % (label, sims[index], docs_original[sims[index][0]]))
        print("\n")
    return docvecs

In [7]:
# Transform data (you can add more data preprocessing steps) 
def transform(docs_original):
    """Turn raw texts into tagged, tokenized documents.

    Each text is lower-cased, stripped of '.' characters and split on
    whitespace. The resulting word list is paired with a single-element tag
    list holding the text's position in ``docs_original``.

    Parameters
    ----------
    docs_original : iterable of str
        Raw document texts.

    Returns
    -------
    list
        ``AnalyzedDocument(words, tags)`` namedtuples, one per input text,
        in input order.
    """
    tagged_document = namedtuple('AnalyzedDocument', 'words tags')
    return [
        tagged_document(raw_text.lower().replace('.','').split(), [position])
        for position, raw_text in enumerate(docs_original)
    ]