In [None]:
# import nltk
import pandas as pd
from glob import glob
import gensim.downloader as api
from nltk.tokenize import TweetTokenizer
import wordninja
import numpy as np
import re
from spellchecker import SpellChecker

spell = SpellChecker()

In [2]:
tokenizer = TweetTokenizer()
glove_model = api.load("glove-twitter-25")
files = glob("lending_club_data/*.zip")

In [3]:
def read_one(path):
    return pd.read_csv(path, low_memory=False, skiprows=1, compression="zip")


def create_combined_dataset(files):

    data = [read_one(f"{file}") for file in files if file.endswith(".zip")]
    return pd.concat(data)


def build_dataset():

    features = [
        "title",
        # "purpose", 
        "emp_title",
    ]
    df = create_combined_dataset(files)

    print(df.head())
    return df[features]

In [31]:
df = build_dataset()

          id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  163477201        NaN     5000.0       5000.0           5000.0   36 months   
1  163564694        NaN    23000.0      23000.0          23000.0   36 months   
2  164137439        NaN    33000.0      33000.0          33000.0   60 months   
3  164024547        NaN     3000.0       3000.0           3000.0   36 months   
4  163607777        NaN    25000.0      25000.0          25000.0   60 months   

  int_rate  installment grade sub_grade  ...  \
0   17.74%       180.12     C        C5  ...   
1    6.46%       704.51     A        A1  ...   
2   15.24%       789.24     C        C2  ...   
3   23.05%       116.21     D        D3  ...   
4   10.33%       535.25     B        B1  ...   

  orig_projected_additional_accrued_interest hardship_payoff_balance_amount  \
0                                        NaN                            NaN   
1                                     228.77                        2125

In [5]:
df["emp_title"].value_counts()

Teacher                                     49656
Manager                                     43333
Owner                                       24758
Registered Nurse                            20495
Driver                                      19491
                                            ...  
Lineage Investments Inc DBA Worldwide Ex        1
CORP Admin                                      1
Ink Technician                                  1
garden estates of tyler                         1
Inventiv health clinical                        1
Name: emp_title, Length: 587644, dtype: int64

In [32]:
df[df["title"].str.lower() == "re.split(r'\W+', 'Words, words, words.')"]

Unnamed: 0,title,emp_title
167087,paymycreditcardsdebt,Hunters Run c.c.


In [97]:
mask = [isinstance(item, (str, bytes)) for item in df['title']]
df = df.loc[mask]
mask = [isinstance(item, (str, bytes)) for item in df['emp_title']]
df = df.loc[mask]

# words = [get_words(col) for col in ["title","emp_title",]]

In [120]:
SINGLES = {"other","vacation","business","consolidate"}
def get_vectors(input_string):
    """Get the sum of the word vectors... Also, try to clean up some stuff"""
    # Empty array
    final_vector = np.zeros(25)
    # Tokenize the words in the input
    words = tokenizer.tokenize(input_string)
    # If there's only one word, something went wrong with the tokenizer
    if len(words) == 1 and words[0].lower() not in SINGLES:
        # Try to split without spaces
        words = wordninja.split(words[0])
        # print(words)
        if len(words) == 1:
                # print("split")
                words = re.split(r"[\W_]+", words[0])
    # Check for spelling errors
    # misspelled = spell.unknown(words)
    # For the words we've found
    for word in words:
        # If the word is misspelled
        # if word in misspelled:
        #     print(f"Fixing {word}")
        #     # Correct it
        #     word = spell.correction(word)
        try:
            # Add the vector from glove to the final_vector
            final_vector += glove_model.word_vec(word.lower())
        except KeyError:
            # print(f"{word.lower()} not valid")
            pass
    # print(f"Added {len(words)}")
    return final_vector



In [121]:
df["title_vectors"] = df["title"].apply(lambda x: get_vectors(x))
df["emp_title_vectors"] = df["emp_title"].apply(lambda x: get_vectors(x))

In [122]:
df

Unnamed: 0,title,emp_title,title_vectors,emp_title_vectors
0,Debt consolidation,Lead Mtc Tech,"[0.05954001843929291, 2.402799963951111, -3.73...","[0.1498199701309204, 1.855019986629486, -1.632..."
2,Credit card refinancing,Financial Analyst,"[-0.7273850105702877, 2.72393000125885, -1.896...","[0.05384000390768051, 2.3497299551963806, -2.9..."
4,Credit card refinancing,Industrial electrician,"[-0.7273850105702877, 2.72393000125885, -1.896...","[-2.1752399802207947, 0.5005300045013428, -1.8..."
5,Debt consolidation,Tech Ops Analyst,"[0.05954001843929291, 2.402799963951111, -3.73...","[1.6198199912905693, 1.7082599997520447, -1.48..."
7,Credit card refinancing,Rn,"[-0.7273850105702877, 2.72393000125885, -1.896...","[-0.5483499765396118, 0.07089100033044815, -0...."
...,...,...,...,...
130766,Debt consolidation,Direct Service Provider Day Activity,"[0.05954001843929291, 2.402799963951111, -3.73...","[0.1776300072669983, 3.315699964761734, -2.947..."
130768,Other,Cleaner,"[0.5369499921798706, 0.4911800026893616, 0.286...","[-1.2438000440597534, -0.46535998582839966, -0..."
130769,Debt consolidation,client analyst,"[0.05954001843929291, 2.402799963951111, -3.73...","[-0.4282099977135658, 1.5535699725151062, -2.1..."
130770,Other,Residence Counselor,"[0.5369499921798706, 0.4911800026893616, 0.286...","[-3.0353000164031982, 1.2958319932222366, -0.0..."
