In [1]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import copy
import pickle
import re
import string

In [4]:
# Load the pickled job-postings frame and renumber its rows 0..n-1,
# discarding whatever index was stored with it.
df = pd.read_pickle('../Data/01_sf_recommender_pca').reset_index(drop=True)
df.shape

(3760, 304)

## Creating a pipeline for reading text, vectorizing it, and creating a recommendation

## Turn Text Description into a Vector

In [6]:
def get_input():
    """Read one line of text from the user and return it.

    No prompt text is shown; the entered line is returned unchanged.
    """
    user_text = input()
    return user_text

In [8]:
# Read the job description text entered by the user.
job_desc = get_input()

  Shape the vision and architecture of the system infrastructure that powers the next-generation of intelligent machines. • Implement mission-critical software in a reliable and sustainable manner. • Craft tools, processes and frameworks to redefine the software development in the rise of autonomous systems powered by artificial intelligence. • Collaborate with, learn from, and mentor a team of diverse roles, which include software engineers, roboticists and AI researchers.  We look for a track record of the following: • 3+ years of software infrastructure experience • Solid background in multiple programming languages, e.g. Python or C/C++ and willingness to pick up any new programming languages or frameworks. • Experience, designing, implementing, and running production services • Having built enough systems to recognize what are the pragmatic designs (not the most fancy ones) • Fearless about jumping around the stack (from improving driver to writing CUDA kernel to putting together 

In [7]:
def format_text(text):
    """Normalize a raw text string into a list of lemmatized tokens.

    Steps, in order:
      1. Replace every run of non-word characters and underscores with a
         single space, then lowercase the result.
      2. Split on whitespace and drop NLTK's English stopwords.
      3. Lemmatize each remaining token with ``WordNetLemmatizer``.
      4. Drop the extra stopwords 'data', 'code', 'team' and 'work'
         (checked after lemmatization, so lemmatized plurals are caught).

    Parameters
    ----------
    text : str
        Raw text, e.g. a pasted job description.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    # This pattern already matches every character in string.punctuation,
    # so no separate punctuation pass is needed afterwards.
    text = re.sub(r'[\W_]+', ' ', text).lower()

    stop = set(stopwords.words('english'))
    tokens = [item for item in text.split() if item not in stop]

    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(token) for token in tokens]

    stop2 = {'data', 'code', 'team', 'work'}
    return [item for item in tokens if item not in stop2]

In [10]:
# Clean and tokenize the description; job_desc is rebound from the raw
# input string to the token list returned by format_text.
job_desc = format_text(job_desc)

In [8]:
def create_ngrams(text, top_ngrams=None):
    """Append known bigrams, trigrams and 4-grams to a token list and join it.

    Every run of 2, 3 and 4 consecutive tokens is joined with '_'. Each
    such n-gram that also occurs in ``top_ngrams`` is appended after the
    original tokens (all bigrams first, then trigrams, then 4-grams), and
    the whole sequence is returned as one space-separated string.

    Parameters
    ----------
    text : list of str
        Tokens in document order. The list is not modified.
    top_ngrams : str or iterable of str, optional
        The n-gram vocabulary, either as a whitespace-separated string or
        as an iterable of n-grams. When omitted it is unpickled from
        '../Tools_and_models/top_ngrams'.

    Returns
    -------
    str
        The original tokens followed by the matching n-grams, joined by
        single spaces.
    """
    if top_ngrams is None:
        # NOTE(security): pickle.load can execute arbitrary code; only
        # point this at a file you created yourself.
        with open('../Tools_and_models/top_ngrams', 'rb') as pickle_in:
            top_ngrams = pickle.load(pickle_in)
    if isinstance(top_ngrams, str):
        top_ngrams = top_ngrams.split()

    # Count occurrences so that a vocabulary entry listed k times still
    # causes k appends, while lookups stay O(1) per document n-gram.
    top_counts = {}
    for ngram in top_ngrams:
        top_counts[ngram] = top_counts.get(ngram, 0) + 1

    # Work on a copy so the caller's token list is left untouched.
    tokens = list(text)

    doc_ngrams = [
        '_'.join(tokens[start:start + size])
        for size in (2, 3, 4)
        for start in range(len(tokens) - size + 1)
    ]

    for doc_ngram in doc_ngrams:
        tokens.extend([doc_ngram] * top_counts.get(doc_ngram, 0))

    return ' '.join(tokens)
    

In [12]:
# Add the matching n-grams; job_desc is rebound again, this time to the
# space-joined string returned by create_ngrams.
job_desc = create_ngrams(job_desc)

In [9]:
def vectorize(text, tf_idf=None):
    """Transform one document string into a single-row TF-IDF DataFrame.

    Parameters
    ----------
    text : str
        The document to vectorize (tokens separated by spaces).
    tf_idf : fitted vectorizer, optional
        Object providing ``transform`` and either ``get_feature_names_out``
        or ``get_feature_names``. When omitted it is unpickled from
        '../Tools_and_models/tf_idf_model'.

    Returns
    -------
    pandas.DataFrame
        One row, with one column per feature name reported by the
        vectorizer.
    """
    if tf_idf is None:
        # NOTE(security): pickle.load can execute arbitrary code; only
        # point this at a file you created yourself.
        with open('../Tools_and_models/tf_idf_model', 'rb') as pickle_in:
            tf_idf = pickle.load(pickle_in)

    text_vector_array = tf_idf.transform([text]).toarray()

    # scikit-learn 1.2 removed get_feature_names() in favour of
    # get_feature_names_out(); fall back for models from older versions.
    get_names = getattr(tf_idf, 'get_feature_names_out', None)
    if get_names is None:
        get_names = tf_idf.get_feature_names

    return pd.DataFrame(text_vector_array, columns=get_names())
    

In [14]:
# Build the TF-IDF row for the processed description and display it.
text_vector = vectorize(job_desc)
text_vector

Unnamed: 0,aa,aaa,aac,aadbstrong_organizational,aai,aalas,aami,aatt,aaude,aav,...,zoneroot_cause,zonestrong_organizational,zoning,zookeeper,zoom,zoura,zsfg,zuora,zweigwhite,zymergen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def pcaitize(text_vector, pca=None):
    """Project a vector through a fitted transformer and return a DataFrame.

    Parameters
    ----------
    text_vector : pandas.DataFrame
        Input rows, as produced by ``vectorize``.
    pca : fitted transformer, optional
        Object providing ``transform``. When omitted it is unpickled from
        '../Tools_and_models/pca_tool'.

    Returns
    -------
    pandas.DataFrame
        The transformed rows, with default integer column labels.
    """
    if pca is None:
        # NOTE(security): pickle.load can execute arbitrary code; only
        # point this at a file you created yourself.
        with open('../Tools_and_models/pca_tool', 'rb') as pickle_in:
            pca = pickle.load(pickle_in)

    return pd.DataFrame(pca.transform(text_vector))

In [16]:
# Project the TF-IDF row with the saved PCA tool and display the result.
text_pca = pcaitize(text_vector)
text_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.113463,0.013039,-0.059531,0.023948,0.036254,-0.036031,-0.053242,-0.043271,-0.018679,-0.001879,...,0.009766,0.021482,-0.016988,0.038074,-0.018458,0.005457,-0.015219,0.020732,-0.027946,0.004073


## Making a search based on our text_pca

In [11]:
def recommender(df, text_pca,n_recommendations=5):
    """Return the postings whose vectors have the largest dot product with the query.

    Parameters
    ----------
    df : pandas.DataFrame
        Job postings. Must have ``posting_url``, ``company_name`` and
        ``job_title`` columns; every column from the fifth onward is used
        as the posting's numeric vector.
    text_pca : pandas.DataFrame
        Query vector(s); only the first row is used. Its length must equal
        the number of vector columns in ``df``.
    n_recommendations : int, default 5
        How many postings to return.

    Returns
    -------
    tuple of numpy.ndarray
        ``(urls, companies, titles)``, each ordered from highest to lowest
        score.
    """
    # The first four columns are skipped -- presumably non-feature
    # metadata; TODO confirm against how the frame was built.
    job_vectors = df.iloc[:, 4:].to_numpy(dtype=float)
    query_vector = text_pca.iloc[0, :].to_numpy(dtype=float)

    # One matrix-vector product instead of a Python-level dot per row.
    # The Series gets a default 0..n-1 index, so nlargest() yields row
    # POSITIONS, which is what .iloc needs even when df's own index is not
    # a clean RangeIndex.
    scores = pd.Series(np.dot(job_vectors, query_vector))
    top_positions = scores.nlargest(n_recommendations).index.to_numpy()

    top_rows = df.iloc[top_positions]
    urls = top_rows.posting_url.values
    companies = top_rows.company_name.values
    titles = top_rows.job_title.values
    return urls, companies, titles

In [26]:
# Fetch the top five matches and print one "company title url" line per match.
urls, companies, titles = recommender(df, text_pca, 5)
for url, company, title in zip(urls, companies, titles):
    print(company, title, url, '\n\n')

OpenAI Software Engineer, Robotics Interfaces https://www.indeed.com/rc/clk?jk=c94e3111d304281a&fccid=c037aeb6086309d9&vjs=3 


U.S. Bank AI/ML Full Stack Engineer - San Francisco, CA https://www.indeed.com/rc/clk?jk=36366ba562aaf724&fccid=ae5bfc395c530fbc&vjs=3 


Capital One Full Stack Engineer https://www.indeed.com/rc/clk?jk=25d7481fcd84367e&fccid=b85c5070c3d3d8c8&vjs=3 


Honor Senior Backend Engineer https://www.indeed.com/rc/clk?jk=5d1e6c3f46968c4c&fccid=d12645453d9cc8ea&vjs=3 


Honor Senior Software Engineer https://www.indeed.com/rc/clk?jk=2b1334591f9b682b&fccid=d12645453d9cc8ea&vjs=3 




In [12]:
def make_recommendations(df, n_recommendations=5):
    """Read a job description from the user and recommend similar postings.

    Runs the full pipeline: ``get_input`` -> ``format_text`` ->
    ``create_ngrams`` -> ``vectorize`` -> ``pcaitize`` -> ``recommender``.

    Parameters
    ----------
    df : pandas.DataFrame
        Job postings frame, passed through to ``recommender``.
    n_recommendations : int, default 5
        How many postings to return.

    Returns
    -------
    tuple
        The ``(urls, companies, titles)`` tuple returned by ``recommender``.
    """
    text = get_input()
    formatted_text = format_text(text)
    ngramed_text = create_ngrams(formatted_text)
    text_vector = vectorize(ngramed_text)
    text_pca = pcaitize(text_vector)
    # Forward the caller's count; it used to be hard-coded to 5 here, so
    # the argument had no effect.
    return recommender(df, text_pca, n_recommendations=n_recommendations)

In [14]:
# Run the whole pipeline end to end; this waits for a job description to be
# typed or pasted in before returning the recommendations.
make_recommendations(df, n_recommendations=5)

 Interact with internal clients and product managers to understand their requirements for predictive analytics applications Help develop and test predictive algorithms to be implemented as part of internal tools and customer facing applications Understand business goals and initiatives, and combine business modeling skills with outstanding data analysis. Guide prioritization by measuring and estimating potential benefits Cooperate with the Data Engineering team to design and execute replicable data acquisition and utilization processes Cooperate with the ML Engineering team to integrate models into production code Provide research and statistical expertise across the company Contribute to a solid development, evaluation, deployment and refinement methodology for data science projects Cooperate with BI teams to deliver forward looking insights into our business Participate in the research community through conference presentations and articles Qualifications   MS/PhD degree in Statistic

(array(['https://www.indeed.com/rc/clk?jk=e6f29d6df6180eb8&fccid=14932909b46ac703&vjs=3',
        'https://www.indeed.com/rc/clk?jk=8240f6e52520f3d6&fccid=1af6457dc9e90c32&vjs=3',
        'https://www.indeed.com/rc/clk?jk=15278c6f872cafd9&fccid=1af6457dc9e90c32&vjs=3',
        'https://www.indeed.com/rc/clk?jk=32b7269602773e12&fccid=f37af6e0eae53880&vjs=3',
        'https://www.indeed.com/rc/clk?jk=c5eca57261768944&fccid=e4b075354d7c2865&vjs=3'],
       dtype=object),
 array(['SurveyMonkey', 'Lattice Engines', 'Lattice Engines',
        'Albertsons Companies', 'Gilead Sciences'], dtype=object),
 array(['Data Scientist', 'Senior Data Scientist',
        'Senior Data Science Engineer', 'Head of Data Science',
        'Clinical Data Scientist'], dtype=object))