In [1]:
from elasticsearch import Elasticsearch
from config import Environment
import pandas as pd
import json

env = Environment()


class MainDataAccessor:
    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(MainDataAccessor, cls).__new__(
                                cls, *args, **kwargs)
        return cls._instance

    def __init__(self):
        # Create client and insert data
        self._instance = self.create_index()

    def to_bulk_json(self,input_df):
        """Converts dataframe to bulk json format"""
        bulk_actions = []
        for record in input_df.to_dict(orient="records"):
            bulk_actions.append(('{ "index" : { "_index" : "%s" }}' % env.ES_INDEX))
            bulk_actions.append(json.dumps(record, default=int))
        print(bulk_actions)

        return bulk_actions

    def create_index(self):

        # Create the client instance
        client = Elasticsearch(hosts="http://localhost:9200", verify_certs=False)
        # Successful response!
        print("Successfully connected to index at: ", env.ES_URL)

        print("Reading data from data frame... ")
        df = pd.read_csv(env.CSV_FILE_PATH)

        print("Converting dataframe to bulk jsons.... ")
        bulk_json_object = self.to_bulk_json(df)

        print("Ingesting bulk json object into Elastic Search index:", env.ES_INDEX)
        client.bulk(index=env.ES_INDEX, operations=bulk_json_object)
        print("Successfully ingested json objects into Elastic Search:", env.ES_INDEX)

        result = client.count(index=env.ES_INDEX)
        print("Total resumes ingested: ", result.body['count'])

        return client



In [2]:
from elasticsearch.helpers import scan


In [3]:
data_accessor = MainDataAccessor()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Successfully ingested json objects into Elastic Search: applicants
Total resumes ingested:  30784


In [12]:

def get_data_from_elastic():
    # query: The elasticsearch query.
    query  = {
          "query": {
            "bool": {
              "must": [ { "match": { "Category": "Testing"}} , { "match": { "Operation_Mode": 2}}, { "range": { "Hourly_Rate": {'gte': 30}}} , { "range": { "Hourly_Rate": {'lte': 34}}}] ,
        "filter": {
            "term": {
                "Notice_Period": 1
            }
        }
            }
          }
        }

    # Scan function to get all the data.
    rel = scan(client=data_accessor._instance,
               query=query,
               index='resume',
               raise_on_error=True,
               preserve_order=True,
               clear_scroll=True)
    # Keep response in a list.
    result = list(rel)
    temp = []
    # We need only '_source', which has all the fields required.
    # This elimantes the elasticsearch metdata like _id, _type, _index.
    for hit in result:
        temp.append(hit['_source'])
    # Create a dataframe.
    df = pd.DataFrame(temp)
    return df.to_dict()



get_data_from_elastic()

{'Unnamed: 0': {0: 894,
  1: 912,
  2: 918,
  3: 920,
  4: 938,
  5: 894,
  6: 912,
  7: 918,
  8: 920,
  9: 938,
  10: 894,
  11: 912,
  12: 918,
  13: 920,
  14: 938,
  15: 894,
  16: 912,
  17: 918,
  18: 920,
  19: 938,
  20: 894,
  21: 912,
  22: 918,
  23: 920,
  24: 938,
  25: 894,
  26: 912,
  27: 918,
  28: 920,
  29: 938,
  30: 894,
  31: 912,
  32: 918,
  33: 920,
  34: 938,
  35: 894,
  36: 912,
  37: 918,
  38: 920,
  39: 938,
  40: 894,
  41: 912,
  42: 918,
  43: 920,
  44: 938,
  45: 894,
  46: 912,
  47: 918,
  48: 920,
  49: 938,
  50: 459,
  51: 475,
  52: 459,
  53: 475,
  54: 459,
  55: 475,
  56: 459,
  57: 475,
  58: 459,
  59: 475,
  60: 459,
  61: 475,
  62: 459,
  63: 475,
  64: 459,
  65: 475,
  66: 459,
  67: 475,
  68: 459,
  69: 475},
 'Category': {0: 'Testing',
  1: 'Testing',
  2: 'Testing',
  3: 'Testing',
  4: 'Testing',
  5: 'Testing',
  6: 'Testing',
  7: 'Testing',
  8: 'Testing',
  9: 'Testing',
  10: 'Testing',
  11: 'Testing',
  12: 'Testing',
  

In [5]:
import pandas as pd
df = pd.read_csv("final_resume_data.csv")
y = df.pop("Hired")
X=df


features =['Hourly_Rate', 'Notice_Period', 'Operation_Mode', 'Test_Score',
       'Interview_Score']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X[features], y, test_size=0.33, random_state=42, stratify=y)



In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
model = clf.fit(X_train, y_train)

In [17]:
import pickle
filename = 'trained_model.pkl'
pickle.dump(model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))

In [18]:
yprob = loaded_model.predict_proba(X_test)
yhat = loaded_model.predict(X_test)
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import f1_score

# evaluate predictions
acc = accuracy_score(y_test, yhat)
precision = precision_score(y_test, yhat)
recall = recall_score(y_test, yhat)
f1_score = f1_score(y_test, yhat)
# auc = auc(y_test, yhat)

print('precision: %.3f' % precision)
print('recall: %.3f' % recall)
print('f1_score: %.3f' % f1_score)
print('Accuracy: %.3f' % acc)

precision: 0.841
recall: 0.733
f1_score: 0.783
Accuracy: 0.871


In [21]:
yprob[:,1]

array([0.98, 0.04, 1.  , 0.12, 0.11, 0.01, 0.  , 0.18, 0.26, 0.4 , 1.  ,
       0.  , 0.  , 0.99, 0.26, 0.  , 0.06, 0.21, 0.03, 1.  , 0.57, 1.  ,
       0.12, 0.01, 0.  , 0.05, 1.  , 0.01, 0.  , 0.17, 1.  , 0.  , 1.  ,
       1.  , 0.  , 0.12, 0.69, 0.  , 0.38, 0.39, 0.12, 0.53, 0.  , 1.  ,
       1.  , 0.  , 0.  , 0.5 , 0.99, 0.99, 0.  , 0.  , 0.14, 0.  , 1.  ,
       0.  , 0.  , 0.35, 0.  , 0.  , 0.12, 0.  , 0.26, 0.01, 0.25, 0.23,
       0.  , 0.2 , 1.  , 1.  , 1.  , 0.4 , 0.  , 0.34, 0.  , 0.11, 0.  ,
       0.16, 1.  , 1.  , 0.35, 0.  , 0.15, 0.56, 0.  , 0.  , 0.17, 1.  ,
       0.1 , 0.01, 1.  , 1.  , 0.  , 0.1 , 1.  , 0.52, 0.  , 0.  , 0.17,
       0.  , 0.08, 0.23, 0.04, 0.2 , 1.  , 0.11, 0.18, 0.1 , 0.  , 0.43,
       0.  , 0.12, 0.06, 0.36, 0.  , 0.  , 0.  , 0.  , 0.02, 0.  , 0.35,
       0.06, 0.  , 0.  , 0.  , 0.15, 0.07, 0.  , 0.44, 0.  , 0.  , 0.  ,
       1.  , 0.22, 0.21, 0.  , 0.99, 0.13, 0.01, 0.34, 1.  , 0.  , 0.27,
       1.  , 0.08, 0.  , 0.  , 0.99, 0.99, 0.02, 0.