In [18]:
from elasticsearch import Elasticsearch
from config import Environment
import pandas as pd
import json

env = Environment()


class MainDataAccessor:
    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(MainDataAccessor, cls).__new__(
                                cls, *args, **kwargs)
        return cls._instance

    def __init__(self):
        # Create client and insert data
        self._instance = self.create_index()

    def to_bulk_json(self,input_df):
        """Converts dataframe to bulk json format"""
        bulk_actions = []
        for record in input_df.to_dict(orient="records"):
            bulk_actions.append(('{ "index" : { "_index" : "%s" }}' % env.ES_INDEX))
            bulk_actions.append(json.dumps(record, default=int))
        print(bulk_actions)

        return bulk_actions

    def create_index(self):

        # Create the client instance
        client = Elasticsearch(hosts="http://localhost:9200", verify_certs=False)
        # Successful response!
        print("Successfully connected to index at: ", env.ES_URL)

        print("Reading data from data frame... ")
        df = pd.read_csv(env.CSV_FILE_PATH)

        print("Converting dataframe to bulk jsons.... ")
        bulk_json_object = self.to_bulk_json(df)

        print("Ingesting bulk json object into Elastic Search index:", env.ES_INDEX)
        client.bulk(index=env.ES_INDEX, operations=bulk_json_object)
        print("Successfully ingested json objects into Elastic Search:", env.ES_INDEX)

        result = client.count(index=env.ES_INDEX)
        print("Total resumes ingested: ", result.body['count'])

        return client



In [19]:
from elasticsearch.helpers import scan


In [20]:
data_accessor = MainDataAccessor()

Successfully connected to index at:  http://localhost:9200
Reading data from data frame... 
Converting dataframe to bulk jsons.... 


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Successfully ingested json objects into Elastic Search: resume
Total resumes ingested:  0


In [49]:

def get_data_from_elastic():
    # query: The elasticsearch query.
    query  = {
          "query": {
            "bool": {
              "must": [ { "match": { "Category": "Testing"}} , { "match": { "Operation_Mode": 2}}, { "range": { "Hourly_Rate": {'gte': 30}}} , { "range": { "Hourly_Rate": {'lte': 34}}}] ,
        "filter": {
            "term": {
                "Notice_Period": 1
            }
        }
            }
          }
        }

    # Scan function to get all the data.
    rel = scan(client=data_accessor._instance,
               query=query,
               index='resume',
               raise_on_error=True,
               preserve_order=True,
               clear_scroll=True)
    # Keep response in a list.
    result = list(rel)
    temp = []
    # We need only '_source', which has all the fields required.
    # This elimantes the elasticsearch metdata like _id, _type, _index.
    for hit in result:
        temp.append(hit['_source'])
    # Create a dataframe.
    df = pd.DataFrame(temp)
    return df.to_dict()



get_data_from_elastic()

{'Unnamed: 0': {0: 894,
  1: 912,
  2: 918,
  3: 920,
  4: 938,
  5: 894,
  6: 912,
  7: 918,
  8: 920,
  9: 938,
  10: 894,
  11: 912,
  12: 918,
  13: 920,
  14: 938,
  15: 894,
  16: 912,
  17: 918,
  18: 920,
  19: 938,
  20: 894,
  21: 912,
  22: 918,
  23: 920,
  24: 938,
  25: 894,
  26: 912,
  27: 918,
  28: 920,
  29: 938,
  30: 894,
  31: 912,
  32: 918,
  33: 920,
  34: 938,
  35: 894,
  36: 912,
  37: 918,
  38: 920,
  39: 938,
  40: 894,
  41: 894,
  42: 912,
  43: 918,
  44: 920,
  45: 938,
  46: 912,
  47: 918,
  48: 920,
  49: 938,
  50: 894,
  51: 912,
  52: 918,
  53: 920,
  54: 938,
  55: 459,
  56: 475,
  57: 459,
  58: 475,
  59: 459,
  60: 475,
  61: 459,
  62: 475,
  63: 459,
  64: 475,
  65: 459,
  66: 475,
  67: 459,
  68: 475,
  69: 459,
  70: 475,
  71: 459,
  72: 475,
  73: 459,
  74: 475,
  75: 459,
  76: 475},
 'Category': {0: 'Testing',
  1: 'Testing',
  2: 'Testing',
  3: 'Testing',
  4: 'Testing',
  5: 'Testing',
  6: 'Testing',
  7: 'Testing',
  8: 'Te

In [1]:
import pandas as pd
df = pd.read_csv("final_resume_data.csv")
y = df.pop("Hired")
X=df


features =['Hourly_Rate', 'Notice_Period', 'Operation_Mode', 'Test_Score',
       'Interview_Score']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X[features], y, test_size=0.33, random_state=42, stratify=y)



In [14]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
model = clf.fit(X_train, y_train)

In [15]:
import pickle
filename = 'trained_model.pkl'
pickle.dump(model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))

In [21]:
yprob = loaded_model.predict_proba(X_test)
yhat = loaded_model.predict(X_test)
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import f1_score

# evaluate predictions
acc = accuracy_score(y_test, yhat)
precision = precision_score(y_test, yhat)
recall = recall_score(y_test, yhat)
f1_score = f1_score(y_test, yhat)
# auc = auc(y_test, yhat)

print('precision: %.3f' % precision)
print('recall: %.3f' % recall)
print('f1_score: %.3f' % f1_score)
print('Accuracy: %.3f' % acc)

precision: 0.892
recall: 0.733
f1_score: 0.804
Accuracy: 0.887


In [17]:
yprob[]

array([[1.09870326e-01, 8.90129674e-01],
       [9.40000883e-01, 5.99991171e-02],
       [3.58511882e-03, 9.96414881e-01],
       [9.20800833e-01, 7.91991671e-02],
       [8.38584465e-01, 1.61415535e-01],
       [9.99965575e-01, 3.44247948e-05],
       [9.99505046e-01, 4.94953656e-04],
       [3.79092673e-01, 6.20907327e-01],
       [9.05967496e-01, 9.40325045e-02],
       [9.13296610e-01, 8.67033905e-02],
       [1.13252485e-02, 9.88674751e-01],
       [9.99922579e-01, 7.74207953e-05],
       [9.98382891e-01, 1.61710874e-03],
       [1.27795747e-01, 8.72204253e-01],
       [7.59863820e-01, 2.40136180e-01],
       [9.99758422e-01, 2.41578255e-04],
       [8.24399678e-01, 1.75600322e-01],
       [9.59085664e-01, 4.09143358e-02],
       [8.65724982e-01, 1.34275018e-01],
       [4.39479516e-03, 9.95605205e-01],
       [4.97005108e-01, 5.02994892e-01],
       [7.40937757e-04, 9.99259062e-01],
       [9.52805782e-01, 4.71942177e-02],
       [8.54445778e-01, 1.45554222e-01],
       [9.998636