### Imports

In [47]:
import os
import sys
import json
import re
import math
import operator
import pandas as pd

### This function reads data from the data/ directory

The whole JSON structure was not same for company_info and other_details. I tried to make the structure same across all docs. Though this is not required as we are interested only in Job Description .

In [48]:
def extract_data(data_type):
    def structure_dict(keys, attribute, d):
        temp = {}
        for key in keys:
            try:
                temp[key] = d[attribute][key]
            except KeyError:
                temp[key] = []
        return temp

    company_info_keys = [u'Company Description', u'Email', u'Company Name', u'Telephone']
    other_details_keys = [u'Industry:', u'Department:', u'Skills:', u'Other Skills:']
    
    data_dict = {}
    for root, subdirs, files in os.walk('data/' + data_type):
        for file in os.listdir(root):
            filePath = os.path.join(root, file)
            if os.path.isdir(filePath):
                pass
            else:
                with open(filePath) as json_data:
                    d = json.load(json_data)
                    if str(os.path.dirname(filePath)) in data_dict.keys(): 
                        data_dict[str(os.path.dirname(filePath))].append(d)
                        company_info = structure_dict(company_info_keys, 'company_info', d)
                        other_details = structure_dict(other_details_keys, 'other_details', d)
                        d.pop('company_info')
                        d.pop('other_details')
                        d['company_info'] = company_info
                        d['other_details'] = other_details
                    else:
                        data_dict[str(os.path.dirname(filePath))] = list()
    return data_dict

### Text Preprocessing

In [49]:
def preprocess(s):
    nonenglish = re.compile(r'[^\x00-\x7F]')
    tabs = re.compile(r'\t')
    spaces = re.compile(r'(\s{2,})')
    weblinks = re.compile(r"""(https?://?(www\.)?(\w+).+) | #http wale links
                                ((www\.)[.^\S]+) | #www wale link
                                ([.^\S]+(\.com)[.^\S]+)|([.^\S]+(\.co\.)[.^\S]+)|([.^\S]+)(\.com) #.com wale links
                            """, flags = re.IGNORECASE | re.VERBOSE)
    htmltags = re.compile(r'&\w*;', flags = re.IGNORECASE)
    numbers = re.compile(r'(\W+)?([0-9]+)(\W+)?')
    speclchars = re.compile(r'[_~@\^\*\(\)\+={}\|\[\]\\/\.,:]')
    if nonenglish.search(s) is None:
        weblinks_removed = re.sub(weblinks, '', s)
        htmltags_removed = re.sub(htmltags, '', weblinks_removed)
        numbers_removed = re.sub(numbers, ' <number> ', htmltags_removed)
        speclchars_removed = re.sub(speclchars, ' ', numbers_removed)
        if len(speclchars_removed.lstrip())>0:
            return speclchars_removed.lower()

### Creating Vocablary for term-document matrix

In [50]:
training = extract_data('training')
bagofwords = []
documents = {}
count = 0
for k, v in training.items():
    for i, item in enumerate(v):
        desc = item['jd_information']['description']
        if len(desc)>0:
            clean_desc = preprocess(desc)
            if type(clean_desc) is unicode:
                if len(clean_desc)>0:
                    documents[(k, count)] = clean_desc
                    bagofwords.extend(clean_desc.split(' '))
                    count += 1 
del training
vocab = set(bagofwords)
print "no. of job descriptions  ", count
del bagofwords

no. of job descriptions   624


### Creating the Term-Document matrix as a dataframe

In [51]:
vectorspace = pd.DataFrame(index=vocab, columns=documents.keys())
vectorspace = vectorspace.fillna(0)
vectorspace.head()

Unnamed: 0,"(data/training/travel_tourism, 205)","(data/training/agriculture_dairy, 502)","(data/training/travel_tourism, 322)","(data/training/media_dotcom_entertainment, 100)","(data/training/travel_tourism, 231)","(data/training/media_dotcom_entertainment, 2)","(data/training/agriculture_dairy, 527)","(data/training/agriculture_dairy, 416)","(data/training/travel_tourism, 129)","(data/training/media_dotcom_entertainment, 40)",...,"(data/training/agriculture_dairy, 558)","(data/training/travel_tourism, 160)","(data/training/travel_tourism, 281)","(data/training/agriculture_dairy, 418)","(data/training/agriculture_dairy, 596)","(data/training/travel_tourism, 186)","(data/training/it_software, 340)","(data/training/travel_tourism, 307)","(data/training/it_software, 362)","(data/training/media_dotcom_entertainment, 94)"
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
franchiseejob,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
looking,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
descriptionagricultural,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
eligible,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Filling up the Term-Document matrix

In [52]:
for docID, doc in documents.items():
    for word in doc.split(' '):
        vectorspace[docID].loc[word] += 1 
vectorspace.head()

Unnamed: 0,"(data/training/travel_tourism, 205)","(data/training/agriculture_dairy, 502)","(data/training/travel_tourism, 322)","(data/training/media_dotcom_entertainment, 100)","(data/training/travel_tourism, 231)","(data/training/media_dotcom_entertainment, 2)","(data/training/agriculture_dairy, 527)","(data/training/agriculture_dairy, 416)","(data/training/travel_tourism, 129)","(data/training/media_dotcom_entertainment, 40)",...,"(data/training/agriculture_dairy, 558)","(data/training/travel_tourism, 160)","(data/training/travel_tourism, 281)","(data/training/agriculture_dairy, 418)","(data/training/agriculture_dairy, 596)","(data/training/travel_tourism, 186)","(data/training/it_software, 340)","(data/training/travel_tourism, 307)","(data/training/it_software, 362)","(data/training/media_dotcom_entertainment, 94)"
,4,12,2,3,8,24,15,6,38,12,...,21,37,2,0,20,3,19,31,3,33
franchiseejob,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
looking,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
descriptionagricultural,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
eligible,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Probability of Class

In [60]:
prob_of_class = {}
for (class_, docID), doc in documents.items():
    if class_ in prob_of_class.keys():
        prob_of_class[class_] += 1
    else:
        prob_of_class[class_] = 0
total = sum(prob_of_class.itervalues(), 0.0)
prob_of_class = {k: v / total for k, v in prob_of_class.iteritems()}
prob_of_class

{'data/training/agriculture_dairy': 0.36935483870967745,
 'data/training/it_software': 0.09838709677419355,
 'data/training/media_dotcom_entertainment': 0.19193548387096773,
 'data/training/travel_tourism': 0.3403225806451613}

### Probability of class given a word

In [62]:
def prob_of_class_given_word(word, classes, vectorspace):
    prob = {}
    columns = vectorspace.columns
    for class_, _ in classes.items():
        wordcount = 0
        for col in columns:
            if col[0] == class_: 
                try:
                    wordcount += vectorspace[(class_, col[1])].loc[word]
                except KeyError: 
                    wordcount = 0
        prob[class_]  = wordcount
    nr, nc = vectorspace.shape
    tot = sum(prob.values())
    for k, v in prob.items():
        prob[k] = float((v + 1))/float((tot + nr))
        prob[k] = 1/float((tot + nr))
    return prob

prob_of_class_given_word('ASAS', prob_of_class, vectorspace)

{'data/training/agriculture_dairy': 0.00012938284383490748,
 'data/training/it_software': 0.00012938284383490748,
 'data/training/media_dotcom_entertainment': 0.00012938284383490748,
 'data/training/travel_tourism': 0.00012938284383490748}

### Probability of description given class

In [63]:
def prob_of_desc_given_class(desc, classes, vectorspace):
    prob = {}
    for class_, _ in classes.items():
        total_prob = 0
        for word in desc.split(' '):
            total_prob = float(total_prob) + math.log(float(prob_of_class_given_word(word, classes, vectorspace)[class_]))
        prob[class_] = total_prob
    return prob
text = 'job descriptiondeliver sales and profitability targets'
prob_of_desc_given_class(text, prob_of_class, vectorspace)

{'data/training/agriculture_dairy': -54.13985448559229,
 'data/training/it_software': -54.13985448559229,
 'data/training/media_dotcom_entertainment': -54.13985448559229,
 'data/training/travel_tourism': -54.13985448559229}

### Probability of class given description = Probability of description given class * Probability of class

In [64]:
def prob_of_class_given_desc(prob_of_desc_given_class, prob_of_class):
    prob = {}
    for class_, val in prob_of_class.items():
        prob[class_] = math.log(prob_of_class[class_]) + float(prob_of_desc_given_class[class_])
    return prob
a = prob_of_desc_given_class('good in communication skill', prob_of_class, vectorspace)
b = prob_of_class
prob_of_class_given_desc(a, b)

{'data/training/agriculture_dairy': -36.99424984112993,
 'data/training/it_software': -38.31709798051086,
 'data/training/media_dotcom_entertainment': -37.64884835157264,
 'data/training/travel_tourism': -37.0761137112081}

### Running model on test data

In [85]:
testing = extract_data('testing')
count = 0
expected_outcome = []
model_outcome = []
for k, v in testing.items():
    for i, item in enumerate(v):
        desc = item['jd_information']['description']
        if len(desc)>0:
            clean_desc = preprocess(desc)
            if type(clean_desc) is unicode:
                if len(clean_desc)>0:
                    a = prob_of_desc_given_class(clean_desc, prob_of_class, vectorspace)
                    b = prob_of_class
                    c = prob_of_class_given_desc(a, b)
                    expected_outcome.append(k)
                    model_outcome.append(c)
                    print count, k, max(c.iteritems(), key=operator.itemgetter(1))[0]
                    print "******************************************************************"
                    count += 1
#                     if count==3: break
#     break

0 data/testing/agriculture_dairy data/training/agriculture_dairy
******************************************************************
1 data/testing/agriculture_dairy data/training/agriculture_dairy
******************************************************************
2 data/testing/agriculture_dairy data/training/agriculture_dairy
******************************************************************
3 data/testing/agriculture_dairy data/training/agriculture_dairy
******************************************************************
4 data/testing/agriculture_dairy data/training/agriculture_dairy
******************************************************************
5 data/testing/agriculture_dairy data/training/agriculture_dairy
******************************************************************
6 data/testing/agriculture_dairy data/training/agriculture_dairy
******************************************************************
7 data/testing/agriculture_dairy data/training/agriculture_dairy
***********

62 data/testing/it_software data/training/agriculture_dairy
******************************************************************
63 data/testing/it_software data/training/agriculture_dairy
******************************************************************
64 data/testing/it_software data/training/agriculture_dairy
******************************************************************
65 data/testing/it_software data/training/agriculture_dairy
******************************************************************
66 data/testing/it_software data/training/agriculture_dairy
******************************************************************
67 data/testing/it_software data/training/agriculture_dairy
******************************************************************
68 data/testing/it_software data/training/agriculture_dairy
******************************************************************
69 data/testing/it_software data/training/agriculture_dairy
***************************************************

126 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
127 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
128 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
129 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
130 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
131 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
132 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
133 data/testing/travel_tourism data/training/agriculture_dairy
*******************

189 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
190 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
191 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
192 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
193 data/testing/travel_tourism data/training/agriculture_dairy
******************************************************************
194 data/testing/media_dotcom_entertainment data/training/agriculture_dairy
******************************************************************
195 data/testing/media_dotcom_entertainment data/training/agriculture_dairy
******************************************************************
196 data/testing/media_dotcom_entertainment data/training/a

247 data/testing/media_dotcom_entertainment data/training/agriculture_dairy
******************************************************************


### Confusion Matrix

In [86]:
index = [key.split('/')[-1] for key in prob_of_class.keys()]
confusionmatrix = pd.DataFrame(index=index, columns=index)
confusionmatrix = confusionmatrix.fillna(0)
confusionmatrix

Unnamed: 0,media_dotcom_entertainment,travel_tourism,it_software,agriculture_dairy
media_dotcom_entertainment,0,0,0,0
travel_tourism,0,0,0,0
it_software,0,0,0,0
agriculture_dairy,0,0,0,0


### Filling up confusion matrix

In [87]:
for true_label, predicted_label_dict in zip(expected_outcome, model_outcome):
    predicted_label = max(predicted_label_dict.iteritems(), key=operator.itemgetter(1))[0]
    confusionmatrix[predicted_label.split('/')[-1]].loc[true_label.split('/')[-1]] += 1
confusionmatrix

Unnamed: 0,media_dotcom_entertainment,travel_tourism,it_software,agriculture_dairy
media_dotcom_entertainment,0,0,0,54
travel_tourism,0,0,0,87
it_software,0,0,0,51
agriculture_dairy,0,0,0,56


### Precision, Recall, Accuracy

In [88]:
for col in confusionmatrix.columns:
    precision = confusionmatrix[col].loc[col] / confusionmatrix[col].sum()
    recall = confusionmatrix[col].loc[col] / confusionmatrix.loc[col].sum()
    print "Precision for Industry ", col, " is ===>  ", precision
    print "Recall for Industry ", col, " is ===>  ", recall

Precision for Industry  media_dotcom_entertainment  is ===>   0
Recall for Industry  media_dotcom_entertainment  is ===>   0
Precision for Industry  travel_tourism  is ===>   0
Recall for Industry  travel_tourism  is ===>   0
Precision for Industry  it_software  is ===>   0
Recall for Industry  it_software  is ===>   0
Precision for Industry  agriculture_dairy  is ===>   0
Recall for Industry  agriculture_dairy  is ===>   1


  
