In [423]:
import pandas as pd
import seaborn as sns
import numpy as np
import re
import matplotlib.pyplot as plt
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import utils

### Question 1

In [424]:
jobs = pd.read_csv('./Master Copy (standardized jobs).csv')

In [425]:
jobs.head()

Unnamed: 0.1,Unnamed: 0,category,company,details,experience,industry,location,salary,title
0,0,Other,"Integrated Resources, Inc",Job Title: Machine Learning AI Engineer\nDurat...,5+ year,Other Great Industries,"Bellevue, WA",,AI Engineer
1,1,Engineering,Amazon.com,Amazon Web Services (AWS) is looking for an AI...,2+ years,Other Great Industries,"Irvine, CA",,AI Engineer
2,2,Information Technology,Strategic Staffing Solutions,JobID: 136363Artificial Intelligence (AI) Plat...,,Other Great Industries,"301 S. Tryon StreetCharlotte, NC 28202",,AI Engineer
3,3,Information Technology,Wells Fargo,"Job DescriptionAt Wells Fargo, we have one goa...",7+ years,Other Great Industries,"Minneapolis, MN",,AI Engineer
4,4,Engineering,"Discover Financial Services, Inc",Discover Financial Services is a direct bankin...,3 years,Other Great Industries,"Riverwoods, IL 60015",,AI Engineer


In [426]:
jobs.shape

(3963, 9)

In [427]:
jobs.columns

Index([u'Unnamed: 0', u'category', u'company', u'details', u'experience',
       u'industry', u'location', u'salary', u'title'],
      dtype='object')

In [428]:
jobs = jobs.drop(['Unnamed: 0','category'],axis = 1)

In [429]:
jobs.dtypes

company       object
details       object
experience    object
industry      object
location      object
salary        object
title         object
dtype: object

In [430]:
jobs.head()

Unnamed: 0,company,details,experience,industry,location,salary,title
0,"Integrated Resources, Inc",Job Title: Machine Learning AI Engineer\nDurat...,5+ year,Other Great Industries,"Bellevue, WA",,AI Engineer
1,Amazon.com,Amazon Web Services (AWS) is looking for an AI...,2+ years,Other Great Industries,"Irvine, CA",,AI Engineer
2,Strategic Staffing Solutions,JobID: 136363Artificial Intelligence (AI) Plat...,,Other Great Industries,"301 S. Tryon StreetCharlotte, NC 28202",,AI Engineer
3,Wells Fargo,"Job DescriptionAt Wells Fargo, we have one goa...",7+ years,Other Great Industries,"Minneapolis, MN",,AI Engineer
4,"Discover Financial Services, Inc",Discover Financial Services is a direct bankin...,3 years,Other Great Industries,"Riverwoods, IL 60015",,AI Engineer


In [431]:
jobs.dtypes

company       object
details       object
experience    object
industry      object
location      object
salary        object
title         object
dtype: object

In [432]:
jobs.isnull().sum()

company        252
details          2
experience    1598
industry         2
location         4
salary        2261
title            0
dtype: int64

In [433]:
jobs.dropna(subset=['salary'], inplace=True)
# Drop rows without salary information

jobs.isnull().sum()

company       120
details         0
experience    739
industry        0
location        1
salary          0
title           0
dtype: int64

In [434]:
jobs.head()

Unnamed: 0,company,details,experience,industry,location,salary,title
12,"American Cybersystems, Inc. (ACS Group)",Job Title: (IDS) Data Scientist \n Location: N...,,Other Great Industries,"Newport News, VA",$50.00 - $70.00 /Hour,AI Engineer
16,CyberCoders,This position is open as of 4/22/2018.Leading ...,,Industrial,"Los Angeles, CA 90001","$120,000.00 - $160,000.00 /Year",AI Engineer
17,"Synergy Business Consulting, Inc.",Data Scientist\nPosition Summary:\n \nSeeking ...,2+ years,Travel,"Miami, FL","$90,000.00 - $110,000.00 /Year",AI Engineer
22,Access Staffing LLC,Data Scientist- Texas- Contract only!!!\nDurat...,,Consulting,"Houston, TX","$0.00 - $75,000.00 /Year",AI Engineer
37,"American Cybersystems, Inc. (ACS Group)",Job Title: (IDS) Data Scientist \n Location: N...,,Other Great Industries,"Newport News, VA",$50.00 - $70.00 /Hour,AI Engineer


In [435]:
jobs.shape

(1702, 7)

In [436]:
jobs['details'] = jobs['details'].map(lambda s: s.strip())

In [437]:
jobs['details'] = jobs['details'].apply(lambda x: x.lower())

print jobs['details']

12      job title: (ids) data scientist \n location: n...
16      this position is open as of 4/22/2018.leading ...
17      data scientist\nposition summary:\n \nseeking ...
22      data scientist- texas- contract only!!!\ndurat...
37      job title: (ids) data scientist \n location: n...
41      this position is open as of 4/22/2018.leading ...
42      data scientist\nposition summary:\n \nseeking ...
47      data scientist- texas- contract only!!!\ndurat...
62      job title: (ids) data scientist \n location: n...
66      this position is open as of 4/22/2018.leading ...
67      data scientist\nposition summary:\n \nseeking ...
72      data scientist- texas- contract only!!!\ndurat...
87      job title: (ids) data scientist \n location: n...
91      this position is open as of 4/22/2018.leading ...
92      data scientist\nposition summary:\n \nseeking ...
96      data scientist- texas- contract only!!!\ndurat...
112     job title: (ids) data scientist \n location: n...
116     this p

In [438]:
work_days = 251
work_hours = 9

# function returns mid point of a given salary range, else pass if no salary data available
# if salary quoted per year, return mid point of range
# if salary quoted per hour, return mid point of range * no. of work days in year * no. of work hours/day (annualising)
def salary_cleaner(salary_range):
    
    try:
        if 'Year' in salary_range:
            sal = re.findall(r'[0-9.]+',salary_range.replace('$','').replace(',',''))
            return np.mean([float(i) for i in sal])
        
        elif 'Hour' in salary_range:
            sal = re.findall(r'[0-9.]+',salary_range.replace('$','').replace(',',''))
            return np.mean([float(i) for i in sal]) * work_days * work_hours
        
    except TypeError:
        pass
    
    return

In [439]:
clean_salary = []
for i in jobs['salary']:
    i = salary_cleaner(i)
    clean_salary.append(i)

In [440]:
clean_salary = pd.Series(clean_salary)

In [441]:
jobs['clean_salary'] = clean_salary.values

In [442]:
jobs = jobs.drop(['salary'],axis=1)

In [443]:
# Analyse only those with salary information
jobs.dropna(subset=['clean_salary'], inplace=True)

jobs.shape

(1702, 7)

In [446]:
jobs.head()

Unnamed: 0,company,details,experience,industry,location,title,clean_salary
12,"American Cybersystems, Inc. (ACS Group)",job title: (ids) data scientist \n location: n...,,Other Great Industries,"Newport News, VA",AI Engineer,135540.0
16,CyberCoders,this position is open as of 4/22/2018.leading ...,,Industrial,"Los Angeles, CA 90001",AI Engineer,140000.0
17,"Synergy Business Consulting, Inc.",data scientist\nposition summary:\n \nseeking ...,2+ years,Travel,"Miami, FL",AI Engineer,100000.0
22,Access Staffing LLC,data scientist- texas- contract only!!!\ndurat...,,Consulting,"Houston, TX",AI Engineer,37500.0
37,"American Cybersystems, Inc. (ACS Group)",job title: (ids) data scientist \n location: n...,,Other Great Industries,"Newport News, VA",AI Engineer,135540.0


In [447]:
# lab_enc = preprocessing.LabelEncoder()
# jobs['industry'] = lab_enc.fit_transform(jobs['industry'])

Above code was used to try to resolve below error "ValueError: Unknown label type: 'continuous', but alas, it was not to be..

In [448]:
# Use industry to predict salary
X = jobs['industry']
y = jobs['clean_salary']

In [449]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [450]:
cvec = CountVectorizer(ngram_range=(2,3), stop_words='english')
cvec.fit(X_train)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [451]:
X_train = cvec.transform(X_train)
X_test = cvec.transform(X_test)

In [452]:
X_train

<1276x406 sparse matrix of type '<type 'numpy.int64'>'
	with 3444 stored elements in Compressed Sparse Row format>

In [453]:
log = LogisticRegression()

log.fit(X_train, y_train)
y_pred_class = log.predict(X_test)
metrics.accuracy_score(y_test, y_pred_class)

ValueError: Unknown label type: 'continuous'

In [454]:
RF_clf = RandomForestClassifier(n_estimators=15)
RF_clf = RF_clf.fit(X_train, y_train)
predicted = RF_clf.predict(X_test)


print classification_report(y_test, predicted)
cm = confusion_matrix(y_test, predicted)
print cm

ValueError: Unknown label type: 'continuous'

### Question 2

In [455]:
# Generate value counts of job categories
jobs.title.value_counts()

Data Engineer                 231
Machine Learning Engineer     206
Data Architect                192
Statistician                  154
Business Analyst              146
Business Intelligence         142
Machine Learning Scientist    129
AI Scientist                  117
Statistical Programmer         89
Database Developer             79
Database Administrator         59
Data Scientist                 59
Data Analyst                   51
AI Engineer                    48
Name: title, dtype: int64

In [456]:
# split into data sci and non data sci titles
jobs['datasci'] = jobs['title'].map(lambda x: 1 if x == 'Data Scientist' else 0)

jobs.datasci.value_counts()

0    1643
1      59
Name: datasci, dtype: int64

In [457]:
# generate tf-idf matrix from job details
df = pd.DataFrame(data=jobs['details'].todense(), columns=vect.get_feature_names())
df.head(2)

AttributeError: 'Series' object has no attribute 'todense'