# Imports

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import pandas as pd
import numpy as np
import time

from commons import *

# sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.pipeline import Pipeline

# from commons.metrics import *
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


In [4]:
# Load the preprocessed dataset used for training and evaluation.
# The path uses '/' as separator: in a normal string literal '\p' is an
# invalid escape sequence (newer Python versions warn about it), and a
# backslash separator only resolves on Windows, whereas '/' works everywhere.
df = pd.read_csv('datasets/processed_data.csv')

In [1]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set. The split is stratified on the
# JobTitle labels and seeded so that re-running the cell yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    df['Description'],
    df['JobTitle'],
    test_size=0.3,
    random_state=42,
    stratify=df['JobTitle'],
)

## LinearSVC

In [18]:

# Baseline pipeline with default settings for every step:
# token counts -> TF-IDF re-weighting -> linear SVM classifier.
pipline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LinearSVC()),
])

# hyperparameter tuning
def grid_search(n_iter=10, random_state=42):
    """Run a randomized hyperparameter search over the LinearSVC step.

    Despite its name, this uses ``RandomizedSearchCV``: only ``n_iter`` of the
    20 possible parameter combinations are sampled. Each sampled candidate is
    scored with 5-fold cross-validation, fitting the notebook-level
    ``pipline`` on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    n_iter : int, default=10
        Number of parameter settings to sample. 10 matches the scikit-learn
        default, so the amount of work is unchanged for existing callers.
    random_state : int or None, default=42
        Seed for the candidate sampling, so repeated runs evaluate the same
        candidates. Pass ``None`` to restore the previous unseeded behaviour.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object, so callers can read ``best_params_`` or
        ``best_estimator_`` directly instead of copying values by hand.
    """
    param_distributions = {
        'model__penalty': ['l1', 'l2'],
        'model__loss': ['hinge', 'squared_hinge'],
        'model__C': [0.1, 1, 10, 100, 1000],
    }
    # NOTE(review): per the LinearSVC documentation, penalty='l1' combined with
    # loss='hinge' is not supported, so sampled candidates with that pair are
    # expected to fail to fit rather than produce a score -- confirm and
    # consider excluding them from the search space.

    search = RandomizedSearchCV(
        pipline,
        param_distributions,
        n_iter=n_iter,
        cv=5,
        n_jobs=-1,
        verbose=3,
        random_state=random_state,
    )
    search.fit(X_train, y_train)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search

In [19]:
# Time the search with a monotonic clock. time.time() follows the system wall
# clock, which may be adjusted while a long-running search is in progress;
# time.perf_counter() is intended for measuring elapsed durations.
start = time.perf_counter()

grid_search()

end = time.perf_counter()
print('execution time in minutes: ', (end - start)/60)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 27.1min finished


Best parameter (CV score=0.779):
{'model__penalty': 'l2', 'model__loss': 'squared_hinge', 'model__C': 10}
execution time in minutes:  30.293813602129617


In [15]:
# Rebuild the pipeline with the hyperparameters reported by the search
# (penalty='l2', loss='squared_hinge', C=10) and a fixed seed for the model.
svc_model = LinearSVC(
    C=10,
    loss='squared_hinge',
    penalty='l2',
    random_state=777,
)

pipline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', svc_model),
])

In [16]:
# Fit every pipeline step on the training split (the fitted pipeline is the
# cell's displayed value).
pipline.fit(X_train, y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model', LinearSVC(C=10, random_state=777))])

In [17]:
# Predict a label for each held-out test document with the fitted pipeline.
predictions = pipline.predict(X_test)

In [18]:
# Per-class precision / recall / F1 on the test split.
# Some labels have no true or no predicted samples, which makes their metrics
# undefined. zero_division=0 reports those as 0.0 (the same value as before)
# without emitting a warning for each affected metric.
print(classification_report(y_test, predictions, zero_division=0))

                                                       precision    recall  f1-score   support

                                      .NET Programmer       0.00      0.00      0.00         1
                                         A&R Director       0.00      0.00      0.00         0
                                 ASIC Design Engineer       1.00      1.00      1.00         1
                                           Accountant       0.50      0.44      0.47         9
                             Accountant - Multifamily       1.00      1.00      1.00         1
                                   Accounting Analyst       0.00      0.00      0.00         1
                                 Accounting Assistant       0.00      0.00      0.00         1
                                Accounting Consultant       0.50      1.00      0.67         1
                Accounts Payable / Receivable Analyst       0.50      1.00      0.67         1
                Accounts Payable / Receivable Man

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
# model evaluation
# NOTE(review): `evaluate` is not defined in this notebook; presumably it is
# provided by the `from commons import *` star import -- confirm. The recorded
# output shows a dict with 'precision', 'recall', 'f1' and 'accuracy' entries
# formatted as strings.

evaluate(y_test, predictions)

{'precision': '0.78', 'recall': '0.80', 'f1': '0.78', 'accuracy': '0.80'}