# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import time

# sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from commons import *
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('datasets\processed_data.csv')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['Description'], df['JobTitle'], test_size=0.3,
                                                    random_state=42, stratify=df['JobTitle'])

## SGDClassifier

In [7]:

pipline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', SGDClassifier())
])

# hyperparameter tuning
def grid_search():
    param_grid = {
        'model__penalty': ['l1', 'l2'],
        'model__loss': ['hinge','squared_hinge'],
        'tfidf__use_idf': (True, False),
        'model__alpha': (1e-2, 1e-3, 1e-4)
    }

    search = RandomizedSearchCV(pipline, param_grid, cv=5, n_jobs=-1)
    search.fit(X_train,y_train)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

In [8]:
start = time.time()

grid_search()

end = time.time()
print('execution time in minutes: ', (end - start)/60) 



Best parameter (CV score=0.735):
{'tfidf__use_idf': False, 'model__penalty': 'l2', 'model__loss': 'hinge', 'model__alpha': 0.0001}
execution time in minutes:  58.78668849070867


In [9]:
# adding best hyperparameter

sgd_model = SGDClassifier(loss='hinge', penalty='l2',
                               alpha=1e-4, random_state=777)
                               

pipline = Pipeline(steps=[
    ('bow', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
    ('model', sgd_model)
])

In [10]:
pipline.fit(X_train,y_train)

Pipeline(steps=[('bow', CountVectorizer()),
                ('model', SGDClassifier(random_state=777))])

In [11]:
predictions = pipline.predict(X_test)

In [12]:
print(classification_report(y_test, predictions))

                                                       precision    recall  f1-score   support

                                      .NET Programmer       0.00      0.00      0.00         1
                                 ASIC Design Engineer       0.00      0.00      0.00         1
                                           Accountant       0.50      0.11      0.18         9
                             Accountant - Multifamily       1.00      1.00      1.00         1
                                   Accounting Analyst       0.00      0.00      0.00         1
                                 Accounting Assistant       0.00      0.00      0.00         1
                                Accounting Consultant       0.00      0.00      0.00         1
                Accounts Payable / Receivable Analyst       0.00      0.00      0.00         1
                Accounts Payable / Receivable Manager       0.00      0.00      0.00         2
            Accounts Payable or Receivable Specia

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# model evaluation

evaluate(y_test, predictions)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'precision': '0.76', 'recall': '0.75', 'f1': '0.74', 'accuracy': '0.75'}