### Importing Machine Learning Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # used for plot interactive graph. 
import seaborn as sns
import streamlit as st 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

### Reading the Excel file used for classification

In [None]:
# Excel workbook holding the raw data for this notebook.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
DATA_PATH = "/home/kushal/Documents/projects/tf-idf-implementation/data/ConcatenatedDigitalAdData.xlsx"
df = pd.read_excel(DATA_PATH)

### Creating a new column in the DataFrame called "TitleandDesc" by concatenating the values of the "title" and "Job_Description" columns

In [None]:
# Join title and description with a single space so the last word of the title
# and the first word of the description remain separate tokens for the
# vectorizer (plain `+` would fuse them into one bogus word).
# Rows where either part is missing still produce NaN, exactly as before.
df["TitleandDesc"] = df["title"] + " " + df["Job_Description"]

In [None]:
# Inspect the combined text column.
df.TitleandDesc

### Cleaning and Preprocessing the data before further analysis

In [None]:
# Columns that are not used once the combined text column exists.
unused_columns = ['Index', 'title', 'url', 'Posted-Date', 'Job_Description']
df.drop(columns=unused_columns, inplace=True)
df

In [None]:
# Move the combined text column to position 0: remove it, then re-insert it at the front.
first_column = df.pop('TitleandDesc')
df.insert(loc=0, column='TitleandDesc', value=first_column)
df

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Share of rows whose combined text is present, as a percentage with one decimal.
total = df['TitleandDesc'].notna().sum()
percent_present = total / len(df) * 100
round(percent_present, 1)

In [None]:
# df.head(15)

### Displaying the class 'JobType'

In [None]:
# Distinct JobType values, shown as a single-column 2-D array.
unique_job_types = df["JobType"].unique()
pd.DataFrame(unique_job_types).to_numpy()

### Calculating the number of data belonging to the classes

In [None]:
from collections import Counter

# Tally how many rows belong to each JobType.
job_type_counts = Counter(df["JobType"])
job_type_counts

### Encoding the JobType labels for Machine Learning
Categorical variables such as 'JobType' cannot be directly used in many machine learning algorithms, as they are typically designed to work with numerical data. In order to use the 'JobType' column in these algorithms, it needs to be converted to numerical form. One way to do this is through a process called factorization, which assigns a unique integer value to each unique category. This allows for the categorical data to be used in machine learning algorithms as numerical data.

Additionally, creating the dictionaries 'job_to_id' and 'id_to_job' allows for easy mapping between the original categorical values and the numerical values. This will be useful for interpreting the results of the model later on.

In [None]:
# factorize() returns (codes, uniques); the integer codes become the numeric label.
job_codes, _unique_jobs = df['JobType'].factorize()
df['Job_Id'] = job_codes

# One row per distinct (JobType, Job_Id) pair.
job_id_df = df[['JobType', 'Job_Id']].drop_duplicates()

# Lookup tables in both directions, kept for interpreting results later.
job_to_id = dict(job_id_df[['JobType', 'Job_Id']].values)
id_to_job = dict(job_id_df[['Job_Id', 'JobType']].values)

# Show the frame including the new Job_Id column.
df

### Checking for missing values in Dataframe
**If a column has no missing values, its count is 0.**

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Distribution of different JobTypes in the dataset.


In [None]:
# Horizontal bar chart of the number of rows per JobType, smallest class first.
fig, ax = plt.subplots(figsize=(8,6))
colors = ['red', 'blue', 'green', 'grey', 'darkblue']
postings_per_type = df.groupby('JobType').TitleandDesc.count().sort_values()
postings_per_type.plot.barh(
    ylim=0,
    color=colors,
    title= 'No. of Jobs in Each JobType \n',
    ax=ax,
)
ax.set_xlabel('Number of ocurrences', fontsize = 10);

### Feature Extraction 
Using **TfidfVectorizer** from scikit-learn.

In [None]:
# TF-IDF over word 1- to 3-grams:
#   sublinear_tf=True  -> term frequency is scaled as 1 + log(tf)
#   min_df=5           -> terms seen in fewer than 5 documents are dropped
#   stop_words         -> built-in English stop-word list is removed
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 3), 
                        stop_words='english')

# Transform each job posting into a dense TF-IDF vector
features = tfidf.fit_transform(df.TitleandDesc).toarray()

# Integer class label for every row, aligned with `features`
labels = df.Job_Id

# features.shape is (n_documents, n_features); the message now matches the
# data (job postings) and the configured ngram_range (which includes trigrams).
print("Each of the %d job postings is represented by %d features (TF-IDF score of unigrams, bigrams and trigrams)" % features.shape)

In [None]:
# features

In [None]:
# labels

### Generating Unigrams and Bigrams
Ranks the Tf-Idf features against each JobType with a chi-squared test and prints the top N most correlated unigrams and bigrams for each JobType.
Inspecting these terms is a sanity check that the features are informative for each class, which helps when improving the text classification task.

In [None]:
N = 5  # how many top-ranked terms to print per n-gram size

# Column index -> term lookup. It does not depend on the class being examined,
# so it is built once here instead of being rebuilt on every loop iteration.
vocab = {v: k for k, v in tfidf.vocabulary_.items()}

for JobType, Job_Id in sorted(job_to_id.items()):
    # One-vs-rest chi-squared test: this class against all others.
    # chi2 returns (scores, p-values); only the scores are used.
    features_chi2 = chi2(features, labels == Job_Id)
    # argsort is ascending, so the most correlated terms end up last.
    indices = np.argsort(features_chi2[0])
    feature_names = [vocab[i] for i in indices]

    # Split the ranked terms by n-gram length (trigrams are not reported).
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

    print("\n==> %s:" %(JobType))

    print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
    print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))


In [None]:
# Raw text goes in as X, the string JobType label as y (each copied into a NumPy array).
X, y = (np.array(df[column]) for column in ("TitleandDesc", "JobType"))

In [None]:
# Distinct JobType values as a list of single-element lists.
job_types = [[job_type] for job_type in df["JobType"].unique()]
job_types


### Train Test Split
Splitting the data into training and testing sets for model evaluation

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

### OneVsRestClassifier 
The OneVsRestClassifier is a class in scikit-learn that fits one binary classifier per class (that class versus all the others) and combines these binary classifiers to make predictions for new instances. Here it is used for a multi-class problem, where each posting has exactly one JobType; the same strategy can also be used for multi-label classification.


In [None]:
# Feature engineering and classifier chained into one estimator:
# raw text -> token counts -> TF-IDF weights -> one-vs-rest linear SVM.
# class_weight='balanced' tries to counter the model's bias towards the majority classes.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced'))),
]
model = Pipeline(pipeline_steps)


### Training the Text Classification Model on the training data

In [None]:
# Fit the whole pipeline (vectorizer -> tf-idf -> classifier) on the training split only.
model.fit(X_train, y_train)


### Confusion Matrix
A confusion matrix is a table that is used to describe the performance of a classification algorithm. It gives an idea of how well the algorithm is classifying the problem by comparing the predicted values with the true values in the test set.

In [None]:
y_pred = model.predict(X_test)

# confusion_matrix orders its rows/columns by sorted label unless told
# otherwise, whereas job_id_df lists JobTypes in order of first appearance.
# Use the classifier's own class order for BOTH the matrix and the tick labels
# so every row/column is captioned with the right JobType. Passing `labels`
# also keeps the matrix square if a class is missing from the test split.
class_names = model.classes_
conf_mat = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(
    conf_mat, 
    annot=True, 
    cmap="Blues", 
    fmt='d',  # cells hold integer counts
    xticklabels=class_names, 
    yticklabels=class_names,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title("CONFUSION MATRIX \n", size=16);

### Classification Report
It is a text summary of the performance of a classifier on a classification problem. For each class it displays several evaluation metrics, including precision, recall, f1-score and support, which helps to understand how well the classifier performs on each class.


In [None]:
print('\t\t\tCLASSIFICATIION METRICS\n')
# y_test / y_pred already hold the JobType names, so the report captions each
# row with the correct class on its own. The previous target_names list was in
# order of first appearance in df, while the report rows are in sorted label
# order, so metrics could be shown against the wrong JobType (and the call
# breaks when a class is absent from the test split).
print(classification_report(y_test, y_pred))

### Save the model

In [None]:
import pickle
# NOTE(review): absolute, machine-specific path — consider a configurable model directory.
filename = '/home/kushal/Documents/projects/tf-idf-implementation/model/adv_model.sav'
# Persist the pipeline that was just trained so the load step that follows
# reads this run's model rather than a stale (or missing) file. The context
# manager guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

### Load The Model

In [None]:
# Read the persisted pipeline back; the context manager closes the file handle
# (the bare open() call left it open).
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

### Accuracy

In [None]:
# Accuracy of the reloaded pipeline on the held-out split, reported as a percentage.
result = loaded_model.score(X_test, y_test)
accuracy_percent = round(result*100, 3)
print(f"The Accuracy of the Classification is:  {accuracy_percent}%")

### Sample Input and Testing The Model

In [None]:
# Ask for free text, echo it back, then show the predicted JobType.
job = input("Enter the text that we need to classify")
print(job)
# predict expects an iterable of documents, so the single text is wrapped in a list.
predicted_job_types = model.predict([job])
predicted_job_types[0]