# LinkedIn Company Industry Classification based on Company Description

Submitted by Ya Yun Huang, Emma Wang, Heather Qiu

## I. Data Preprocessing

In [15]:
## Import Dependencies
import pandas as pd
import re
import nltk
import numpy as np
from wordcloud import WordCloud, STOPWORDS

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import math
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors


In [16]:
## Load the LinkedIn company dataset from its CSV export
df = pd.read_csv("linkedin_company_data.csv", encoding="latin1")
## Sanity check: no cell anywhere in the frame may be missing
assert not df.isnull().any().any()


In [17]:
## Class-balance check: number of companies listed under each industry
industry_col = df["industry"]
industry_col.value_counts()


Hospitals and Health Care        946
Financial Services               926
IT Services and IT Consulting    833
Name: industry, dtype: int64

This dataset is roughly balanced across the three classes (833 to 946 companies each).

In [22]:
# Turn the industry names into integer class ids
le = LabelEncoder().fit(df["industry"])
df["label"] = le.transform(df["industry"])
# Name -> id lookup, printed so the numeric labels can be read back as industries
le_name_mapping = {
    industry: code for industry, code in zip(le.classes_, le.transform(le.classes_))
}
print(le_name_mapping)
df.head()



{'Financial Services': 0, 'Hospitals and Health Care': 1, 'IT Services and IT Consulting': 2}


Unnamed: 0,name,description,industry,label,text
0,3M Health Care,We believe nothing is more important than good...,Hospitals and Health Care,1,believe nothing important good health belief i...
1,AHIMA,AHIMA is a global nonprofit association of hea...,Hospitals and Health Care,1,ahima global nonprofit association health info...
2,Allina Health,People at Allina Health have a career of makin...,Hospitals and Health Care,1,people allina health career making difference ...
3,American College of Cardiology,The American College of Cardiology envisions a...,Hospitals and Health Care,1,american college cardiology envisions world in...
4,Apria,Apria is a leading provider of home healthcare...,Hospitals and Health Care,1,apria leading provider home healthcare equipme...


In [19]:
# Define Text Preprocessing Function
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise one company description into a space-separated token string.

    Steps, in order: coerce to ``str``, drop non-ASCII characters, lowercase,
    strip punctuation, split on whitespace, remove stopwords, optionally stem,
    optionally lemmatise, and re-join the tokens with single spaces.

    Parameters
    ----------
    text : object
        Raw description. Non-string values are converted with ``str()`` before
        any string method is called, so they no longer raise
        ``AttributeError`` on ``.encode``.
    flg_stemm : bool, default False
        When truthy, apply NLTK's Porter stemmer to every token.
    flg_lemm : bool, default True
        When truthy, apply NLTK's WordNet lemmatizer to every token.
    lst_stopwords : iterable of str or None, default None
        Tokens to drop. ``None`` disables stopword removal.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``""`` if none remain).
    """
    ## Coerce first, then remove unicode characters in company descriptions
    text = str(text).encode("ascii", "ignore").decode()

    ## Convert company descriptions to lowercase
    ## and remove punctuations and characters and then strip
    text = re.sub(r"[^\w\s]", "", text.lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## Remove Stopwords (set lookup instead of scanning the list per token)
    ## NOTE(review): filtering happens after punctuation is stripped and before
    ## lemmatisation, so stopwords containing apostrophes cannot match here.
    if lst_stopwords is not None:
        stopword_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## Convert lists back to strings
    return " ".join(lst_text)


# Clean the LinkedIn company descriptions
# Stopwords: NLTK's English list followed by a handful of extra tokens
lst_stopwords = [
    *nltk.corpus.stopwords.words("english"),
    "said",
    "mr",
    "could",
    "doe",
    "ha",
    "might",
    "must",
    "need",
    "sha",
    "wa",
    "wo",
    "would",
]

# Series.apply forwards the keyword arguments to the preprocessing function,
# so every description is lemmatised (not stemmed) and stripped of stopwords
df["text"] = df["description"].apply(
    utils_preprocess_text,
    flg_stemm=False,
    flg_lemm=True,
    lst_stopwords=lst_stopwords,
)


# Model

In [21]:
## Hold out 20% of the companies for testing and train on the remaining 80%
(train_X, test_X, train_y, test_y) = train_test_split(
    df["text"],
    df["label"],
    random_state=1,
    test_size=0.2,
)


## II. Train Multinomial Naive Bayes Model (Generative) on Original Data

In [24]:
## Bag-of-words features: unigram counts, keeping terms that occur in at least
## two training documents and in at most 90% of them
count = CountVectorizer(ngram_range=(1, 1), max_df=0.9, min_df=2, lowercase=True)
train_count_X = count.fit_transform(train_X)
test_count_X = count.transform(test_X)

# Build the multinomial model and fit it on the training counts
mod_nb = MultinomialNB()
fit_model = mod_nb.fit(train_count_X, train_y.to_numpy().ravel())

## Predict the held-out test set
pred_y = fit_model.predict(test_count_X)

## Report overall accuracy
print(
    f"The accuracy of the multinomial naive bayes model on original data is {accuracy_score(test_y, list(pred_y)):,.2f}."
)
# Per-class precision / recall / F1, with rows named after the industries
target_names = [
    'Financial Services',
    'Hospitals and Health Care',
    'IT Services and IT Consulting',
]
print(classification_report(test_y, pred_y, target_names=target_names))

The accuracy of the multinomial naive bayes model on original data is 0.93.
                               precision    recall  f1-score   support

           Financial Services       0.94      0.91      0.92       175
    Hospitals and Health Care       0.96      0.93      0.94       188
IT Services and IT Consulting       0.88      0.94      0.91       178

                     accuracy                           0.93       541
                    macro avg       0.93      0.93      0.93       541
                 weighted avg       0.93      0.93      0.93       541



## III. Train Pre-trained BERT Model (Discriminative) from Hugging Face on Original Data

In [9]:
## Import Additional Dependencies
from transformers import (
    AutoModelForSequenceClassification,
    TFAutoModelForSequenceClassification,
    TFBertForSequenceClassification,
)
from transformers import AutoTokenizer
import tensorflow as tf
from transformers import BertTokenizerFast
from tqdm import tqdm

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint; get_train_ds
# below reads this module-level `tokenizer` to encode the training texts.
# NOTE(review): the "text" column fed to BERT was lowercased during
# preprocessing, so this cased checkpoint only ever sees lowercase input --
# confirm that a cased model is intended.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")


def get_train_ds(train_X, train_y, batch_size=32):
    """Encode the training texts and wrap them, with labels, in a batched dataset.

    The texts in ``train_X`` are tokenized by the module-level ``tokenizer``
    with truncation and padding enabled. The resulting encodings (as a plain
    dict) are paired with ``train_y`` in a ``tf.data.Dataset``, which is then
    grouped into batches of ``batch_size``.

    Returns the batched ``tf.data.Dataset``.
    """
    encodings = tokenizer(list(train_X), truncation=True, padding=True)
    features = dict(encodings)
    dataset = tf.data.Dataset.from_tensor_slices((features, train_y))
    return dataset.batch(batch_size)


## Build the 3-class BERT classifier and configure it for fine-tuning
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=3,
)

# Learning rate starts at 5e-5 and is multiplied by 0.9 per 10000 steps
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=5e-5,
    decay_rate=0.9,
    decay_steps=10000,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# from_logits=True: the loss is given the model's raw logits
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
    optimizer=optimizer,
)


def train_on_slice(train_ds):
    """Fit the module-level ``model`` on ``train_ds`` for one epoch, then save.

    After fitting, the tokenizer and then the model are written to
    ``./trained`` via their ``save_pretrained`` methods.
    """
    model.fit(train_ds, epochs=1)
    for artifact in (tokenizer, model):
        artifact.save_pretrained("./trained")


# Build the batched training dataset (default batch size) and fine-tune on it
train_ds = get_train_ds(train_X=train_X, train_y=train_y)
train_on_slice(train_ds=train_ds)


In [10]:
## Import Additional Dependencies
from transformers import BertTokenizerFast, TFBertForSequenceClassification
from transformers import TextClassificationPipeline

## Load and Predict using saved model
model_path = "./trained"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = TFBertForSequenceClassification.from_pretrained(model_path)
text = list(test_X)
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

## The pipeline reports each prediction by label *name*. Translate names back
## to integer class ids by inverting the model's own id2label table instead of
## reading the last character of the name, which only works while every name
## ends in a single digit equal to its class id.
name_to_id = {name: int(idx) for idx, name in model.config.id2label.items()}
ans = [name_to_id[pred["label"]] for pred in pipe(text, truncation=True)]

## Calculate Accuracy Score
print(
    f"The accuracy of the pre-trained BERT model on original data is {accuracy_score(list(test_y), ans):,.2f}."
)
## print classification report
print(classification_report(test_y, ans, target_names=target_names))


## IV. Generate Synthetic Data

In [26]:
# get vocabulary used as features in our NB model
vocab = count.get_feature_names_out()
dims = len(vocab)

# get probability distribution of each feature of NB model:
# one row of feature_log_prob_ per class, exponentiated back to probabilities
dist = {
    str(cls): np.exp(log_prob)
    for cls, log_prob in zip(fit_model.classes_, fit_model.feature_log_prob_)
}

# seeded generator so that Restart & Run All regenerates the same synthetic corpus
rng = np.random.default_rng(1)

# specify text length as 100 (100 word draws for each text)
n = 100

# generate 300 texts for each class
synth_size = 300

all_text = []  # one entry per class: list of documents, each a list of words
count_blocks = []  # per-class (synth_size, dims) matrices of sampled word counts
label_blocks = []  # per-class integer label vectors

for label, word_probs in dist.items():
    # each row is one synthetic document: counts of n draws over the vocabulary
    X_synth = rng.multinomial(n, word_probs, size=synth_size)

    # keep every word drawn at least once, in vocabulary order
    # NOTE(review): multiplicities are dropped here, so a text holds each
    # sampled word once and is usually shorter than n words -- confirm intended.
    synth_text = [vocab[np.flatnonzero(row)].tolist() for row in X_synth]

    all_text.append(synth_text)
    count_blocks.append(X_synth)
    label_blocks.append(np.full(synth_size, int(label)))

# X keeps the float dtype it had when it was grown from an empty float array;
# y is now an integer array of class ids (it used to be promoted to float)
X = np.concatenate(count_blocks, axis=0).astype(float)
y = np.concatenate(label_blocks, axis=0)
    

In [27]:
# Flatten the per-class word lists into one list of space-joined documents,
# keeping the class-by-class order in which they were generated
synthetic_text = [
    " ".join(words) for class_docs in all_text for words in class_docs
]


In [28]:
## Split the synthetic dataset into test (20%) and train (80%)
(train_syn_X, test_syn_X, train_syn_y, test_syn_y) = train_test_split(
    synthetic_text,
    y,
    random_state=1,
    test_size=0.2,
)


## V. Train Multinomial Naive Bayes Model (Generative) on Synthetic Data

In [32]:
## Vectorize the synthetic texts with a *separate* vectorizer that copies the
## settings of `count`. Re-fitting `count` itself would overwrite the vocabulary
## that `fit_model` was trained on, so re-running the cell that pairs
## `count.get_feature_names_out()` with `fit_model.feature_log_prob_` would
## no longer line up.
count_syn = CountVectorizer(**count.get_params())
train_count_syn_X = count_syn.fit_transform(train_syn_X)
test_count_syn_X = count_syn.transform(test_syn_X)

# Instantiate the multinomial model and fit the model
mod_nb_syn = MultinomialNB()
fit_model_syn = mod_nb_syn.fit(train_count_syn_X, train_syn_y)

## Make predictions using test set
pred_y_syn = fit_model_syn.predict(test_count_syn_X)

## Evaluate accuracy
print(
    f"The accuracy of the multinomial naive bayes model on syntheic data is {accuracy_score(test_syn_y, list(pred_y_syn)):,.2f}."
)
# print out classification report
target_names = ['Financial Services', 'Hospitals and Health Care', 'IT Services and IT Consulting']
print(classification_report(test_syn_y, pred_y_syn, target_names=target_names))

The accuracy of the multinomial naive bayes model on syntheic data is 1.00.
                               precision    recall  f1-score   support

           Financial Services       1.00      1.00      1.00        60
    Hospitals and Health Care       1.00      1.00      1.00        55
IT Services and IT Consulting       1.00      1.00      1.00        65

                     accuracy                           1.00       180
                    macro avg       1.00      1.00      1.00       180
                 weighted avg       1.00      1.00      1.00       180



## VI. Train Pre-trained BERT Model (Discriminative) from Hugging Face on Synthetic Data


In [52]:
## Import Additional Dependencies
from transformers import BertTokenizerFast, TFBertForSequenceClassification
from transformers import TextClassificationPipeline

## Load and Predict using saved model
## Use the same relative directory that train_on_slice saved to, rather than an
## absolute path that only exists on one machine.
model_path = "./trained"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = TFBertForSequenceClassification.from_pretrained(model_path)

pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

## The pipeline reports each prediction by label *name*. Translate names back
## to integer class ids by inverting the model's own id2label table instead of
## reading the last character of the name, which only works while every name
## ends in a single digit equal to its class id.
name_to_id = {name: int(idx) for idx, name in model.config.id2label.items()}
ans = [name_to_id[pred["label"]] for pred in pipe(synthetic_text, truncation=True)]

## Calculate Accuracy Score
print(
    f"The accuracy of the pre-trained BERT model on synthetic data is {accuracy_score(list(y), ans):,.2f}."
)


Some layers from the model checkpoint at /workspaces/assimilate-pytorch/trained were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /workspaces/assimilate-pytorch/trained.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


The accuracy of the pre-trained BERT model on synthetic data is 0.87.
