In [None]:
import pandas as pd
import regex as re
import string
import unicodedata
import nltk
import spacy
nltk.download('wordnet')
!python -m spacy download en_core_web_sm >> /dev/null
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
2021-10-27 20:56:48.496509: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-27 20:56:48.496566: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# Load the employer dataset from a CSV file in the working directory.
raw_data = pd.read_csv('employer_raw_data_group_2.csv')
# Raw text of the 'description' column; this is what the vectorizers below are fitted on.
sentences = raw_data['description'].values

In [None]:
# Shared NLP resources used by the preprocessing steps defined below.
stemmer = nltk.stem.SnowballStemmer("english")

# One stop word per line of stopwords.txt, trimmed and lower-cased.
stop_words = []
with open("stopwords.txt", "r") as stopword_file:
    for raw_line in stopword_file:
        stop_words.append(raw_line.strip().lower())

lemmatizer = nltk.stem.WordNetLemmatizer()

# Task 1: Generalize all the preprocessing tasks into one single function that can be used in the Vectorizer


In [None]:
def get_preprocessing_function(
    use_lower: bool = True,
    use_alpha: bool = True,
    use_stemming: bool = False,
    use_nodates: bool = False,
    use_nourl: bool = True,
    use_stopwords: bool=False,
    use_lemmatizer: bool=False,
    use_nocity: bool=False
):
    """Build a text-cleaning callable suitable for a vectorizer's ``preprocessor``.

    Each flag switches one step on or off. Enabled steps always run in this
    order: city replacement, lower-casing, URL removal, non-letter removal,
    month removal, stop-word removal, lemmatization, stemming.

    Args:
        use_lower: Lower-case the text.
        use_alpha: Replace every run of non-letter characters with one space.
        use_stemming: Stem each whitespace-separated token with the
            module-level ``stemmer``.
        use_nodates: Drop month names and their abbreviations. Note that this
            also drops the verb "may".
        use_nourl: Remove URLs (``http(s)://...``, ``www....`` and tokens
            containing ``.com``).
        use_stopwords: Drop tokens found in the module-level ``stop_words``.
            The list is snapshotted when this factory is called.
        use_lemmatizer: Lemmatize each whitespace-separated token with the
            module-level ``lemmatizer``.
        use_nocity: Replace "City, ST" patterns (e.g. "Nashville, TN") with
            the token ``city``.

    Returns:
        A function ``preprocess(text: str) -> str`` applying the enabled steps.
    """
    # Patterns and lookup sets are built once here instead of on every call.
    # Matches strings like "Nashville, TN"
    city_state_pattern = re.compile("(?<![A-Za-z])[A-Z][a-z]+, [A-Z]{2}(?![A-Za-z])")
    # The ".com" alternative also consumes the rest of the token so that no
    # path fragment (e.g. "/about") is left behind.
    url_pattern = re.compile(r'https?://\S+|www\.\S+|\S*\.com\b\S*', re.IGNORECASE)
    # Without lower-casing, capitals are letters too and must survive.
    non_alpha_pattern = re.compile("[^a-z]+" if use_lower else "[^A-Za-z]+")
    month_names = frozenset([
        'january', 'february', 'march', 'april', 'may', 'june', 'july',
        'august', 'september', 'october', 'november', 'december',
        'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept',
        'oct', 'nov', 'dec',
    ])
    # Only touch the global list when the step is enabled; a set gives O(1) lookups.
    stopword_set = frozenset(stop_words) if use_stopwords else frozenset()

    def alpha(text: str):
        return non_alpha_pattern.sub(" ", text) if use_alpha else text

    def lower(text: str):
        return text.lower() if use_lower else text

    def stemming(text: str):
        if use_stemming:
            text = ' '.join(stemmer.stem(x) for x in text.split())
        return text

    def dates(text: str):
        if not use_nodates:
            return text
        # Compare case-insensitively so the step also works with use_lower=False.
        return " ".join(word for word in text.split(" ") if word.lower() not in month_names)

    def url(text: str):
        return url_pattern.sub('', text) if use_nourl else text

    def remove_stopwords(text):
        if not use_stopwords:
            return text
        return " ".join(word for word in text.split(" ") if word not in stopword_set)

    def lemmatize(text: str):
        if use_lemmatizer:
            text = ' '.join(lemmatizer.lemmatize(x) for x in text.split())
        return text

    def cityremover(text: str):
        return city_state_pattern.sub('city', text) if use_nocity else text

    def preprocess(text: str):
        """Apply every enabled step to ``text`` in pipeline order."""
        # cityremover must see the original capitalisation and the comma, so it
        # runs before lower() and alpha(), which would otherwise destroy both.
        steps = [cityremover, lower, url, alpha, dates, remove_stopwords, lemmatize, stemming]
        for step in steps:
            text = step(text)
        return text

    return preprocess

In [None]:
# Build the preprocessing callable used for the rest of the notebook:
# every cleaning step is enabled except stemming (lemmatization is used instead).
preprocess = get_preprocessing_function(
    use_lower= True,
    use_alpha= True,
    use_stemming= False,
    use_nodates= True,
    use_nourl= True,
    use_stopwords= True,
    use_lemmatizer= True,
    use_nocity=True
)

In [None]:
# Build the cleaned frame as a NEW DataFrame. A plain `preprocessed_data = raw_data`
# only aliases the same object, so assigning the column would overwrite the raw
# descriptions and re-running this cell would preprocess already-cleaned text.
preprocessed_data = raw_data.assign(
    description=raw_data['description'].apply(preprocess)
)

In [None]:
# Spot-check one cleaned description (the row with index label 100).
print(preprocessed_data['description'].loc[100])

rehababilities designed mind therapist owned speak language value high clinical standard ethic pride qualified experienced scheduling team therapy personnel social worker dedicated providing excellent patient care physical therapist assistant inpatient outpatient former employee corona ca rehababilities pro white male racist company accepting assignment assignment often taken away given white male replacement assignment lieu withdrawn assignment additional compensation work well rehababilities people know best inside scoop job salary top office location ceo insight compare pay popular role read team work life balance uncover rehababilities best company review rehababilities experiencing staffing agency would longer using type service longer initial hr assistance reached screened nice recruiter mark quite pushy disrespectful know staffing agency commission like car sale people matched hired rehababilities inc new mexico foreign profit corporation filed company filing status listed revok

# Task 2: Work and research on Hashing Vectorizer. 

Pros:
- Uses very little memory compared to other methods: because tokens are hashed directly to column indices, there is no need to store a vocabulary dictionary.

- Very fast to serialize (i.e. to convert the object into a byte stream, "pickling") because it holds no state besides its constructor parameters.

- Again, because no state is computed during fitting, it can be used in a streaming or parallel pipeline.

Cons:

- Because it uses hashing, there is no way to map a feature index back to the original token (the transform cannot be inverted).
- Another problem caused by hashing is the possibility of collisions (when two or more different tokens hash to the same feature index).
- Because the vectorizer has no state, it cannot provide IDF weighting.

Source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

In [None]:
# Both vectorizers share the same cleaning function and a whitespace tokenizer.
# `str.split` is equivalent to `lambda s: s.split()` but, unlike a lambda, keeps
# the vectorizers picklable.
# `token_pattern=None` tells scikit-learn the default regex is intentionally
# unused, which silences the "parameter 'token_pattern' will not be used" warning.
hash_vector = HashingVectorizer(
    preprocessor=preprocess,
    ngram_range=(1,1),
    tokenizer=str.split,
    token_pattern=None,
    n_features=100
)
tfidf_vector = TfidfVectorizer(
    preprocessor=preprocess,
    ngram_range=(1,1),
    tokenizer=str.split,
    token_pattern=None,
    min_df=0.15,   # ignore terms appearing in fewer than 15% of documents
    max_df=0.45,   # ignore terms appearing in more than 45% of documents
    max_features=100,
    use_idf=True,
    smooth_idf=True
)

In [None]:
# Vectorize the raw descriptions with the hashing vectorizer configured above.
vector = hash_vector.fit_transform(sentences) # feature matrix produced by HashingVectorizer

  "The parameter 'token_pattern' will not be used"


In [None]:
# Convert the hashed feature matrix to dense form so the notebook displays its values.
vector.todense() # the matrix returned by the hashing vectorizer

matrix([[-0.0557856 ,  0.0371904 , -0.0185952 , ..., -0.0557856 ,
         -0.1115712 , -0.0557856 ],
        [-0.03092332, -0.03092332,  0.04638497, ...,  0.07730829,
          0.        , -0.10823161],
        [ 0.04662524,  0.        ,  0.02331262, ...,  0.        ,
          0.02331262, -0.06993786],
        ...,
        [ 0.        ,  0.        , -0.02154652, ...,  0.        ,
          0.        , -0.1077326 ],
        [ 0.01761995, -0.05285985,  0.0352399 , ..., -0.07047979,
          0.01761995,  0.        ],
        [ 0.01557187,  0.06228747,  0.03114373, ..., -0.01557187,
          0.03114373,  0.        ]])

In [None]:
# Fit the TF-IDF vectorizer on the raw descriptions and transform them in one step.
vector1 = tfidf_vector.fit_transform(sentences)

In [None]:
# Convert the TF-IDF matrix to dense form so the notebook displays its values.
vector1.todense()

matrix([[0.04598267, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.03494895,
         0.        ],
        [0.        , 0.08190099, 0.        , ..., 0.2814436 , 0.08299097,
         0.        ],
        ...,
        [0.18789787, 0.        , 0.        , ..., 0.36630952, 0.16202384,
         0.        ],
        [0.03456915, 0.        , 0.03647552, ..., 0.06739305, 0.        ,
         0.33270152],
        [0.1854991 , 0.        , 0.19572871, ..., 0.09040827, 0.        ,
         0.        ]])

In [None]:
# List the fitted vocabulary as (term, column index) pairs, ordered by column index.
sorted(tfidf_vector.vocabulary_.items(),key=lambda x: x[1])

[('address', 0),
 ('best', 1),
 ('c', 2),
 ('call', 3),
 ('care', 4),
 ('center', 5),
 ('client', 6),
 ('co', 7),
 ('college', 8),
 ('community', 9),
 ('contact', 10),
 ('corporation', 11),
 ('county', 12),
 ('customer', 13),
 ('data', 14),
 ('day', 15),
 ('department', 16),
 ('design', 17),
 ('development', 18),
 ('director', 19),
 ('e', 20),
 ('education', 21),
 ('employee', 22),
 ('family', 23),
 ('financial', 24),
 ('firm', 25),
 ('first', 26),
 ('founded', 27),
 ('free', 28),
 ('full', 29),
 ('global', 30),
 ('group', 31),
 ('health', 32),
 ('help', 33),
 ('high', 34),
 ('home', 35),
 ('inc', 36),
 ('industry', 37),
 ('international', 38),
 ('investment', 39),
 ('job', 40),
 ('largest', 41),
 ('leading', 42),
 ('life', 43),
 ('limited', 44),
 ('linkedin', 45),
 ('llc', 46),
 ('make', 47),
 ('manager', 48),
 ('market', 49),
 ('medical', 50),
 ('member', 51),
 ('month', 52),
 ('n', 53),
 ('national', 54),
 ('need', 55),
 ('network', 56),
 ('number', 57),
 ('office', 58),
 ('online',

# Task 3: Research on Non-negative Matrix Factorization 

https://docs.google.com/presentation/d/1HPOqddXEz9BKKSnpjZYOvnfgKeNlZZOxulPgakfCQGw/edit?usp=sharing

# Task 4: Implementation of a NNMF 

In [None]:
from sklearn.decomposition import NMF

In [None]:
# Fix the seed so the factorization (and therefore the topics printed below)
# is reproducible across kernel restarts.
# NOTE(review): n_components equals the TF-IDF max_features (100), so there are as
# many topics as terms; consider a much smaller number of components.
nmf = NMF(n_components=100, random_state=42)

In [None]:
# Fit NMF on the TF-IDF matrix; W is the transformed data returned by fit_transform
# (presumably one row of component weights per description; confirm against the NMF docs).
W = nmf.fit_transform(vector1)



In [None]:
def display_topics(model, feature_names, num_top_words,topic_names=None):
    """Print a heading and the top-weighted feature names for each topic.

    Iterates over the rows of ``model.components_`` (the topic-term matrix 'H').
    For each row it prints a heading, then the ``num_top_words`` feature names
    with the largest weights, highest first, joined by ", ".

    Args:
        model: Fitted object exposing ``components_``.
        feature_names: Sequence mapping a column index to its term.
        num_top_words: How many terms to list per topic.
        topic_names: Optional per-topic labels; a missing or empty label
            falls back to the numbered heading.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nCompanies: '",label,"'")
        else:
            print("\nCompanies ", topic_idx)
        # Column indices ordered from largest to smallest weight, truncated.
        ranked_columns = weights.argsort()[::-1][:num_top_words]
        top_terms = [feature_names[col] for col in ranked_columns]
        print(", ".join(top_terms))

In [None]:
# H is the fitted components_ matrix that display_topics iterates over (the topic-term matrix).
H= nmf.components_

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vector, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vector.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vector.get_feature_names()
display_topics(nmf, tfidf_feature_names, 5)


Companies  0
help, oregon, york, inc, first

Companies  1
health, care, provider, county, inc

Companies  2
inc, oregon, sale, county, founded

Companies  3
university, school, oregon, research, college

Companies  4
investment, financial, firm, market, call

Companies  5
llc, oregon, firm, limited, sale

Companies  6
design, oregon, county, inc, founded

Companies  7
profile, view, linkedin, oregon, limited

Companies  8
n, oregon, e, york, inc

Companies  9
group, oregon, co, th, york

Companies  10
center, oregon, industry, founded, free

Companies  11
school, high, oregon, york, home

Companies  12
corporation, oregon, product, sale, home

Companies  13
care, oregon, industry, founded, free

Companies  14
limited, oregon, financial, product, address

Companies  15
oregon, portland, county, inc, founded

Companies  16
product, health, sale, york, financial

Companies  17
co, york, industry, founded, free

Companies  18
firm, client, office, salary, partner

Companies  19
internatio

# Task 5: Evaluation

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=262b3d28-05ef-49db-b57a-efab2f090880' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>