In [54]:
import pandas as pd

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from textblob import Word

import spacy

from commons import *

In [4]:
# Show up to 150 characters per cell so long job descriptions stay readable.
# The fully-qualified key is used on purpose: abbreviated option names are resolved by
# pattern matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 150)

In [14]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
# Only the two columns used below are parsed (`usecols`); the rest of the CSV is skipped.
# `rename` returns a new frame instead of mutating a derived one in place.
df = (
    pd.read_csv(
        r"C:\Users\User\Downloads\data_clean2.csv",
        usecols=['job_description', 'job_title'],
    )[['job_description', 'job_title']]  # fix the column order regardless of file order
    .rename(columns={'job_description': 'Description', 'job_title': 'JobTitle'})
)

In [12]:
# Dimensions (rows, columns) of the loaded frame.
df.shape

(185983, 2)

## Doing some resampling 

In [18]:
# Resample through the `reshape_df` helper (not defined in this notebook; presumably
# star-imported from `commons`), passing 10 and 100 as the minimum / maximum row amounts.
df = reshape_df(
    df,
    the_min_amount_of_rows=10,
    the_max_amount_of_rows=100,
)

In [20]:
# Dimensions (rows, columns) of the frame after the resampling step.
df.shape

(37550, 2)

## Text cleaning methods

### Stemming

In [42]:
def stemming_func(row):
    """Return ``row`` with every whitespace-separated token replaced by its stem.

    The text is tokenized with ``str.split()`` (so punctuation stays attached to the
    neighbouring word), each token is passed to a freshly created ``PorterStemmer``,
    and the stems are joined back together with single spaces.

    Parameters
    ----------
    row : str
        Text to process.

    Returns
    -------
    str
        The space-joined stems; an empty string when ``row`` has no tokens.
    """
    stemmer = PorterStemmer()
    return " ".join(map(stemmer.stem, row.split()))

In [118]:
# New column: every description passed through `stemming_func`.
df['stemming'] = df['Description'].apply(stemming_func)

## Lemmatization

* nltk library

In [45]:
def nltk_lemmatization_func(row):
    """Return ``row`` with every whitespace-separated token lemmatized by NLTK.

    Tokens come from ``str.split()``; each one is passed to ``lemmatize`` of a freshly
    created ``WordNetLemmatizer`` (called with the token only, i.e. default settings),
    and the results are joined with single spaces.

    Parameters
    ----------
    row : str
        Text to process.

    Returns
    -------
    str
        The space-joined lemmas; an empty string when ``row`` has no tokens.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in row.split())

In [117]:
# New column: every description passed through `nltk_lemmatization_func`.
df['nltk_lemmatization'] = df['Description'].apply(nltk_lemmatization_func)

* textblob library

In [52]:
def textblob_lemmatization_func(row):
    """Return ``row`` with every whitespace-separated token lemmatized by TextBlob.

    Tokens come from ``str.split()``; each one is wrapped in ``Word`` and its
    ``lemmatize()`` result (called without arguments) is collected. The lemmas are
    joined with single spaces.

    Parameters
    ----------
    row : str
        Text to process.

    Returns
    -------
    str
        The space-joined lemmas; an empty string when ``row`` has no tokens.
    """
    return " ".join(Word(token).lemmatize() for token in row.split())

In [116]:
# New column: every description passed through `textblob_lemmatization_func`.
df['textblob_lemmatization'] = df['Description'].apply(textblob_lemmatization_func)

* spacy library

In [55]:
# Load the `en_core_web_sm` spaCy pipeline once; `spacy_lemmatization_func` reuses it for every call.
# NOTE(review): only `token.lemma_` is read from the resulting docs in this notebook — consider
# disabling pipeline components the lemmatizer does not need; confirm before changing.
nlp = spacy.load('en_core_web_sm')

def spacy_lemmatization_func(words_spacy):
    """Return the lemmas of ``words_spacy`` as produced by the module-level ``nlp`` pipeline.

    The text is processed by ``nlp`` (which does its own tokenization), the ``lemma_``
    of every resulting token is collected in order, and the lemmas are joined with
    single spaces.

    Parameters
    ----------
    words_spacy : str
        Text to process.

    Returns
    -------
    str
        The space-joined lemmas of all tokens.
    """
    return " ".join(token.lemma_ for token in nlp(words_spacy))

In [115]:
# New column: every description passed through `spacy_lemmatization_func`.
df['spacy_lemmatization'] = df['Description'].apply(spacy_lemmatization_func)

## Stemming after spacy lemmatization

In [114]:
# New column: the spaCy-lemmatized text additionally passed through `stemming_func`.
df['stemming_after_spacy_lemmatization'] = df['spacy_lemmatization'].apply(stemming_func)

## Comparing the results

In [119]:
# Per text column: number of distinct whitespace tokens and mean token length.
length_of_vocabulary = {}
avarage_len_of_word = {}
for column in df:
    # Compare with equality. `column not in 'JobTitle'` would be a *substring* test and
    # would silently skip any column whose name is contained in 'JobTitle' (e.g. 'Job').
    if column == 'JobTitle':
        continue
    # Flatten all documents of the column into one token list without first building
    # a single giant joined string.
    vocabulary = [word for text in df[column] for word in text.split()]
    length_of_vocabulary[column] = len(set(vocabulary))
    # Guard the mean so a column without any tokens does not raise ZeroDivisionError.
    avarage_len_of_word[column] = (
        sum(len(word) for word in vocabulary) / len(vocabulary) if vocabulary else 0.0
    )

In [113]:
# Summary table: one row per text column, one column per metric.
pd.DataFrame(
    [length_of_vocabulary, avarage_len_of_word],
    index=["length_of_vocabulary", "avarage_len_of_word"],
).transpose()

Unnamed: 0,length_of_vocabulary,avarage_len_of_word
Description,89663.0,7.294501
stemming,71882.0,5.738655
nltk_lemmatization,83894.0,7.109987
textblob_lemmatization,83894.0,7.109987
spacy_lemmatization,76894.0,6.867337
stemming_after_spacy_lemmatization,66347.0,5.683338


## Summary

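`stemming_after_spacy_lemmatization` looks like the best option by this comparison: it yields the smallest vocabulary (66,347 distinct tokens) and the shortest average word length.
We should nevertheless also try `stemming` and `spacy_lemmatization` independently.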

Only after we run the models can we see which one works better.