# Preprocessing the collected text
## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import fasttext
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

## Accessing data collected

In [2]:
# Read the scraped data from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='indeed_scrape.csv')
# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0.1,Unnamed: 0,rating,rating_title,rating_description,rating_pros,rating_cons
0,0,5.0,Fun Place to work,"Google has their own culture, what you see in ...","Free lunches, Lear alot, gBus commute",No cons
1,1,2.0,Poor Management,The team I work for is structured of people wi...,,
2,2,4.0,Fun place to work!,"Fun place to work, but the industry can be com...",,
3,3,5.0,Productive and fun place to work,Definitely a great place to work. Loads of act...,The cafe and lounge areas,"For me, wasn't in my field of study"
4,4,1.0,No job security,"Very bad local Management, they are not worth ...",Better job outside,"Do not work under INDIAN PEOPLE, They have the..."
...,...,...,...,...,...,...
3124,3124,4.0,Positive and fun place to work,great supportive atmosphere. lots of opportuni...,"free food, onsite gym, development opportunities",
3125,3125,4.0,Lots of Professional Experience,I learned the ethics working at such a competi...,"Free lunches, free pickups and dropping, snack...",Na
3126,3126,4.0,"Fun place to work, cool gear, local jobs prosp...","Great place to work, great peers,perks and man...",Free lunches,hindered upward mobility for technical staff t...
3127,3127,4.0,Nice place to work,I work as a contractor at Google. I was happy ...,,


## Removing unwanted columns

In [3]:
# Remove the leftover 'Unnamed: 0' column.
# Rebinding `df` (instead of `inplace=True`) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

Unnamed: 0,rating,rating_title,rating_description,rating_pros,rating_cons
0,5.0,Fun Place to work,"Google has their own culture, what you see in ...","Free lunches, Lear alot, gBus commute",No cons
1,2.0,Poor Management,The team I work for is structured of people wi...,,
2,4.0,Fun place to work!,"Fun place to work, but the industry can be com...",,
3,5.0,Productive and fun place to work,Definitely a great place to work. Loads of act...,The cafe and lounge areas,"For me, wasn't in my field of study"
4,1.0,No job security,"Very bad local Management, they are not worth ...",Better job outside,"Do not work under INDIAN PEOPLE, They have the..."
...,...,...,...,...,...
3124,4.0,Positive and fun place to work,great supportive atmosphere. lots of opportuni...,"free food, onsite gym, development opportunities",
3125,4.0,Lots of Professional Experience,I learned the ethics working at such a competi...,"Free lunches, free pickups and dropping, snack...",Na
3126,4.0,"Fun place to work, cool gear, local jobs prosp...","Great place to work, great peers,perks and man...",Free lunches,hindered upward mobility for technical staff t...
3127,4.0,Nice place to work,I work as a contractor at Google. I was happy ...,,


## Noting null values

In [4]:
# Print one "<column> <missing count>" line per column of `df`.
for col, n_missing in df.isnull().sum().items():
    print(col, n_missing)

rating 0
rating_title 7
rating_description 0
rating_pros 1814
rating_cons 1929


## Expanding contractions

In [5]:
# Work on an independent two-column frame: the rating and the review text.
rws = df[['rating', 'rating_description']].copy()
# Split every review on whitespace and expand contractions chunk by chunk;
# the result is a list of (possibly multi-word) strings per review.
rws['no_contract'] = rws['rating_description'].map(
    lambda text: list(map(contractions.fix, text.split()))
)
rws.head()

Unnamed: 0,rating,rating_description,no_contract
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not..."


## Back to string

In [6]:
# Glue the expanded chunks back together into one space-separated string
# per review (each chunk is passed through str() first).
rws['rating_description_str'] = rws['no_contract'].map(
    lambda chunks: ' '.join(str(chunk) for chunk in chunks)
)
rws.head()


Unnamed: 0,rating,rating_description,no_contract,rating_description_str
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ..."


## Identifying English sentences only

In [7]:
# Load the fastText language-identification model from a local file.
pretrained_model = "lid.176.bin"
model = fasttext.load_model(pretrained_model)

# fastText labels look like '__label__en'. Strip the prefix explicitly rather
# than slicing the repr of the label tuple at fixed offsets: fixed offsets
# silently truncate language codes that are longer than two characters.
label_prefix = "__label__"

langs = []
for sent in rws['rating_description_str']:
    # predict() returns (labels, probabilities); keep the top-ranked label.
    labels = model.predict(sent)[0]
    top_label = labels[0] if len(labels) else ""
    if top_label.startswith(label_prefix):
        top_label = top_label[len(label_prefix):]
    # An empty prediction yields '' (same value the fixed-offset slice gave).
    langs.append(top_label)

# NOTE(review): this column only records the detected language; no row is
# filtered on it in the cells visible here.
rws['langs'] = langs



## Tokenization

In [8]:
# Tokenize each contraction-free review with NLTK's word_tokenize; every
# cell of the new column holds that review's list of tokens.
rws['tokenized'] = rws['rating_description_str'].map(word_tokenize)
rws.head(5)

Unnamed: 0,rating,rating_description,no_contract,rating_description_str,langs,tokenized
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ...",en,"[Google, has, their, own, culture, ,, what, yo..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...,en,"[The, team, I, work, for, is, structured, of, ..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com...",en,"[Fun, place, to, work, ,, but, the, industry, ..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...,en,"[Definitely, a, great, place, to, work, ., Loa..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ...",en,"[Very, bad, local, Management, ,, they, are, n..."


## Lowercasing

In [9]:
# Lowercase every token, keeping one list of tokens per review.
rws['lower'] = rws['tokenized'].map(
    lambda tokens: [token.lower() for token in tokens]
)
rws.head(5)


Unnamed: 0,rating,rating_description,no_contract,rating_description_str,langs,tokenized,lower
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ...",en,"[Google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, ,, what, yo..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...,en,"[The, team, I, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com...",en,"[Fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, ,, but, the, industry, ..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...,en,"[Definitely, a, great, place, to, work, ., Loa...","[definitely, a, great, place, to, work, ., loa..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ...",en,"[Very, bad, local, Management, ,, they, are, n...","[very, bad, local, management, ,, they, are, n..."


## Removing punctuation marks

In [10]:
punc = string.punctuation
_punc_chars = frozenset(punc)


def _is_punctuation(token):
    """Return True when every character of `token` is ASCII punctuation.

    A plain `token in string.punctuation` is a substring test, so
    multi-character punctuation tokens such as '...', "''" or '--' slip
    through it. Checking character by character removes those as well.
    The empty string counts as punctuation, which matches the substring
    test ('' is a substring of every string).
    """
    return all(char in _punc_chars for char in token)


# Keep only tokens that contain at least one non-punctuation character.
rws['no_punc'] = rws['lower'].apply(
    lambda x: [word for word in x if not _is_punctuation(word)]
)
rws.head()

Unnamed: 0,rating,rating_description,no_contract,rating_description_str,langs,tokenized,lower,no_punc
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ...",en,"[Google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, what, you, ..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...,en,"[The, team, I, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com...",en,"[Fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, but, the, industry, can..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...,en,"[Definitely, a, great, place, to, work, ., Loa...","[definitely, a, great, place, to, work, ., loa...","[definitely, a, great, place, to, work, loads,..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ...",en,"[Very, bad, local, Management, ,, they, are, n...","[very, bad, local, management, ,, they, are, n...","[very, bad, local, management, they, are, not,..."


## Removing stopwords

In [11]:
# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Filter stop words out of each review's punctuation-free token list.
rws['stopwords_removed'] = rws['no_punc'].map(
    lambda tokens: list(filter(lambda token: token not in stop_words, tokens))
)
rws.head(5)

Unnamed: 0,rating,rating_description,no_contract,rating_description_str,langs,tokenized,lower,no_punc,stopwords_removed
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ...",en,"[Google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, what, you, ...","[google, culture, see, companies, organization..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...,en,"[The, team, I, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[team, work, structured, people, poor, working..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com...",en,"[Fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, but, the, industry, can...","[fun, place, work, industry, complex, dependin..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...,en,"[Definitely, a, great, place, to, work, ., Loa...","[definitely, a, great, place, to, work, ., loa...","[definitely, a, great, place, to, work, loads,...","[definitely, great, place, work, loads, activi..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ...",en,"[Very, bad, local, Management, ,, they, are, n...","[very, bad, local, management, ,, they, are, n...","[very, bad, local, management, they, are, not,...","[bad, local, management, worth, act, like, org..."


## POS tagging

In [12]:
# Tag each review's remaining tokens; every cell becomes a list of
# (token, tag) pairs.
# NOTE(review): tagging runs on lowercased, stop-word-stripped tokens, so
# the tagger sees no sentence context — confirm this ordering is intended.
rws['pos_tags'] = [
    nltk.tag.pos_tag(tokens) for tokens in rws['stopwords_removed']
]
rws.head(5)

Unnamed: 0,rating,rating_description,no_contract,rating_description_str,langs,tokenized,lower,no_punc,stopwords_removed,pos_tags
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ...",en,"[Google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, what, you, ...","[google, culture, see, companies, organization...","[(google, JJ), (culture, NN), (see, NN), (comp..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...,en,"[The, team, I, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[team, work, structured, people, poor, working...","[(team, NN), (work, NN), (structured, VBD), (p..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com...",en,"[Fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, but, the, industry, can...","[fun, place, work, industry, complex, dependin...","[(fun, NN), (place, NN), (work, NN), (industry..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...,en,"[Definitely, a, great, place, to, work, ., Loa...","[definitely, a, great, place, to, work, ., loa...","[definitely, a, great, place, to, work, loads,...","[definitely, great, place, work, loads, activi...","[(definitely, RB), (great, JJ), (place, NN), (..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ...",en,"[Very, bad, local, Management, ,, they, are, n...","[very, bad, local, management, ,, they, are, n...","[very, bad, local, management, they, are, not,...","[bad, local, management, worth, act, like, org...","[(bad, JJ), (local, JJ), (management, NN), (wo..."


In [13]:
def get_wordnet_pos(tag):
    """Translate a POS tag (e.g. 'JJ', 'VBD', 'NN', 'RB') to a WordNet POS.

    Only the first character of `tag` matters: 'J' maps to adjective,
    'V' to verb, 'N' to noun and 'R' to adverb. Any other tag, including
    the empty string, falls back to noun.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)

# Swap each tagger tag for its WordNet POS constant, keeping (token, pos) pairs.
rws['wordnet_pos'] = rws['pos_tags'].map(
    lambda tagged: [(token, get_wordnet_pos(tagger_tag)) for token, tagger_tag in tagged]
)
rws.head(5)


Unnamed: 0,rating,rating_description,no_contract,rating_description_str,langs,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ...",en,"[Google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, what, you, ...","[google, culture, see, companies, organization...","[(google, JJ), (culture, NN), (see, NN), (comp...","[(google, a), (culture, n), (see, n), (compani..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...,en,"[The, team, I, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[team, work, structured, people, poor, working...","[(team, NN), (work, NN), (structured, VBD), (p...","[(team, n), (work, n), (structured, v), (peopl..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com...",en,"[Fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, but, the, industry, can...","[fun, place, work, industry, complex, dependin...","[(fun, NN), (place, NN), (work, NN), (industry...","[(fun, n), (place, n), (work, n), (industry, n..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...,en,"[Definitely, a, great, place, to, work, ., Loa...","[definitely, a, great, place, to, work, ., loa...","[definitely, a, great, place, to, work, loads,...","[definitely, great, place, work, loads, activi...","[(definitely, RB), (great, JJ), (place, NN), (...","[(definitely, r), (great, a), (place, n), (wor..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ...",en,"[Very, bad, local, Management, ,, they, are, n...","[very, bad, local, management, ,, they, are, n...","[very, bad, local, management, they, are, not,...","[bad, local, management, worth, act, like, org...","[(bad, JJ), (local, JJ), (management, NN), (wo...","[(bad, a), (local, a), (management, n), (worth..."


## Lemmatization

In [14]:
# One shared lemmatizer instance for all reviews.
wnl = WordNetLemmatizer()
# Lemmatize each token using the WordNet POS attached to it in the previous step.
rws['lemmatized'] = rws['wordnet_pos'].map(
    lambda pairs: [wnl.lemmatize(token, pos=wn_pos) for token, wn_pos in pairs]
)
rws.head(5)

Unnamed: 0,rating,rating_description,no_contract,rating_description_str,langs,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,5.0,"Google has their own culture, what you see in ...","[Google, has, their, own, culture,, what, you,...","Google has their own culture, what you see in ...",en,"[Google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, ,, what, yo...","[google, has, their, own, culture, what, you, ...","[google, culture, see, companies, organization...","[(google, JJ), (culture, NN), (see, NN), (comp...","[(google, a), (culture, n), (see, n), (compani...","[google, culture, see, company, organizations,..."
1,2.0,The team I work for is structured of people wi...,"[The, team, I, work, for, is, structured, of, ...",The team I work for is structured of people wi...,en,"[The, team, I, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[the, team, i, work, for, is, structured, of, ...","[team, work, structured, people, poor, working...","[(team, NN), (work, NN), (structured, VBD), (p...","[(team, n), (work, n), (structured, v), (peopl...","[team, work, structure, people, poor, work, ex..."
2,4.0,"Fun place to work, but the industry can be com...","[Fun, place, to, work,, but, the, industry, ca...","Fun place to work, but the industry can be com...",en,"[Fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, ,, but, the, industry, ...","[fun, place, to, work, but, the, industry, can...","[fun, place, work, industry, complex, dependin...","[(fun, NN), (place, NN), (work, NN), (industry...","[(fun, n), (place, n), (work, n), (industry, n...","[fun, place, work, industry, complex, dependin..."
3,5.0,Definitely a great place to work. Loads of act...,"[Definitely, a, great, place, to, work., Loads...",Definitely a great place to work. Loads of act...,en,"[Definitely, a, great, place, to, work, ., Loa...","[definitely, a, great, place, to, work, ., loa...","[definitely, a, great, place, to, work, loads,...","[definitely, great, place, work, loads, activi...","[(definitely, RB), (great, JJ), (place, NN), (...","[(definitely, r), (great, a), (place, n), (wor...","[definitely, great, place, work, load, activit..."
4,1.0,"Very bad local Management, they are not worth ...","[Very, bad, local, Management,, they, are, not...","Very bad local Management, they are not worth ...",en,"[Very, bad, local, Management, ,, they, are, n...","[very, bad, local, management, ,, they, are, n...","[very, bad, local, management, they, are, not,...","[bad, local, management, worth, act, like, org...","[(bad, JJ), (local, JJ), (management, NN), (wo...","[(bad, a), (local, a), (management, n), (worth...","[bad, local, management, worth, act, like, org..."


## Saving data for future use

In [15]:
# Persist every column of `rws`. The index is written as the first,
# unnamed CSV column because `index` is left at its default.
rws.to_csv(path_or_buf='indeed_scrape_clean.csv')