# SET 1: ML-Based Approach

In [1]:
%pip install scikit-learn numpy pandas
import numpy as np
import pandas as pd
import re

Note: you may need to restart the kernel to use updated packages.


In [2]:
from textblob import TextBlob

In [5]:
# Load the human-annotated reviews and show them before any cleaning is applied.
data = pd.read_csv('HumanAnnotatedDataset.csv')
print(data)
# Cast every review to str so the string operations below never meet a non-string cell.
data.Reviews=data.Reviews.astype(str)

#Transform text to lowercase
# Series.apply returns a new Series, so the result has to be assigned back to take effect.
data['Reviews'] = data['Reviews'].apply(lambda x: x.lower())
#Removing all punctuations and special characters
# The class must be A-Z: the range A-z also spans [ \ ] ^ _ and the backtick, which would
# then survive the clean-up. A raw string keeps \s from being read as a string escape.
data['Reviews'] = data['Reviews'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

     Column1                Company  \
0          0                Netflix   
1          1                   Uber   
2          2                   Yext   
3          3                   CACI   
4          4          Covenant Eyes   
..       ...                    ...   
117      117  Reynolds and Reynolds   
118      118                  Sabre   
119      119  Legrand North America   
120      120    Samsung Electronics   
121      121         Teladoc Health   

                                               Reviews HUMAN_LABEL  
0    high performance culture challenge netflix ask...    positive  
1    pay gone years first started pay great! gas pr...    negative  
2    excellent ceo -lucky worked given opportunity ...    negative  
3    nice place work great company work benefits gr...    negative  
4    best company i’ve ever worked covenant eyes ca...    positive  
..                                                 ...         ...  
117  5 stars best part working company? manageme

Stop Word Removal

In [6]:
# Hand-written English stop-word list used for the ML-based approach.
english_stopwords = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
    "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
    "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by",
    "for", "with", "about", "against", "between", "into", "through", "during", "before",
    "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
    "under", "again", "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "should", "now",
]
stopwords_list = english_stopwords


def _drop_stopwords(review):
    """Return `review` without the whitespace-separated tokens listed in stopwords_list."""
    kept_tokens = []
    for token in review.split():
        if token in stopwords_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Rewrite the Reviews column with stop words filtered out of every review.
data['Reviews'] = data['Reviews'].apply(_drop_stopwords)

In [7]:
# Preview the first rows to confirm the cleaning and stop-word removal took effect.
data.head()

Unnamed: 0,Column1,Company,Reviews,HUMAN_LABEL
0,0,Netflix,high performance culture challenge netflix ask...,positive
1,1,Uber,pay gone years first started pay great gas pri...,negative
2,2,Yext,excellent ceo lucky worked given opportunity f...,negative
3,3,CACI,nice place work great company work benefits gr...,negative
4,4,Covenant Eyes,best company ive ever worked covenant eyes car...,positive


In [8]:
# Select the model input and target by column name instead of by position, so that
# an extra or reordered column in the CSV cannot silently swap in the wrong data.
X = data['Reviews']        # review text (the column previously reached via iloc[:, 2])
y = data['HUMAN_LABEL']    # human sentiment label (previously the last column)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Display the training portion of the review texts as a quick sanity check of the split.
X_train

15     low paced environment day consisted learning c...
72     good place work fun caring people guess starte...
22     good wlb terrible culture never work later sch...
116    overall great place intern first internship co...
91     fun workplace ive worked almost two years hone...
                             ...                        
106    good people challenges sort great product visi...
14     excellent employer microsoft excellent employe...
92     great place work great worklife balance welcom...
51     flexible flexible hours management help sucked...
102    disconnected benefits mediocre high turnaround...
Name: Reviews, Length: 91, dtype: object

In [11]:
# Display the held-out portion of the review texts as a quick sanity check of the split.
X_test

18     productive fun place work would definitely rec...
45     good work ethics working help client stay clea...
47     productive strong work environment good workli...
89     much game culture dedicated product ip making ...
4      best company ive ever worked covenant eyes car...
40     good place work people passionate products tak...
62     great place work great place work never job tr...
107    small company feel good people small company f...
31     overall great place work overall great place w...
55     okay often left manage difficult accounts cand...
53     feel well cared working gdms nature work fulfi...
119    best place work managers jobs correctly always...
10     productive fun environment overall kasisto fun...
90     electrical repair technician great place work ...
109    great wonderful work environment everyone real...
11     productive fun workplace learned new things co...
76     good pay terrible management management terrib...
56     fast moving environment 

TF-IDF Vectorization

In [12]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

my_stop_words = list(text.ENGLISH_STOP_WORDS)

# All vectorizer options are gathered in one mapping so they can be read at a glance:
# unigrams only, terms must appear in at least 5 documents and in at most 95% of them,
# scikit-learn's English stop words are dropped, and term frequency is log-scaled.
tfidf_settings = dict(
    min_df=5,
    max_df=0.95,
    stop_words=my_stop_words,
    sublinear_tf=True,
    use_idf=True,
    ngram_range=(1, 1),
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and IDF weights on the training reviews only, then reuse them
# unchanged to encode the test reviews.
train_vectors = vectorizer.fit_transform(X_train.values.astype('U'))
test_vectors = vectorizer.transform(X_test.values.astype('U'))

Random Forest

In [13]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Perform classification with a random forest (default hyper-parameters).
# The seed is fixed because the forest is randomised: without it the reported
# metrics change on every run of the notebook.
classifier = RandomForestClassifier(random_state=42)
t0 = time.time()
classifier.fit(train_vectors, y_train)
t1 = time.time()
prediction = classifier.predict(test_vectors)
t2 = time.time()
# NOTE: the "linear" in these names is historical; they hold the fit and predict durations.
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# Let classification_report derive the label set from y_test and prediction together.
# Restricting it to np.unique(prediction) removes any class the model never predicted,
# and the lookups below would then fail with KeyError. zero_division=0 reports such a
# class with precision 0 instead of emitting a warning.
report = classification_report(y_test, prediction, output_dict=True, zero_division=0)
print('positive: ', report['positive'])
print('negative: ', report['negative'])

Training time: 0.275105s; Prediction time: 0.033982s
positive:  {'precision': 0.8, 'recall': 0.6153846153846154, 'f1-score': 0.6956521739130435, 'support': 13}
negative:  {'precision': 0.7619047619047619, 'recall': 0.8888888888888888, 'f1-score': 0.8205128205128205, 'support': 18}


In [14]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with default settings, trained and evaluated on the
# same TF-IDF vectors as the previous classifier.
classifier = KNeighborsClassifier()

t0 = time.time()
classifier.fit(train_vectors, y_train)
t1 = time.time()
prediction = classifier.predict(test_vectors)
t2 = time.time()
# Durations of the fit and predict phases, in seconds.
time_linear_train, time_linear_predict = t1 - t0, t2 - t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(y_test, prediction, output_dict=True)
# Print the per-class metric dictionaries, positive class first.
for label in ('positive', 'negative'):
    print(label + ': ', report[label])

Training time: 0.003096s; Prediction time: 0.018920s
positive:  {'precision': 0.7, 'recall': 0.5384615384615384, 'f1-score': 0.608695652173913, 'support': 13}
negative:  {'precision': 0.7142857142857143, 'recall': 0.8333333333333334, 'f1-score': 0.7692307692307692, 'support': 18}


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# SET 2: Using TextBlob with Preprocessing

In [15]:
%pip install numpy pandas nltk textblob statistics
import nltk
nltk.download('stopwords')

Collecting statistics
  Downloading statistics-1.0.3.5.tar.gz (8.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: statistics
  Building wheel for statistics (setup.py): started
  Building wheel for statistics (setup.py): finished with status 'done'
  Created wheel for statistics: filename=statistics-1.0.3.5-py3-none-any.whl size=7439 sha256=ffc41222cfedaa3f1a87710813aeb906a2dfc2686fbb80e930905f03bdfd91cf
  Stored in directory: c:\users\vaishnavi\appdata\local\pip\cache\wheels\26\3c\70\9467407f3aa90862061eadcd286627b23a8bab6789b667776f
Successfully built statistics
Installing collected packages: statistics
Successfully installed statistics-1.0.3.5
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaishnavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [17]:
# Read the pipe-separated scraped reviews and make sure every review is a string.
data = pd.read_csv('dataset.csv', sep='|')
data['Reviews'] = data['Reviews'].astype(str)

# Lower-case the text, then strip commas and periods only; other punctuation
# (for example "!" and "?") is left in the text.
data['Reviews'] = data['Reviews'].str.lower()
data['Reviews'] = data['Reviews'].str.replace('[,.]', '', regex=True)

In [18]:
# English stop words from the NLTK corpus.
stopwords_list = stopwords.words('english')


def _strip_stopwords(review):
    """Return `review` with every whitespace-separated token found in stopwords_list removed."""
    return ' '.join(filter(lambda token: token not in stopwords_list, review.split()))


# Replace each review with its stop-word-free version.
data['Reviews'] = data['Reviews'].apply(_strip_stopwords)

In [19]:
# Show the cleaned frame (plain-text rendering) to verify the preprocessing above.
print(data)

                   Company                                            Reviews
0                  Netflix  high performance culture challenge netflix ask...
1                     Uber  pay gone years first started pay great! gas pr...
2                     Yext  excellent ceo -lucky worked given opportunity ...
3                     CACI  nice place work great company work benefits gr...
4            Covenant Eyes  best company i’ve ever worked covenant eyes ca...
..                     ...                                                ...
117  Reynolds and Reynolds  5 stars best part working company? management ...
118                  Sabre  easy going workplace “you always deliver work ...
119  Legrand North America  best place work managers jobs correctly always...
120    Samsung Electronics  interesting challenging work one favorite comp...
121         Teladoc Health  well-organized inclusive excellent management ...

[122 rows x 2 columns]


In [20]:
from textblob import TextBlob

In [21]:
# Polarity score of every review, in the same order as data['Reviews'].
# The iteration variable is named `review` rather than `text`: the earlier TF-IDF cell
# binds `text` to the sklearn.feature_extraction.text module, and a loop variable
# called `text` would overwrite that module for the rest of the session.
score_list = [TextBlob(review).sentiment.polarity for review in data['Reviews']]

In [22]:
import statistics
# The median of the polarity scores is used by the next cell as the cut-off
# between 'positive' and 'negative' labels.
median = statistics.median(score_list)
print(median)

0.24523521411378624


In [23]:
def _label_relative_to_median(value):
    """Map a polarity score to 'positive', 'negative' or 'neutral' by comparing it with `median`."""
    if value > median:
        return 'positive'
    if value < median:
        return 'negative'
    # Reached only when the score is neither above nor below the median.
    return 'neutral'


# One label per score, in the same order as score_list.
sentiment_label = [_label_relative_to_median(score) for score in score_list]
print(sentiment_label)

['positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'po

In [24]:
# `dataset` is another name for the same DataFrame object as `data` (no copy is made),
# so the two columns added below also appear on `data`.
dataset = data
for column_name, column_values in (('SCORE', score_list), ('SENTIMENT_LABEL', sentiment_label)):
    dataset[column_name] = column_values

print(dataset)
# Persist the scored reviews as a pipe-separated file.
dataset.to_csv("TextBlobResultswithMedian.csv", sep='|')

                   Company                                            Reviews  \
0                  Netflix  high performance culture challenge netflix ask...   
1                     Uber  pay gone years first started pay great! gas pr...   
2                     Yext  excellent ceo -lucky worked given opportunity ...   
3                     CACI  nice place work great company work benefits gr...   
4            Covenant Eyes  best company i’ve ever worked covenant eyes ca...   
..                     ...                                                ...   
117  Reynolds and Reynolds  5 stars best part working company? management ...   
118                  Sabre  easy going workplace “you always deliver work ...   
119  Legrand North America  best place work managers jobs correctly always...   
120    Samsung Electronics  interesting challenging work one favorite comp...   
121         Teladoc Health  well-organized inclusive excellent management ...   

        SCORE SENTIMENT_LAB

In [25]:
# Compare the TextBlob labels with the human annotations and report per-class
# precision, recall and F-score.
human_data = pd.read_csv('HumanAnnotatedDataset.csv', sep=',')
human_labels = human_data['HUMAN_LABEL'].tolist()

# The comparison is positional, so both label sequences must describe the same rows.
if len(human_labels) != len(sentiment_label):
    raise ValueError(
        "Label count mismatch: %d predicted labels vs %d human labels"
        % (len(sentiment_label), len(human_labels))
    )


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Confusion counts. Naming: <true class>_<outcome>, e.g. Neg_Fal_pos is a review the
# humans marked negative that was predicted positive.
Pos_Fal_neg = 0.0
Pos_Tru_pos = 0.0
Neg_Tru_neg = 0.0
Neg_Fal_pos = 0.0

# Pairs in which either label is something else (e.g. a 'neutral' prediction) are not counted.
for predicted, actual in zip(sentiment_label, human_labels):
    if predicted == 'positive' and actual == 'negative':
        Neg_Fal_pos += 1
    elif predicted == 'negative' and actual == 'positive':
        Pos_Fal_neg += 1
    elif predicted == 'positive' and actual == 'positive':
        Pos_Tru_pos += 1
    elif predicted == 'negative' and actual == 'negative':
        Neg_Tru_neg += 1

# Every ratio goes through _safe_ratio so an empty class yields 0.0 instead of
# raising ZeroDivisionError.
Pos_Prec = _safe_ratio(Pos_Tru_pos, Pos_Tru_pos + Neg_Fal_pos)
Pos_Recal = _safe_ratio(Pos_Tru_pos, Pos_Tru_pos + Pos_Fal_neg)
Pos_FScore = _safe_ratio(2*Pos_Prec*Pos_Recal, Pos_Prec + Pos_Recal)

Neg_Prec = _safe_ratio(Neg_Tru_neg, Neg_Tru_neg + Pos_Fal_neg)
Neg_Recal = _safe_ratio(Neg_Tru_neg, Neg_Tru_neg + Neg_Fal_pos)
Neg_FScore = _safe_ratio(2*Neg_Prec*Neg_Recal, Neg_Prec + Neg_Recal)

print("Positive Precision: ", Pos_Prec)
print("Positive Recall: ", Pos_Recal)
print("Positive F-Score: ", Pos_FScore)

print("\n")

print("Negative Precision: ", Neg_Prec)
print("Negative Recall: ", Neg_Recal)
print("Negative F-Score: ", Neg_FScore)

Positive Precision:  0.6721311475409836
Positive Recall:  0.8913043478260869
Positive F-Score:  0.7663551401869158


Negative Precision:  0.9180327868852459
Negative Recall:  0.7368421052631579
Negative F-Score:  0.8175182481751824


# SET 3: Web Scraper

In [28]:
!pip install selenium

Collecting selenium
  Using cached selenium-4.9.1-py3-none-any.whl (6.6 MB)
Collecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
     -------------------------------------- 384.9/384.9 kB 3.4 MB/s eta 0:00:00
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.10.2-py3-none-any.whl (17 kB)
Collecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.1.1-py3-none-any.whl (14 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
     ---------------------------------------- 58.3/58.3 kB 1.0 MB/s eta 0:00:00
Installing collected packages: outcome, h11, exceptiongroup, async-generator, wsproto, trio, trio-websocket, selenium
Successfully installed async-generator-1.10 exceptiongroup-1.1

In [30]:
!pip install webdriver_manager

Collecting webdriver_manager
  Using cached webdriver_manager-3.8.6-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.0 webdriver_manager-3.8.6


In [32]:
%pip install webdriver-manager selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv

Note: you may need to restart the kernel to use updated packages.


In [33]:
indeedCompanyBaseURL = 'https://www.indeed.com/'

# Change these constants
# Note - do not set the number of reviews to scrape per company over 150
# Otherwise indeed.com will block the scraper, by temporarily blocking your IP Address
# Trust me, I found out the hard way
# Number of Reviews needs to be in multiples of 20
NUMBER_OF_COMPANIES_TO_SCRAPE = 150
NUMBER_OF_REVIEWS_TO_SCRAPE_PER_COMPANY = 100

# Enforce the limits described above so a bad edit fails here, before any scraping starts.
if not 0 < NUMBER_OF_REVIEWS_TO_SCRAPE_PER_COMPANY <= 150:
    raise ValueError("NUMBER_OF_REVIEWS_TO_SCRAPE_PER_COMPANY must be between 1 and 150")
if NUMBER_OF_REVIEWS_TO_SCRAPE_PER_COMPANY % 20 != 0:
    raise ValueError("NUMBER_OF_REVIEWS_TO_SCRAPE_PER_COMPANY must be a multiple of 20")

In [37]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Initialize the driver
# ChromeDriverManager().install() returns the path of the chromedriver binary.
# Selenium 4 expects that path wrapped in a Service object; passing it positionally
# is deprecated and rejected by newer Selenium releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

  driver = webdriver.Chrome(ChromeDriverManager().install())


In [38]:
# Maps the link text of each company link to its href; the first occurrence wins.
companyURLs = {}
print("Beginning to scrape")
companyScraperBaseURL = 'https://www.indeed.com/jobs?q=software+intern&start='

# Walk the search results in steps of 10, using the step value as the `start` offset.
for offset in range(0, NUMBER_OF_COMPANIES_TO_SCRAPE, 10):
    print("Scraping companies - ",offset+10,"/",NUMBER_OF_COMPANIES_TO_SCRAPE)
    driver.get(companyScraperBaseURL + str(offset))
    time.sleep(1)  # short pause after loading the page
    for link in driver.find_elements(By.CLASS_NAME, 'companyOverviewLink'):
        name = link.text
        if name in companyURLs:
            continue
        companyURLs[name] = link.get_property('href')

print("Company Scraping done")

Beginning to scrape
Scraping companies -  10 / 150
Scraping companies -  20 / 150
Scraping companies -  30 / 150
Scraping companies -  40 / 150
Scraping companies -  50 / 150
Scraping companies -  60 / 150
Scraping companies -  70 / 150
Scraping companies -  80 / 150
Scraping companies -  90 / 150
Scraping companies -  100 / 150
Scraping companies -  110 / 150
Scraping companies -  120 / 150
Scraping companies -  130 / 150
Scraping companies -  140 / 150
Scraping companies -  150 / 150
Company Scraping done


In [39]:
# Maps a company name to all of its scraped review text, where every snippet is
# preceded by a single space.
reviews = {}

for company_idx, company in enumerate(companyURLs):
    url = companyURLs[company]
    print()
    print("({0}/{1}) Scraping company reviews - {2}".format(company_idx+1, len(companyURLs), company))
    review_texts = []
    # The page offset has its own name so it no longer shadows the company index.
    for start in range(0,NUMBER_OF_REVIEWS_TO_SCRAPE_PER_COMPANY,20):
        print("Progress - ",start+20,"/",NUMBER_OF_REVIEWS_TO_SCRAPE_PER_COMPANY)
        newUrl = url+'/reviews?&start='+str(start)
        driver.get(newUrl)
        # NOTE(review): 'eu4oa1w0' looks like a site-generated CSS class name - confirm
        # it still matches the review elements before relying on this scraper.
        for elem in driver.find_elements(By.CLASS_NAME,'eu4oa1w0'):
            if elem.tag_name != "span":
                continue
            txt = elem.text
            if txt != '':
                review_texts.append(txt)
    # Companies without any non-empty review text get no entry. Joining once at the
    # end avoids rebuilding an ever-growing string for every snippet.
    if review_texts:
        reviews[company] = ''.join(' ' + txt for txt in review_texts)

In [40]:
import pandas as pd

# Build the frame in one step from a list of records. DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
review_records = [
    {'Company': company, 'Reviews': review_text}
    for company, review_text in reviews.items()
]
# Explicit columns keep the CSV header stable even when nothing was scraped.
df = pd.DataFrame(review_records, columns=['Company', 'Reviews'])

# Pipe-separated output without the index column.
df.to_csv('out.csv',sep='|', index=False)