# Company Profile EDA

In [None]:
import pandas as pd
import nltk

import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from matplotlib import pyplot as plt

from nltk.tokenize import PunktSentenceTokenizer,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import CountVectorizer

!pip install lda
import lda

import re
from collections import Counter
import math

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Read In Data

In [None]:
# Load the job postings from a CSV file in the working directory into a DataFrame
fake_jobs = pd.read_csv('fake_job_postings.csv')

In [None]:
# Quick look at the data and at the company profile column.
# NOTE(review): presumably only the last expression of a notebook cell is rendered,
# so the head() preview may not be displayed — confirm.
fake_jobs.head()
fake_jobs['company_profile']

## Frequency

In [None]:
def removeStopwords(textData):
    """
    Split a text on whitespace and drop stopwords and stand-alone punctuation.

    textData: the text (string) to filter.
    Returns the list of remaining tokens, in their original order and casing.
    """
    # Build the stopword set once and keep it on the function object;
    # stopwords.words() used to be evaluated again for every single text.
    stop_words = getattr(removeStopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = set(stopwords.words('english')) # Choose the English stopwords
        stop_words.add('–') # Add the en dash to the stopwords
        removeStopwords._stop_words = stop_words
    # Tokens that are exactly a double-quote, a hyphen, a period or a comma are dropped too
    punctuation = {'"', '-', '.', ','}
    return [w for w in textData.split() if w not in punctuation and w.lower() not in stop_words]

def filteredFreq(postList):
    """
    Count how often each word occurs across all posts.

    postList: iterable of posts, each post being an iterable of words.
    Returns a Series indexed by word with the counts of the (at most) 400 most
    frequent words, ordered from most to least frequent.
    """
    # Counter tallies in one pass; growing a Series one label at a time
    # copied the whole Series for every new word.
    word_counts = Counter()
    for post in postList:
        word_counts.update(post)
    top_words = dict(word_counts.most_common(400)) # Already ordered by descending count
    # Explicit dtype so that an empty input still gives a well-typed Series
    return pd.Series(top_words, dtype='int64')

# Drop the missing profiles first: astype(str) alone turns NaN into the literal
# text 'nan', which would then be counted as a frequent word.
company_profile = fake_jobs['company_profile'].dropna().astype(str)
word_list = company_profile.map(removeStopwords) # Tokenize every profile and remove its stopwords
frequency_series = filteredFreq(word_list) # Counts of the most frequent words

In [None]:
print(frequency_series[:50]) # The 50 most frequent words
#print(frequency_series[50:100])
#print(frequency_series[100:150])
print('Important Attributes: services, technology, platform, software, communications, marketing, education, design, startup, HR')
# Bar chart of the 25 most frequent words
top_words = frequency_series[:25]
plt.figure(figsize=(20, 5))
plt.bar(top_words.index, top_words, color='mediumaquamarine')
plt.xticks(rotation='vertical')
plt.show()

# fake_jobs['company_profile'][fake_jobs['company_profile'] == 'nan'] MADE SURE THERE WERE NO FALSE nan

## Lift

In [None]:
def removeStopwords(textData):
    """
    Split a text on whitespace and drop stopwords and stand-alone punctuation.

    textData: the text (string) to filter.
    Returns the list of remaining tokens, in their original order and casing.
    """
    # Build the stopword set once and keep it on the function object;
    # stopwords.words() used to be evaluated again for every single text.
    stop_words = getattr(removeStopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = set(stopwords.words('english')) # Choose the English stopwords
        stop_words.add('–') # Add the en dash to the stopwords
        removeStopwords._stop_words = stop_words
    # Tokens that are exactly a double-quote, a hyphen, a period or a comma are dropped too
    punctuation = {'"', '-', '.', ','}
    return [w for w in textData.split() if w not in punctuation and w.lower() not in stop_words]

def getLiftAB(termA, termB, postList):
    """
    Compute the lift between two terms over a collection of posts.

    A post counts for a term when at least one of its words equals that term.
    The lift is (number of posts * posts with both terms) / (posts with A * posts with B).
    """
    posts_with_a = 0
    posts_with_b = 0
    posts_with_both = 0
    for post in postList:
        found_a = False
        found_b = False
        for word in post:
            if not found_a and word == termA:
                found_a = True
            if not found_b and word == termB:
                found_b = True
            if found_a and found_b:
                break # Both terms seen, the rest of the post cannot change anything
        if found_a:
            posts_with_a += 1
        if found_b:
            posts_with_b += 1
        if found_a and found_b:
            posts_with_both += 1
    # A term that never appears would make the denominator zero, so 1 is used instead
    denominator = max(posts_with_a, 1) * max(posts_with_b, 1)
    return (len(postList) * posts_with_both) / denominator

def liftTable(importantList, postList):
    """
    Build the symmetric table of pairwise lifts between the given words.

    importantList: words to compare (the ones with highest frequencies).
    postList: collection of tokenized posts, passed on to getLiftAB.
    Returns a DataFrame whose rows and columns are the words of importantList;
    the diagonal is 0 and every other cell holds the lift of that pair of words.
    """
    terms = list(importantList)
    # Allocate the whole table up front instead of growing an empty frame cell by
    # cell around a placeholder column 0 that had to be dropped afterwards (and
    # that would also have removed a real attribute labelled 0).
    liftDataFrame = pd.DataFrame(0.0, index=terms, columns=terms)
    for i, termA in enumerate(terms):
        for termB in terms[i+1:]: # Every unordered pair is computed only once
            liftAB = getLiftAB(termA, termB, postList)
            liftDataFrame.loc[termA, termB] = liftAB
            liftDataFrame.loc[termB, termA] = liftAB
    return liftDataFrame

company_profile = fake_jobs['company_profile'].dropna().astype(str) # Get data
word_list = company_profile.map(removeStopwords) # For the company profile column, map the function removeStopwords
# 'software' used to be misspelled here, so that attribute could never match a word
attribute_list = ['services', 'technology', 'platform', 'software', 'communications', 'marketing', 'education', 'design', 'startup', 'HR']
lift_table = liftTable(attribute_list, word_list)
print(lift_table)

## MDS Plot

In [None]:
# Source1: https://stackabuse.com/guide-to-multidimensional-scaling-in-python-with-scikit-learn/
# Source2: https://stackoverflow.com/questions/14432557/scatter-plot-with-different-text-at-each-data-point

def removeStopwords(textData):
    """
    Split a text on whitespace and drop stopwords and stand-alone punctuation.

    textData: the text (string) to filter.
    Returns the list of remaining tokens, in their original order and casing.
    """
    # Build the stopword set once and keep it on the function object;
    # stopwords.words() used to be evaluated again for every single text.
    stop_words = getattr(removeStopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = set(stopwords.words('english')) # Choose the English stopwords
        stop_words.add('–') # Add the en dash to the stopwords
        removeStopwords._stop_words = stop_words
    # Tokens that are exactly a double-quote, a hyphen, a period or a comma are dropped too
    punctuation = {'"', '-', '.', ','}
    return [w for w in textData.split() if w not in punctuation and w.lower() not in stop_words]

def getLiftAB(termA, termB, postList):
    """
    Compute the lift between two terms (brands) over a collection of posts.

    A post counts for a term when at least one of its words equals that term.
    The lift is (number of posts * posts with both terms) / (posts with A * posts with B).
    """
    posts_with_a = 0
    posts_with_b = 0
    posts_with_both = 0
    for post in postList:
        found_a = False
        found_b = False
        for word in post:
            if not found_a and word == termA:
                found_a = True
            if not found_b and word == termB:
                found_b = True
            if found_a and found_b:
                break # Both terms seen, the rest of the post cannot change anything
        if found_a:
            posts_with_a += 1
        if found_b:
            posts_with_b += 1
        if found_a and found_b:
            posts_with_both += 1
    # A term that never appears would make the denominator zero, so 1 is used instead
    denominator = max(posts_with_a, 1) * max(posts_with_b, 1)
    return (len(postList) * posts_with_both) / denominator

def liftTable(importantList, postList):
    """
    Build the symmetric table of pairwise lifts between the given words.

    importantList: words to compare (the ones with highest frequencies).
    postList: collection of tokenized posts, passed on to getLiftAB.
    Returns a DataFrame whose rows and columns are the words of importantList;
    the diagonal is 0 and every other cell holds the lift of that pair of words.
    """
    terms = list(importantList)
    # Allocate the whole table up front instead of growing an empty frame cell by
    # cell around a placeholder column 0 that had to be dropped afterwards (and
    # that would also have removed a real attribute labelled 0).
    liftDataFrame = pd.DataFrame(0.0, index=terms, columns=terms)
    for i, termA in enumerate(terms):
        for termB in terms[i+1:]: # Every unordered pair is computed only once
            liftAB = getLiftAB(termA, termB, postList)
            liftDataFrame.loc[termA, termB] = liftAB
            liftDataFrame.loc[termB, termA] = liftAB
    return liftDataFrame

def inverseLift(importantList, postList, zeroLiftDistance=8):
    """
    Build the symmetric table of inverse lifts (1/lift) between the given words.

    importantList: words to compare (the ones with highest frequencies).
    postList: collection of tokenized posts, passed on to getLiftAB.
    zeroLiftDistance: value stored for a pair whose lift is 0, where 1/lift is
        undefined. Defaults to 8, the value that was previously hard-coded.
    Returns a DataFrame whose rows and columns are the words of importantList;
    the diagonal is 0 and every other cell holds the inverse lift of that pair.
    """
    terms = list(importantList)
    # Allocate the whole table up front instead of growing an empty frame cell by
    # cell around a placeholder column 0 that had to be dropped afterwards.
    liftDataFrame = pd.DataFrame(0.0, index=terms, columns=terms)
    for i, termA in enumerate(terms):
        for termB in terms[i+1:]: # Every unordered pair is computed only once
            liftAB = getLiftAB(termA, termB, postList)
            # Named differently from the function so that it is not shadowed in here
            inverse = zeroLiftDistance if liftAB == 0 else 1/liftAB
            liftDataFrame.loc[termA, termB] = inverse
            liftDataFrame.loc[termB, termA] = inverse
    return liftDataFrame

word_list = fake_jobs['company_profile'].dropna().astype(str).map(removeStopwords) # Get data
# 'software' used to be misspelled here, so that attribute could never match a word
attribute_list = ['services', 'technology', 'platform', 'software', 'communications', 'marketing', 'education', 'design', 'startup', 'HR']
attribute_lifts = inverseLift(attribute_list, word_list)
# The inverse lifts are the pairwise distances themselves, so MDS has to be told to use
# them as given; by default it treats every row as a feature vector and computes
# Euclidean distances between the rows instead.
mdsAttributes = MDS(dissimilarity='precomputed', random_state=0)
liftTransform = mdsAttributes.fit_transform(attribute_lifts) # Transform distances into 2D
colors = ['lightcoral', 'yellowgreen', 'teal', 'orangered', 'gold', 'forestgreen', 'firebrick', 'maroon', 'goldenrod', 'darkviolet']
plt.scatter(liftTransform[:, 0], liftTransform[:, 1], c=colors)
plt.title('MDS Plot')
# Label every point with its attribute
for i, txt in enumerate(attribute_list):
    plt.annotate(txt, (liftTransform[i, 0], liftTransform[i, 1]))
plt.show()

## Create Company ID Column

In [None]:
# One label ('company_0', 'company_1', ...) per distinct company profile, in the
# order in which value_counts() lists the profiles
profile_counts = fake_jobs['company_profile'].value_counts()
company_id = ['company_{}'.format(i) for i in range(len(profile_counts))]
company_id_series = pd.Series(company_id, profile_counts.index)

def getCompanyID(company_profile):
    """
    Look up the company label assigned to a company profile text.
    """
    return company_id_series[company_profile]

# Keep only the postings that have a company profile and label each one
company_profile_LDA = fake_jobs.copy()
has_profile = company_profile_LDA['company_profile'].notnull()
company_profile_LDA = company_profile_LDA[has_profile]
company_profile_LDA['company_id'] = company_profile_LDA['company_profile'].map(getCompanyID)
company_profile_LDA[:2]

## Topic Modeling with LDA

In [None]:
# Source: Prof. Barua

# Column names and number of topics are typed in by the user
fakeJob_id = input('provide the column name for id: ') # job_id
company_id = input('provide the column name for company id: ') # company_id
company_profile = input('provide the column name for text: ') # company_profile
ntopics = input('Provide the number of latent topics: ')

# Text-processing tools used by the vectorizer
word_tokenizer = RegexpTokenizer(r'\w+')
wordnet_lemmatizer = WordNetLemmatizer()

# English stopwords plus the en dash
extraStopWords = ['–']
stopwords_nltk = set(stopwords.words('english')).union(extraStopWords)

In [None]:
def tokenize_text(version_desc):
    """
    Lowercase a text, split it into word tokens and lemmatize every token.

    version_desc: the text (string) to tokenize.
    Returns the list of lemmatized tokens.
    """
    tokens = word_tokenizer.tokenize(version_desc.lower())
    # Lemmatize token by token: the lemmatizer takes a single word, so handing it
    # the whole text (as was done before) effectively left the text untouched.
    return [wordnet_lemmatizer.lemmatize(token) for token in tokens]

# stop_words is passed as a list: recent scikit-learn versions validate the
# parameter and do not accept a set.
vec_words = CountVectorizer(tokenizer=tokenize_text, stop_words=sorted(stopwords_nltk), decode_error='ignore')
# Document-term count matrix of the text column chosen by the user
total_features_words = vec_words.fit_transform(company_profile_LDA[company_profile])

print(total_features_words.shape)

In [None]:
# Fit the LDA topic model on the document-term counts, with the number of topics typed in by the user
model = lda.LDA(n_topics=int(ntopics), n_iter=300, random_state=1) # CHANGE n_iter BACK TO 500
model.fit(total_features_words)

In [None]:
topic_word = model.topic_word_
# The model was fitted on the text column of company_profile_LDA, so its
# document-topic rows follow the rows of that frame. Reuse the frame's index so
# that the join pairs every posting with its own topics; a default 0..n-1 index
# does not line up with an index that has gaps from the removed null profiles.
doc_topic = pd.DataFrame(model.doc_topic_, index=company_profile_LDA.index)
company_profile_LDA = company_profile_LDA.join(doc_topic)
jobs = pd.DataFrame()

# Average weight of every topic per company
for i in range(int(ntopics)):
    topic = "topic_" + str(i)
    jobs[topic] = company_profile_LDA.groupby([company_id])[i].mean()

jobs = jobs.reset_index()
topics = pd.DataFrame(topic_word)
# get_feature_names() is no longer available in recent scikit-learn versions;
# use get_feature_names_out() when it exists and fall back otherwise.
if hasattr(vec_words, 'get_feature_names_out'):
    topics.columns = vec_words.get_feature_names_out()
else:
    topics.columns = vec_words.get_feature_names()
topics1 = topics.transpose() # One row per word, one column per topic
print ("Topics word distribution written in file AMLproject_topic_word_dist.xlsx ")
topics1.to_excel("AMLproject_topic_word_dist.xlsx")
jobs.to_excel("AMLproject_document_topic_dist.xlsx",index=False)
print ("Document topic distribution written in file AMLproject_document_topic_dist.xlsx ")

## LDA Analysis

In [None]:
# Read back the two spreadsheets with the topic distributions
companyTopics = pd.read_excel("AMLproject_document_topic_dist.xlsx")
wordTopics = pd.read_excel("AMLproject_topic_word_dist.xlsx")

# Give the unnamed first column a proper name
wordTopics = wordTopics.rename(columns={'Unnamed: 0': 'word'})
wordTopics.head()

In [None]:
# The 20 rows with the largest values in column 0 and in column 2
wordTopics.sort_values(by=0, ascending=False).head(20) # general
wordTopics.sort_values(by=2, ascending=False).head(20) #

## Company ID Columns

In [None]:
# One label ('company_0', 'company_1', ...) per distinct company profile, in the
# order in which value_counts() lists the profiles
profile_counts = fake_jobs['company_profile'].value_counts()
company_id = ['company_{}'.format(i) for i in range(len(profile_counts))]
company_id_series = pd.Series(company_id, profile_counts.index)

def getCompanyID(company_profile):
    """
    Look up the company label assigned to a company profile text.
    """
    return company_id_series[company_profile]

# Keep three columns, only for the postings that have a company profile, and label each one
company_profile_Cosine = fake_jobs.copy()[['job_id', 'company_profile', 'location']]
has_profile = company_profile_Cosine['company_profile'].notnull()
company_profile_Cosine = company_profile_Cosine[has_profile]
company_profile_Cosine['company_id'] = company_profile_Cosine['company_profile'].map(getCompanyID)
company_profile_Cosine[:2]

## Cosine Similarity

In [None]:
# Source: https://stackoverflow.com/questions/15173225/calculate-cosine-similarity-given-2-sentence-strings
# A token is a run of word characters; commas, hyphens and whitespace only separate
# tokens. The previous pattern [^,-]\w+ kept the space in front of a word as part of
# the token and could not match one-letter words.
WORD = re.compile(r"\w+")

def text_to_vector(text):
    """
    Convert text to a bag-of-words vector.

    text: either a string or a list of words. For a list, apostrophes are removed
        from every word and the words are joined with spaces before tokenizing.
    Returns a Counter mapping every lowercase token to its number of occurrences.
    """
    if isinstance(text, list):
        text = ' '.join(word.replace("'", "") for word in text)
    # Lowercase in both cases so that vectors built from a string and from a list are comparable
    return Counter(WORD.findall(text.lower()))

def get_cosine(vec1, vec2):
    """
    Return the cosine similarity between two word-count vectors.

    The result is 0.0 when the product of the two vector norms is zero.
    """
    shared_words = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[word] * vec2[word] for word in shared_words)
    squares1 = sum(count ** 2 for count in vec1.values())
    squares2 = sum(count ** 2 for count in vec2.values())
    denominator = math.sqrt(squares1) * math.sqrt(squares2)
    if denominator:
        return float(dot_product) / denominator
    return 0.0

# Get attributes from user
attributes = str(input('What attributes are you looking for? (separate you attributes by commas): ')).lower()
# Split on the comma alone and trim every piece, so that "a,b" and "a, b" both work
attributeList = [attribute.strip() for attribute in attributes.split(',') if attribute.strip()]
attributeDF = pd.DataFrame({'Attributes': attributeList})
attributeDF.to_csv('CompanyAttributes.csv', header=True, index=False)

to_compareFile = pd.read_csv('CompanyAttributes.csv') # Open attributes CSV
# astype(str) because read_csv may parse purely numeric attributes as numbers
to_compareText = ' '.join(to_compareFile['Attributes'].astype(str))
to_compareVector = text_to_vector(to_compareText)
noStopWords = company_profile_Cosine['company_profile'].map(removeStopwords) # For the company profile map the function removeStopwords
company_profileVector = noStopWords.map(text_to_vector)
# Keep the index of the postings on the scores: company_profile_Cosine has gaps in
# its index, so a Series with a default 0..n-1 index would be aligned to the wrong rows
# when assigned as a column below.
cosines = pd.Series(
    [get_cosine(to_compareVector, profile_vector) for profile_vector in company_profileVector],
    index=company_profileVector.index,
    dtype='float64',
)
company_profile_DF = pd.DataFrame(company_profile_Cosine['company_id'])
company_profile_DF['company_profile'] = company_profile_Cosine['company_profile']
company_profile_DF['company_profile'] = company_profile_DF['company_profile'].apply(lambda x: x.replace('\n', ' '))
company_profile_DF['Similarity'] = cosines
company_profile_DF.sort_values(by='Similarity', ascending=False, inplace=True)
company_profile_DF

## Sentiment Analysis with VADER

In [None]:
# Source: https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/
def get_sentiment(sentence):
    """
    Get the sentiment of a sentence.

    sentence: the text to score.
    Returns a string with the sentiment label ('Positive', 'Negative' or
    'Neutral', chosen from the compound score) followed by the matching
    pos/neg/neu score expressed as a percentage.
    """
    # Create the SentimentIntensityAnalyzer object only once and keep it on the
    # function object; it used to be constructed again for every sentence.
    sid_obj = getattr(get_sentiment, '_analyzer', None)
    if sid_obj is None:
        sid_obj = SentimentIntensityAnalyzer()
        get_sentiment._analyzer = sid_obj

    # polarity_scores gives a sentiment dictionary
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)

    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0.05:
        reviewSentiment = 'Positive: {}%'.format(sentiment_dict['pos']*100)
    elif sentiment_dict['compound'] <= -0.05:
        reviewSentiment = 'Negative: {}%'.format(sentiment_dict['neg']*100)
    else:
        reviewSentiment = 'Neutral: {}%'.format(sentiment_dict['neu']*100)

    return reviewSentiment

# Score every company profile, then show the four rows with the lowest similarity
profile_sentiments = company_profile_DF['company_profile'].map(get_sentiment)
company_profile_DF['Sentiment'] = profile_sentiments
company_profile_DF.sort_values(by='Similarity').head(4)

## Realizing Companies Are Either Fake or Not Fake

In [None]:
# One label ('company_0', 'company_1', ...) per distinct company profile, in the
# order in which value_counts() lists the profiles
profile_counts = fake_jobs['company_profile'].value_counts()
company_id = ['company_{}'.format(i) for i in range(len(profile_counts))]
company_id_series = pd.Series(company_id, profile_counts.index)

def getCompanyID(company_profile):
    """
    Look up the company label assigned to a company profile text.
    """
    return company_id_series[company_profile]

# Keep only the postings that have a company profile and label each one
FakeJobs = fake_jobs.copy()
has_profile = FakeJobs['company_profile'].notnull()
FakeJobs = FakeJobs[has_profile]
FakeJobs['company_id'] = FakeJobs['company_profile'].map(getCompanyID)

print(FakeJobs['company_id'].value_counts())

# Split the postings by the fraudulent flag
is_fraudulent = FakeJobs['fraudulent']
fraudulents = FakeJobs[is_fraudulent == 1]
NOTfraudulents = FakeJobs[is_fraudulent == 0]

In [None]:
# Print one line for every company that has fraudulent postings and also
# appears among the non-fraudulent ones
legit_company_ids = NOTfraudulents['company_id'].value_counts().index
for company in fraudulents['company_id'].value_counts().index:
    if company in legit_company_ids:
        print('In Not Fake')