In [132]:
import pandas as pd
import numpy as np


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
import string

# Download NLTK resources (if not already downloaded)
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')


In [133]:
# Load the cleaned job-postings table; the file is resolved relative to the working directory.
df = pd.read_csv("clean1.csv")

In [134]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0.1,Unnamed: 0,job_id,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_abb,country
0,0,1,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,US,United States
1,1,2,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,New Zealand
2,2,3,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,US,United States


In [135]:
# Concatenate the free-text fields into a single Language_processing (LP) column,
# then drop the original text columns together with job_id.
# Join order is company_profile, requirements, description, benefits; missing values
# become empty strings and every field (including the last) is followed by one space.
text_columns = ['company_profile', 'requirements', 'description', 'benefits']
merged_text = df[text_columns[0]].fillna('') + ' '
for column_name in text_columns[1:]:
    merged_text = merged_text + df[column_name].fillna('') + ' '
df['LP'] = merged_text
df = df.drop(columns=text_columns + ['job_id'])

In [136]:
# Preview the first three rows to confirm the text columns were replaced by LP.
df.head(3)

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_abb,country,LP
0,0,0,1,0,Other,Internship,,,Marketing,0,US,United States,"We're Food52, and we've created a groundbreaki..."
1,1,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,New Zealand,"90 Seconds, the worlds Cloud Video Production ..."
2,2,0,1,0,,,,,,0,US,United States,Valor Services provides Workforce Solutions th...


In [137]:
##Create a Class for Analyzing Text
class TextAnalyzer:
    """
    Text-cleaning and keyword-rating helpers.

    `preprocess_text` normalises a single string with NLTK (lowercase, strip
    punctuation, tokenize, drop English stop words, lemmatize). `ratings`
    reports, for each text, the rounded percentage of its whitespace-separated
    words that also occur in a collection of "fraudulent" phrases.
    """

    def __init__(self):
        # Initialize WordNet Lemmatizer
        self.lemmatizer = WordNetLemmatizer()
        # English stop words are loaded on first use and then reused for every call
        self._stop_words = None

    def _get_stop_words(self):
        """Return the English stop-word set, reading it from NLTK only once per instance."""
        # getattr keeps this working on instances created without running __init__
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            cached = set(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def preprocess_text(self, text):
        """
        Preprocesses the given text by converting it to lowercase, removing punctuations,
        tokenizing into words, removing stopwords, and lemmatizing words to their base form.

        Parameters:
            text (str): The text to be preprocessed.

        Returns:
            str: The preprocessed text, with tokens joined by single spaces.
        """
        # Convert text to lowercase
        text = text.lower()

        # Remove punctuations.
        # NOTE: this happens before stop-word filtering, so contractions lose their
        # apostrophe first (e.g. "we've" -> "weve") and may survive the filter below.
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Tokenize text into words
        tokens = word_tokenize(text)

        # Remove stopwords (set is cached instead of being rebuilt on every call)
        stop_words = self._get_stop_words()
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatize words to their base form
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]

        # Join tokens back into a single string
        return ' '.join(tokens)

    def split_phrase_into_words(self, phrase):
        """
        Splits the input phrase into individual words.

        Parameters:
            phrase (str): The phrase to be split.

        Returns:
            list: A list containing individual words from the phrase
                (split on any run of whitespace).
        """
        return phrase.split()

    def ratings(self, texts, fraudulent):
        """
        Calculates the percentage of fraudulent words in each text relative to the total number of words.

        Parameters:
            texts (iterable of str): The texts to analyze (e.g. a list or a pandas Series).
            fraudulent (iterable of str or str): Fraudulent phrases; each phrase is split
                on whitespace and every resulting word counts as fraudulent. Any iterable
                works (list, tuple, pandas Series); a single string is treated as one phrase.
                An empty collection is allowed and yields 0 for every non-empty text.

        Returns:
            list: One entry per text, in order: the percentage rounded to an int,
                or np.nan when the text contains no words.
        """
        # A bare string is one phrase, not a sequence of one-character phrases
        if isinstance(fraudulent, str):
            fraudulent = [fraudulent]

        # Collect every word of every phrase into one lookup set
        fraudulent_set = set()
        for phrase in fraudulent:
            fraudulent_set.update(self.split_phrase_into_words(phrase))

        result = []
        for text in texts:
            tokenized_text = text.split()  # Tokenize the text
            total_words = len(tokenized_text)

            if total_words == 0:
                # No words to rate: a percentage would divide by zero
                result.append(np.nan)
                continue

            count = sum(1 for word in tokenized_text if word in fraudulent_set)
            result.append(round((count / total_words) * 100))  # Calculate the percentage

        return result


In [139]:
# Create the TextAnalyzer instance that the following cells use for preprocessing and rating
text_analyzer = TextAnalyzer()

In [140]:
# Run preprocess_text on every merged posting text and store the result in a new column.
df['preprocessed_column'] = df['LP'].map(text_analyzer.preprocess_text)

In [141]:
# The raw merged text is no longer needed once preprocessed_column exists.
df = df.drop(columns=["LP"])

##### Getting fake job phrases and processing them


In [142]:
# Project-local module providing the fraud phrase collection.
# NOTE(review): presumably a flat sequence of strings — confirm against fraudulent_Job_Phrases.
from fraudulent_Job_Phrases import fraudulent_job_phrases

# Wrap the phrases in a one-column frame so they can be mapped like the postings
phrases = pd.DataFrame(fraudulent_job_phrases, columns = ['text'])

# Preprocess the phrases with the same pipeline as the postings so their words are comparable

fraudulent_words = phrases['text'].map(text_analyzer.preprocess_text)

In [143]:
# One rating per posting, in row order: rounded percentage of its words found in the fraud phrases
# (np.nan where the preprocessed text has no words).
rating_col = text_analyzer.ratings(df['preprocessed_column'],fraudulent_words)

In [144]:
# Attach the ratings positionally: rating_col holds one value per row of df, in row order.
# An index-based inner merge would silently drop or misalign rows if df's index were not
# the default 0..n-1, and re-running it would produce Rating_x / Rating_y columns.
df = df.assign(Rating=rating_col)

In [145]:
# Discard rows without a rating, then store Rating as pandas' nullable 32-bit integer.
df = df.dropna(subset=['Rating']).astype({'Rating': 'Int32'})

In [146]:
# Display the rated frame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_abb,country,preprocessed_column,Rating
0,0,0,1,0,Other,Internship,,,Marketing,0,US,United States,food52 weve created groundbreaking awardwinnin...,10
1,1,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,New Zealand,90 second world cloud video production service...,14
2,2,0,1,0,,,,,,0,US,United States,valor service provides workforce solution meet...,6
3,3,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,United States,passion improving quality life geography heart...,11
4,4,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,United States,spotsource solution llc global human capital m...,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17875,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0,CA,Canada,vend looking awesome new talent come join u yo...,9
17876,17876,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,US,United States,weblinc ecommerce platform service provider fa...,8
17877,17877,0,0,0,Full-time,,,,,0,US,United States,provide full time permanent position many medi...,20
17878,17878,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,NG,Nigeria,1 must fluent latest version corel amp adobe c...,8


In [147]:
# Drop the country, country_abb, preprocessed_column and "Unnamed: 0" columns.
columns_to_remove = ["country", "preprocessed_column", "country_abb", "Unnamed: 0"]
df = df.drop(columns=columns_to_remove)

In [148]:
# Take an independent copy so later changes to df do not affect the exported frame.
df_clean = df.copy()

In [149]:
# Persist the processed frame. The index is written as an unnamed first column (pandas
# default), so reading this file back surfaces it as an "Unnamed: 0" column.
# NOTE(review): consider index=False if downstream readers do not need the row index — confirm.
df_clean.to_csv("NLP_processed_data.csv")