In [42]:
import string
from collections import Counter

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK resources (if not already downloaded)
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')


In [43]:
# Load clean1.csv (path is relative to the notebook's working directory).
# NOTE(review): the preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which look like
# index columns written out by earlier saves — confirm whether they are still needed.
df = pd.read_csv("clean1.csv")

In [44]:
# Preview the first three rows of the loaded frame
df.head(3)

Unnamed: 0.1,Unnamed: 0,job_id,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_abb,country
0,0,1,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,US,United States
1,1,2,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,New Zealand
2,2,3,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,US,United States


In [45]:
# Merge the free-text columns into one column, 'LP' (language processing).
# Order: company_profile, requirements, description, benefits. Missing values become
# empty strings and every part, including the last, is followed by a single space.
df['LP'] = (
    df['company_profile'].fillna('') + ' '
    + df['requirements'].fillna('') + ' '
    + df['description'].fillna('') + ' '
    + df['benefits'].fillna('') + ' '
)
# Drop the original text columns, plus job_id, now that their text lives in 'LP'
df = df.drop(columns=['company_profile', 'description', 'requirements', 'benefits', 'job_id'])

In [46]:
# Preview the first three rows after the text columns were merged into 'LP'
df.head(3)

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_abb,country,LP
0,0,0,1,0,Other,Internship,,,Marketing,0,US,United States,"We're Food52, and we've created a groundbreaki..."
1,1,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,New Zealand,"90 Seconds, the worlds Cloud Video Production ..."
2,2,0,1,0,,,,,,0,US,United States,Valor Services provides Workforce Solutions th...


In [47]:

# Initialize WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every character in string.punctuation (built once, reused per call)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Cache for the English stop-word set; filled on the first preprocess_text call
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, building the set only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


# Define function for text preprocessing
def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize, drop stop words and lemmatize ``text``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so words joined by
    punctuation are fused (e.g. "we've" -> "weve") before stop words are filtered.
    """
    # Convert text to lowercase and remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize text into words
    tokens = word_tokenize(text)

    # Remove stopwords; the set is fetched from the cache instead of being rebuilt for every row
    stop_words = _english_stop_words()

    # Lemmatize the surviving words to their base form
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into a single string
    return ' '.join(tokens)



In [48]:
# Run preprocess_text on every merged 'LP' entry and store the cleaned text in a new column
df['preprocessed_column'] = df['LP'].map(preprocess_text)

In [49]:
# The cleaned text is kept in 'preprocessed_column', so the raw merged 'LP' column is removed
df.drop(columns=["LP"], inplace=True)

# Testing code

### Get fraudulent records and get common words
> Classify a record as negative if it contains 80 percent of the fraudulent common words
### Get non fraudulent records and get common words
> Classify a record as positive if it contains 80 percent of the non-fraudulent common words, or less than 40 percent of the fraudulent common words
### Else
> Classify as Neutral


In [50]:
def common_words(text, top_n=1500):
    """Return the most frequent whitespace-separated words across ``text``.

    Parameters
    ----------
    text : iterable of str
        Documents to scan (e.g. a pandas Series of preprocessed strings).
    top_n : int, optional
        Maximum number of words to return. Defaults to 1500.

    Returns
    -------
    list of str
        Up to ``top_n`` words ordered from most to least frequent; words with
        equal counts keep the order in which they were first seen.
    """
    # Count words document by document instead of first joining everything into one huge string
    word_counts = Counter(
        word for document in text for word in document.split()
    )
    return [word for word, _ in word_counts.most_common(top_n)]
    




In [51]:
# Fraudulent postings (fraudulent == 1) and their most common words
fraudulent_records = df.loc[df['fraudulent'].eq(1)]
fraudulent_words = common_words(fraudulent_records['preprocessed_column'])

# Non-fraudulent postings (fraudulent == 0) and their most common words
non_fraudulent_records = df.loc[df['fraudulent'].eq(0)]
non_fraudulent_words = common_words(non_fraudulent_records['preprocessed_column'])



In [52]:
def ratings(texts, fraudulent):
    """Score each text by the share of its words found in ``fraudulent``.

    Parameters
    ----------
    texts : iterable of str
        Texts to score; each one is split on whitespace.
    fraudulent : iterable of str
        Reference words to look for.

    Returns
    -------
    list
        One entry per text: the rounded percentage (0-100) of its words that
        occur in ``fraudulent``, or ``np.nan`` when the text has no words.
    """
    vocabulary = set(fraudulent)

    def percent_in_vocabulary(document):
        tokens = document.split()
        if not tokens:
            # No words to score, so no percentage can be computed
            return np.nan
        hits = sum(1 for token in tokens if token in vocabulary)
        return round((hits / len(tokens)) * 100)

    return [percent_in_vocabulary(document) for document in texts]

In [53]:
# Percentage of each posting's words that appear among the fraudulent common words
rating_col = ratings(df['preprocessed_column'],fraudulent_words)

In [54]:
# Attach the ratings positionally as a 'Rating' column. rating_col was built by iterating
# df['preprocessed_column'] in row order, so direct assignment keeps each rating on its own row
# without relying on df having a default 0..n-1 index (an index merge would drop or misalign
# rows otherwise), and re-running the cell overwrites 'Rating' instead of adding
# Rating_x / Rating_y columns.
df = df.assign(Rating=rating_col)
df

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_abb,country,preprocessed_column,Rating
0,0,0,1,0,Other,Internship,,,Marketing,0,US,United States,food52 weve created groundbreaking awardwinnin...,53.0
1,1,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,New Zealand,90 second world cloud video production service...,65.0
2,2,0,1,0,,,,,,0,US,United States,valor service provides workforce solution meet...,65.0
3,3,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,United States,passion improving quality life geography heart...,63.0
4,4,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,United States,spotsource solution llc global human capital m...,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17875,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0,CA,Canada,vend looking awesome new talent come join u yo...,63.0
17876,17876,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,US,United States,weblinc ecommerce platform service provider fa...,65.0
17877,17877,0,0,0,Full-time,,,,,0,US,United States,provide full time permanent position many medi...,79.0
17878,17878,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,NG,Nigeria,1 must fluent latest version corel amp adobe c...,61.0


In [55]:
# Postings with no words get a NaN rating from ratings(); store those as 0 and switch to the
# nullable integer dtype. The result is assigned back to the column: calling
# fillna(inplace=True) on df['Rating'] is chained in-place assignment, which emits a
# FutureWarning in pandas 2.x and does not update df once Copy-on-Write is enabled.
df['Rating'] = df['Rating'].fillna(0).astype('Int64')

df
# Disabled: bucket the rating into a three-level 'flags' column (0 / 1 / 2)
#df['flags'] = 0
#df.loc[df.Rating >= 70, 'flags'] = 2
#df.loc[df.Rating <= 50, 'flags'] = 0
#df.loc[(df['Rating'] > 50) & (df['Rating'] < 70), 'flags'] = 1

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country_abb,country,preprocessed_column,Rating
0,0,0,1,0,Other,Internship,,,Marketing,0,US,United States,food52 weve created groundbreaking awardwinnin...,53
1,1,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,New Zealand,90 second world cloud video production service...,65
2,2,0,1,0,,,,,,0,US,United States,valor service provides workforce solution meet...,65
3,3,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,United States,passion improving quality life geography heart...,63
4,4,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,United States,spotsource solution llc global human capital m...,65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17875,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0,CA,Canada,vend looking awesome new talent come join u yo...,63
17876,17876,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,US,United States,weblinc ecommerce platform service provider fa...,65
17877,17877,0,0,0,Full-time,,,,,0,US,United States,provide full time permanent position many medi...,79
17878,17878,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,NG,Nigeria,1 must fluent latest version corel amp adobe c...,61


In [56]:
# Drop country, preprocessed_column and country_abb
df.drop(columns = ["country","preprocessed_column", "country_abb"], inplace = True)

In [57]:
# Keep a separate copy of the resulting frame under the name df_clean
df_clean = df.copy()