# Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from library.sb_utils import save_file

As always, I'll start by loading the data.

In [2]:
# Read the cleaned dataset from CSV. The path is relative, so it resolves
# against the directory the notebook is run from.
data = pd.read_csv("../data/clean_data.csv")

In [3]:
# Preview the first three rows to confirm the columns loaded as expected.
data.head(3)

Unnamed: 0,salary_range,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
0,0,0,1,0,Other,Internship,Unspecified,Marketing,0,marketing intern us ny new york we re food52 w...
1,0,0,1,0,Full-time,Not Applicable,Unspecified,Customer Service,0,customer service cloud video production nz auc...
2,0,0,1,0,Other,Not Applicable,Unspecified,Other,0,commissioning machinery assistant cma us ia we...


## Encoding

There are several categorical features here that need to be encoded. I'll start by refreshing my memory of the values each feature contains.

In [4]:
# Print the distinct values of every categorical column so that a suitable
# encoder can be chosen for each one.
categorical_cols = ['employment_type', 'required_experience', 'required_education', 'function']
for col in categorical_cols:
    distinct_values = set(data[col])
    print(col + ":", distinct_values, "\n")

employment_type: {'Part-time', 'Temporary', 'Full-time', 'Other', 'Contract'} 

required_experience: {'Executive', 'Internship', 'Director', 'Not Applicable', 'Mid-Senior level', 'Associate', 'Entry level'} 

required_education: {'Vocational - HS Diploma', 'Certification', 'Doctorate', 'Unspecified', 'Associate Degree', "Master's Degree", 'Vocational', 'Some High School Coursework', "Bachelor's Degree", 'Some College Coursework Completed', 'High School or equivalent', 'Vocational - Degree', 'Professional'} 

function: {'Data Analyst', 'Art/Creative', 'Administrative', 'Distribution', 'Engineering', 'Quality Assurance', 'Financial Analyst', 'General Business', 'Science', 'Production', 'Marketing', 'Writing/Editing', 'Product Management', 'Purchasing', 'Training', 'Supply Chain', 'Finance', 'Health Care Provider', 'Education', 'Human Resources', 'Accounting/Auditing', 'Management', 'Research', 'Project Management', 'Strategy/Planning', 'Advertising', 'Public Relations', 'Manufacturing', 

### Label Encoding

Two columns in the dataset call for a label encoder: `employment_type` and `function`. Their categories have no natural order, so each value is simply mapped to an integer code.

In [5]:
# One LabelEncoder instance, fitted separately on each nominal column in the
# cells that follow.
le = LabelEncoder()

In [6]:
# Fit the encoder on `employment_type` and overwrite the column with its
# integer codes in one step.
data['employment_type'] = le.fit_transform(data['employment_type'])

In [7]:
# Refit the shared encoder on `function` and overwrite the column with its
# integer codes. Refitting replaces the classes `le` held before this cell.
data['function'] = le.fit_transform(data['function'])

### Ordinal Encoder

In [8]:
# List the distinct experience levels before deciding how to order them.
set(data['required_experience'])

{'Associate',
 'Director',
 'Entry level',
 'Executive',
 'Internship',
 'Mid-Senior level',
 'Not Applicable'}

In [9]:
# Experience levels from least to most senior; a level's position in this list
# becomes its ordinal code. Every level must appear exactly once: listing
# 'Associate' twice gives it two competing ranks and leaves an unused code in
# the scale. It is kept once, directly above 'Entry level'.
emply_types = ['Not Applicable', 'Internship', 'Entry level', 'Associate',
               'Mid-Senior level', 'Director', 'Executive']
oe = OrdinalEncoder(categories=[emply_types])
# The encoder expects 2-D input, hence the double brackets (a one-column frame).
oe.fit(data[['required_experience']])
data['required_experience'] = oe.transform(data[['required_experience']])

In [10]:
# List the distinct education levels before deciding how to order them.
set(data['required_education'])

{'Associate Degree',
 "Bachelor's Degree",
 'Certification',
 'Doctorate',
 'High School or equivalent',
 "Master's Degree",
 'Professional',
 'Some College Coursework Completed',
 'Some High School Coursework',
 'Unspecified',
 'Vocational',
 'Vocational - Degree',
 'Vocational - HS Diploma'}

In [11]:
# Education levels in the rank order used for encoding; a level's position in
# this list becomes its ordinal code.
edu_types = [
    'Unspecified',
    'Some High School Coursework',
    'High School or equivalent',
    'Vocational - HS Diploma',
    'Some College Coursework Completed',
    'Associate Degree',
    'Vocational',
    'Professional',
    'Certification',
    'Vocational - Degree',
    "Bachelor's Degree",
    "Master's Degree",
    'Doctorate',
]
oe = OrdinalEncoder(categories=[edu_types])
# Fit and encode in one call; the encoder expects a 2-D (one-column) frame.
data['required_education'] = oe.fit_transform(data[['required_education']])

In [12]:
# Preview the frame again to check that the four categorical columns now hold
# numeric codes.
data.head(3)

Unnamed: 0,salary_range,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
0,0,0,1,0,2,1.0,0.0,22,0,marketing intern us ny new york we re food52 w...
1,0,0,1,0,1,0.0,0.0,7,0,customer service cloud video production nz auc...
2,0,0,1,0,2,0.0,0.0,23,0,commissioning machinery assistant cma us ia we...


## Text Processing

Now all that is left is to deal with the text data. I have already consolidated the text into one column, converted everything to lowercase, and removed all the stop words.

In [13]:
# TF-IDF vectorizer created with its default settings.
vectorizer = TfidfVectorizer()

In [14]:
# Fit the vectorizer on the whole `text` column.
# NOTE(review): this fits on every row. If a train/test split is performed
# later, consider fitting on the training rows only so that test-set vocabulary
# and document frequencies do not leak into the features — confirm intent.
vectorizer.fit(data['text'])

TfidfVectorizer()

In [15]:
# Transform the `text` column into a TF-IDF matrix with the fitted vectorizer.
vector = vectorizer.transform(data['text'])

In [None]:
# Column labels for the TF-IDF matrix. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so use its replacement when it exists
# and fall back only on older installations.
try:
    features = vectorizer.get_feature_names_out()
except AttributeError:  # older scikit-learn without get_feature_names_out
    features = vectorizer.get_feature_names()

# Build the frame straight from the sparse matrix. Going through
# todense().tolist() materialises every zero entry as a Python float
# (rows x vocabulary objects), which can exhaust memory on a large corpus.
# The intermediate `dense` list is therefore no longer created.
df = pd.DataFrame.sparse.from_spmatrix(vector, columns=features)

In [None]:
# Display the TF-IDF feature frame.
df