# Cleaning and Wrangling

In [4]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from library.sb_utils import save_file
from nltk.corpus import stopwords

Alright, let's see what we're are working with...

In [5]:
data = pd.read_csv("../data/fake_job_postings.csv")
data.head(3)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0


A lot of info here. I can already see a lot of NaNs. Lets check out the datatypes.

In [12]:
data.dtypes()

TypeError: 'Series' object is not callable

Looks like `job_id` can be set as the numeric index. `salary_range` should also be inspected since it's data type is an object, and I was expecting a numeric.

I'll start with `job_id`. `job_id` *should* be unique, so lets check on that first and make it the index if it is indeed unique.

In [None]:
data['job_id'].is_unique

In [None]:
clean_data = data.set_index('job_id')

In [None]:
clean_data.head(2)

Looking good so far. Time to inspect `salary_range`.

In [None]:
salary_range = clean_data['salary_range']
salary_range

Lots of NaN here... what happens if we drop those...?

In [None]:
salary_range.dropna()

Ah, okay, I see now. I'll create three new columns based on `salary_range`: `min_salary`, `max_salary` and a redefinition of `salary_range` (which will be the difference of the two).

In [None]:
clean_data['min_salary'] = 0
clean_data['max_salary'] = 0

In [None]:
clean_data['salary_range'] = clean_data['salary_range'].str.split('-')

In [None]:
for i in range(len(clean_data)):
    try:
        value = clean_data['salary_range'].iloc[i]
        if type(value) is list:
            clean_data['min_salary'].iloc[i] = int(value[0])
            clean_data['max_salary'].iloc[i] = int(value[1])
            clean_data['salary_range'].iloc[i] = int(value[1]) - int(value[0])
        else:
            clean_data['salary_range'].iloc[i] = 0
            
    except ValueError:
        clean_data['salary_range'].iloc[i] = 0
    except IndexError:
        clean_data['salary_range'].iloc[i] = 0
clean_data['salary_range'] = clean_data['salary_range'].astype(str).astype(int)

In [None]:
clean_data['salary_range'].dtype

In [None]:
clean_data[clean_data['salary_range']!=0].head(3)

Now, I'll drop the `min_salary` and `max_salary` features I created earlier.

In [None]:
clean_data.drop(['min_salary', 'max_salary'], axis=1, inplace=True)

`salary_range` and `job_id` have now been taken care of.

### Cleaning null values

I can still see a bunch of NaNs. Let's take a look at the those now.

In [None]:
clean_data["required_experience"].value_counts(dropna=False)

Plenty of NaN values in `required_experience`. It wouldn't be smart to drop them all since there are so many. I'll fill them with an actual string value for now.

In [None]:
clean_data["required_experience"] = clean_data["required_experience"].fillna(value='Not Applicable')
clean_data["required_experience"].value_counts(dropna=False)

Much better. Now for `required_education`.

In [None]:
clean_data["required_education"].value_counts(dropna=False)

Same issue as before. I'll set the NaNs to Unspecified.

In [None]:
clean_data["required_education"] = clean_data["required_education"].fillna(value='Unspecified')
clean_data["required_education"].value_counts(dropna=False)

Now for `employment_type`.

In [None]:
clean_data["employment_type"].value_counts(dropna=False)

In [None]:
clean_data["employment_type"] = clean_data["employment_type"].fillna(value='Other')
clean_data["employment_type"].value_counts(dropna=False)

Next up, `department`. 

In [None]:
data["department"].value_counts(dropna=False)

Hmmm... there are a lot of null values here. Not enough data in the column. I'll drop it later.

Almost there...

In [None]:
clean_data["industry"].value_counts(dropna=False)

Hmmm... a lot of nulls again. I'll take note and drop this column later.

Last up, `function`.

In [None]:
clean_data["function"].value_counts(dropna=False)

A lot of null values here too, but I can group them with Other.

In [None]:
clean_data["function"] = clean_data["function"].fillna(value='Other')
clean_data["function"].value_counts(dropna=False)

Finally done with looking for null values. Time to drop `department` and `industry`.

In [None]:
clean_data=clean_data.drop(['department', 'industry'], axis=1)

Let's see how the data looks now. 

In [None]:
clean_data.head(3)

Now its time to fill in the null values for the columns that contain substantial textual data.

Let's see how many nulls exsist in `title`, `location`, `company_profile`, `description`, `requirements` and `benefits`.

In [None]:
title_null = sum(clean_data['title'].isnull())
location_null = sum(clean_data['location'].isnull())
company_profile_null = sum(clean_data['company_profile'].isnull())
description_null = sum(clean_data['description'].isnull())
requirements_null = sum(clean_data['requirements'].isnull())
benefits_null = sum(clean_data['benefits'].isnull())
print(title_null, location_null, company_profile_null, description_null, requirements_null, benefits_null)

Plenty to fill in here. Let's just fill these in with empty strings.

In [None]:
clean_data['location'] = clean_data['location'].fillna("")
clean_data['company_profile'] = clean_data['company_profile'].fillna("")
clean_data['description'] = clean_data['description'].fillna("")
clean_data['requirements'] = clean_data['requirements'].fillna("")
clean_data['benefits'] = clean_data['benefits'].fillna("")

In [None]:
title_null = sum(clean_data['title'].isnull())
location_null = sum(clean_data['location'].isnull())
company_profile_null = sum(clean_data['company_profile'].isnull())
description_null = sum(clean_data['description'].isnull())
requirements_null = sum(clean_data['requirements'].isnull())
benefits_null = sum(clean_data['benefits'].isnull())
print(title_null, location_null, company_profile_null, description_null, requirements_null, benefits_null)

#### Combining all the text columns into one

Now its time to do something with these text columns. For sake of processing and simplicity, I am going to combine all the text columns into one `text` column.

In [None]:
clean_data['text'] = clean_data[['title', 'location', 'company_profile', 'description', 
                                 'requirements', 'benefits']].apply('-'.join, axis=1)
clean_data = clean_data.drop(['title', 'location', 'company_profile', 'description', 
                              'requirements', 'benefits'], axis=1)

In [None]:
clean_data.head(2)

Now that there is a single text column, I will remove all the stopwords and special characters.

In [None]:
clean_data['text'] = clean_data['text'].str.lower()
stop = stopwords.words('english')
clean_data['text'] = clean_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
clean_data['text'] = clean_data['text'].str.replace('[0-9]', ' ')
clean_data['text'] = clean_data['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
clean_data['text'] = clean_data['text'].map(lambda x: re.sub(r'\W+', ' ', x))

In [None]:
clean_data.head(3)

#### Apply Lemmatizer

The last thing I am going to apply a lemmatizer to the text column, this will make for easier processing and classification when I run my model with the text data.

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [None]:
clean_data['text'] = clean_data['text'].apply(lemmatize_text)

In [None]:
clean_data['text'] = [' '.join(map(str, l)) for l in clean_data['text']]

In [None]:
clean_data.head(3)

This looks good so far. The next step will be EDA. Hopefully, more will be learned there and I can start picking the data apart some more.

##### Save File

In [None]:
save_file(clean_data, 'clean_data.csv', '../data')