In [171]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [172]:
file_path = "Data/Train_rev1.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,"Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,25000 - 30000/annum 25K-30K negotiable,27500,cv-library.co.uk
4,12613647,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...","Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk


In [173]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244768 entries, 0 to 244767
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Id                  244768 non-null  int64 
 1   Title               244767 non-null  object
 2   FullDescription     244768 non-null  object
 3   LocationRaw         244768 non-null  object
 4   LocationNormalized  244768 non-null  object
 5   ContractType        65442 non-null   object
 6   ContractTime        180863 non-null  object
 7   Company             212338 non-null  object
 8   Category            244768 non-null  object
 9   SalaryRaw           244768 non-null  object
 10  SalaryNormalized    244768 non-null  int64 
 11  SourceName          244767 non-null  object
dtypes: int64(2), object(10)
memory usage: 22.4+ MB


In [174]:
# based on this those 3 features should be dropped before removing the null values
df.isnull().sum()

Id                         0
Title                      1
FullDescription            0
LocationRaw                0
LocationNormalized         0
ContractType          179326
ContractTime           63905
Company                32430
Category                   0
SalaryRaw                  0
SalaryNormalized           0
SourceName                 1
dtype: int64

In [175]:
# Drop the columns that we won't need, Company, ContractType, ContractTime, SalaryRaw, LocationRaw, ID
new_df = df.drop(columns = ["Company", "ContractType", "ContractTime", "SalaryRaw", "LocationRaw", "Id", "SourceName"])
new_df.head()

Unnamed: 0,Title,FullDescription,LocationNormalized,Category,SalaryNormalized
0,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,Engineering Jobs,25000
1,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,Engineering Jobs,30000
2,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,Engineering Jobs,30000
3,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,Surrey,Engineering Jobs,27500
4,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...",Surrey,Engineering Jobs,25000


In [176]:
len(new_df)

244768

In [177]:
# check datatypes again
new_df.dtypes

Title                 object
FullDescription       object
LocationNormalized    object
Category              object
SalaryNormalized       int64
dtype: object

In [178]:
# find how many null values are in the new_df
new_df.isnull().sum()

Title                 1
FullDescription       0
LocationNormalized    0
Category              0
SalaryNormalized      0
dtype: int64

In [179]:
# drop the null values of this dataframe
new_df.dropna(inplace = True)

In [180]:
new_df.isnull().sum()

Title                 0
FullDescription       0
LocationNormalized    0
Category              0
SalaryNormalized      0
dtype: int64

In [181]:
sliced_df = new_df.loc[:1000, :]

In [182]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Code to download corpora
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Justi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Justi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Justi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [183]:
lemmetizer = WordNetLemmatizer()

In [184]:
stemmer = SnowballStemmer("english")

In [185]:
def clean_text(article):
    sw = set(stopwords.words('english'))
    sw_addons = {"k", "uk","also"} 
    # Substitute everything that is not a letter with an empty string
    regex = re.compile("[^a-zA-Z ]")
    # we sub in an extra character for anything that is not a character from the
    # above line of code
    re_clean = regex.sub('', article)
    # tokenize each word in the sentence
    words = word_tokenize(re_clean)
    # obtain the root word for each word 
    #stem = [stemmer.stem(word) for word in words]
    lem = [lemmatizer.lemmatize(word) for word in words]
    # obtain an output that is all lowercase and not in the stop words
    #output = [word.lower() for word in stem if word.lower() not in sw.union(sw_addons)]
    output = [word.lower() for word in lem if word.lower() not in sw.union(sw_addons)]
    output = ' '.join(output)
    return output

In [186]:
# test function on sliced df to make sure it is correct
clean_text(sliced_df["FullDescription"][0])

'engineering systems analyst dorking surrey salary client located dorking surrey looking engineering systems analyst client provides specialist software development keywords mathematical modelling risk analysis system modelling optimisation miser pioneeer engineering systems analyst dorking surrey salary'

In [187]:
# create new column that has the clean description of the job
sliced_df['CleanDescription'] = sliced_df['FullDescription'].apply(clean_text)
sliced_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Title,FullDescription,LocationNormalized,Category,SalaryNormalized,CleanDescription
0,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,Engineering Jobs,25000,engineering systems analyst dorking surrey sal...
1,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,Engineering Jobs,30000,stress engineer glasgow salary currently looki...
2,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,Engineering Jobs,30000,mathematical modeller simulation analyst opera...
3,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,Surrey,Engineering Jobs,27500,engineering systems analyst mathematical model...
4,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...",Surrey,Engineering Jobs,25000,pioneer miser engineering systems analyst dork...


In [188]:
# Now we want to get the TD-IDF of the CleanDescription Column

# create varialbe to store tfidf vectorizer
#vectorizer = TfidfVectorizer(stop_words="english", min_df = 3)

In [189]:
# Calculating the COUNT for the working corpus.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words="english",min_df=3)
count_vectorizer = vectorizer.fit_transform(sliced_df["CleanDescription"])
words_df = pd.DataFrame(count_vectorizer.toarray(), columns=vectorizer.get_feature_names())
words_df.head()

Unnamed: 0,aa,abaqus,aberdeen,aberdeenshire,abergavenny,abi,ability,able,absence,abuse,...,yarmouth,year,yes,york,yorkshire,youd,youll,young,younger,youre
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,3,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [190]:
filtered_df = words_df.replace([2,3,4,5,6,7,8,9],1)

In [191]:
# Now set the vectorizer on the CleanDescription column and store it in a new variable
#tf_idf_2 = vectorizer.fit_transform(sliced_df['CleanDescription'])
# we now want to set the tf_idf to a dataframe
#tf_score_df_2 = pd.DataFrame(tf_idf_2.toarray(), columns = vectorizer.get_feature_names())
#tf_score_df_2.head()

In [192]:
combined_df = pd.concat([sliced_df, filtered_df], axis = 1)
combined_df.head()

Unnamed: 0,Title,FullDescription,LocationNormalized,Category,SalaryNormalized,CleanDescription,aa,abaqus,aberdeen,aberdeenshire,...,yarmouth,year,yes,york,yorkshire,youd,youll,young,younger,youre
0,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,Engineering Jobs,25000,engineering systems analyst dorking surrey sal...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,Engineering Jobs,30000,stress engineer glasgow salary currently looki...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,Engineering Jobs,30000,mathematical modeller simulation analyst opera...,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,Surrey,Engineering Jobs,27500,engineering systems analyst mathematical model...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...",Surrey,Engineering Jobs,25000,pioneer miser engineering systems analyst dork...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
combined_df.isnull().sum()

Title                 0
FullDescription       0
LocationNormalized    0
Category              0
SalaryNormalized      0
                     ..
youd                  0
youll                 0
young                 0
younger               0
youre                 0
Length: 2891, dtype: int64

In [194]:
# drop the 2 description columns as we no longer need them
combined_df = combined_df.drop(columns = ["FullDescription", "CleanDescription"])
combined_df.head()

Unnamed: 0,Title,LocationNormalized,Category,SalaryNormalized,aa,abaqus,aberdeen,aberdeenshire,abergavenny,abi,...,yarmouth,year,yes,york,yorkshire,youd,youll,young,younger,youre
0,Engineering Systems Analyst,Dorking,Engineering Jobs,25000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Stress Engineer Glasgow,Glasgow,Engineering Jobs,30000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Modelling and simulation analyst,Hampshire,Engineering Jobs,30000,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Engineering Systems Analyst / Mathematical Mod...,Surrey,Engineering Jobs,27500,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Pioneer, Miser Engineering Systems Analyst",Surrey,Engineering Jobs,25000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [195]:
# use get dummies to turn the category columns into number columns
encoded_df = pd.get_dummies(combined_df)
encoded_df.head()

Unnamed: 0,SalaryNormalized,aa,abaqus,aberdeen,aberdeenshire,abergavenny,abi,ability,able,absence,...,Category_HR & Recruitment Jobs,Category_Healthcare & Nursing Jobs,Category_Hospitality & Catering Jobs,Category_IT Jobs,Category_Manufacturing Jobs,Category_Other/General Jobs,Category_Sales Jobs,Category_Teaching Jobs,Category_Trade & Construction Jobs,Category_Travel Jobs
0,25000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30000,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,27500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,25000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [196]:
len(encoded_df)

1001

In [197]:
# split the dataset into X and y
X = encoded_df.drop(columns = ["SalaryNormalized"])
y = encoded_df["SalaryNormalized"].values.reshape(-1,1)

In [198]:
# look at the shape of each data set
X.shape

(1001, 4195)

In [199]:
y.shape

(1001, 1)

In [200]:
# lets import train test split to split the data up
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   random_state=78)

In [201]:
# import the scaler
scaler = StandardScaler()

In [202]:
# scale the training data
scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [203]:
# import PCA for demionality reduction, we have alot of columns so lets condense them down
from sklearn.decomposition import PCA

In [204]:
# set the parameters for the number of components
pca = PCA(n_components=4)