## Teste Gria - Salary

### Libraries

In [1]:
# Data manipulation
import pandas as pd

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack

# Model
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression

# Accuracy
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error



### Data Loader

In [2]:
train = pd.read_csv("/home/ives/Desktop/Gria /Data/Train_rev1.csv")
test = pd.read_csv("/home/ives/Desktop/Gria /Data/Test_rev1.csv")
rand = pd.read_csv("/home/ives/Desktop/Gria /Data/random_forest_benchmark_test_rev1.csv")

### Data Exploratory

In [None]:
ContractType	ContractTime	Company	Category

In [None]:
Title	

### Data preprocessing:

#### MIssing Values

In [3]:
train.Title.fillna(train.Title.mode()[0], inplace= True)
train.ContractType.fillna(train.ContractType.mode()[0],inplace=True)
train.ContractTime.fillna(train.ContractTime.mode()[0],inplace=True)
train.Company.fillna(train.Company.mode()[0],inplace=True)
train.SourceName.fillna(train.SourceName.mode()[0],inplace=True)

#### Normalize

In [None]:
train['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex = True)



In [None]:
train.columns = train.columns.str.replace('[#,@,&,****,]', '')


In [None]:
train["Title"] = train["Title"].str.lower()
train["FullDescription"]= train["FullDescription"].str.lower()
train["LocationRaw"]= train["LocationRaw"].str.lower()
train["LocationNormalized"]= train["LocationNormalized"].str.lower()
train["Company"]= train["Company"].str.lower()
train["Category"]= train["Category"].str.lower()

In [5]:
def  normalizer(doc):
    dat = ["Title","FullDescription","LocationRaw","LocationNormalized","Company","Category"]
    for i in dat:
        doc[i] = doc[i].str.lower()
        doc[i] = doc[i].str.replace('[#,@,&,****,]', '')
    doc['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex = True)

In [6]:
normalizer(train)

  doc[i] = doc[i].str.replace('[#,@,&,****,]', '')


In [7]:
train

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
0,12612628,engineering systems analyst,engineering systems analyst dorking surrey sal...,dorking surrey surrey,dorking,full_time,permanent,gregory martin international,engineering jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
1,12612830,stress engineer glasgow,stress engineer glasgow salary to we re curr...,glasgow scotland scotland,glasgow,full_time,permanent,gregory martin international,engineering jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
2,12612844,modelling and simulation analyst,mathematical modeller / simulation analyst / o...,hampshire south east south east,hampshire,full_time,permanent,gregory martin international,engineering jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk
3,12613049,engineering systems analyst / mathematical mod...,engineering systems analyst / mathematical mod...,surrey south east south east,surrey,full_time,permanent,gregory martin international,engineering jobs,25000 - 30000/annum 25K-30K negotiable,27500,cv-library.co.uk
4,12613647,pioneer miser engineering systems analyst,pioneer miser engineering systems analyst dor...,surrey south east south east,surrey,full_time,permanent,gregory martin international,engineering jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
...,...,...,...,...,...,...,...,...,...,...,...,...
244763,72705211,teacher of science,position: qualified teacher subject/specialism...,swindon,swindon,full_time,contract,ukstaffsearch,teaching jobs,450 - 500 per week,22800,hays.co.uk
244764,72705212,teacher of business studies and ict,position: qualified teacher or nqt subject/spe...,swindon,swindon,full_time,contract,ukstaffsearch,teaching jobs,450 - 500 per week,22800,hays.co.uk
244765,72705213,english teacher,position: qualified teacher subject/specialism...,swindon,swindon,full_time,contract,ukstaffsearch,teaching jobs,450 - 500 per week,22800,hays.co.uk
244766,72705216,supply teachers,position: qualified teacher subject/specialism...,wiltshire,wiltshire,full_time,contract,ukstaffsearch,teaching jobs,450 to 500 per week,22800,hays.co.uk


## Vectorization:

**Vetorização da coluna FullDescription**

In [8]:
vectorizer = TfidfVectorizer(min_df=5)
X_tfidf = vectorizer.fit_transform(train['FullDescription']) 

In [9]:
X_tfidf 

<244768x46178 sparse matrix of type '<class 'numpy.float64'>'
	with 34338764 stored elements in Compressed Sparse Row format>

**Vetorização das demais variáveis**

In [10]:
enc = DictVectorizer()
X_train_categ = enc.fit_transform(train[['Title','LocationRaw','LocationNormalized','ContractType', 'ContractTime','Company','Category','SourceName']].to_dict('records'))
x_train_vec = hstack([X_tfidf,X_train_categ])


### Model

In [11]:
y_true = rand["SalaryNormalized"]

### Dados de Teste

In [15]:
test.Title.fillna(test.Title.mode()[0], inplace= True)
test.ContractType.fillna(test.ContractType.mode()[0],inplace=True)
test.ContractTime.fillna(test.ContractTime.mode()[0],inplace=True)
test.Company.fillna(test.Company.mode()[0],inplace=True)
test.SourceName.fillna(test.SourceName.mode()[0],inplace=True)

In [17]:
normalizer(test)

  doc[i] = doc[i].str.replace('[#,@,&,****,]', '')


In [20]:
X_tfidf_test = vectorizer.transform(test['FullDescription']) 

In [21]:
X_test_categ = enc.transform(test[['Title','LocationRaw','LocationNormalized','ContractType', 'ContractTime','Company','Category','SourceName']].to_dict('records'))
x_test = hstack([X_tfidf_test,X_test_categ])

### Ridge


In [12]:
# Classifier: 
clf = Ridge(alpha=1.0, random_state=241)

# The target value (algorithm has to predict) is SalaryNormalized
y_train = train['SalaryNormalized']

# train model on data
clf.fit(x_train_vec, y_train) 


Ridge(random_state=241)

In [22]:
y_pred_rid = clf.predict(x_test)


In [23]:
y_pred_rid

array([23075.22301196, 29685.23139397, 31307.25653375, ...,
       29846.64158526, 12947.43674796, 29648.06670299])

#### MAE Ridge 

In [24]:
# MAE Ridge
from sklearn.metrics import mean_absolute_error
#mean_absolute_error(y_true, y_pred)
mae_ridg=mean_absolute_error(y_true, y_pred_rid)
mae_ridg

6519.81645616773

### Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB

# 2. instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()

In [26]:
nb.fit(x_train_vec, y_train)

MemoryError: Unable to allocate 15.4 GiB for an array with shape (244768, 8454) and data type int64

In [None]:
y_pred_bay = nb.predict(X_test)


In [None]:
# MAE Naive
from sklearn.metrics import mean_absolute_error

#mean_absolute_error(y_true, y_pred)
mae_bay=mean_absolute_error(y_true, y_pred_bay)
mae_bay

### Ligistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(x_train_vec, y_train)


In [None]:
y_pred_log = nb.predict(X_test)


In [None]:
# MAE Log
from sklearn.metrics import mean_absolute_error

#mean_absolute_error(y_true, y_pred)
mae_log=mean_absolute_error(y_true, y_pred_log)
mae_log

### Pipeline:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

model = Pipeline([
    ('normalizer', TextNormalizer()),
    ('vectorizer', GensimVectorizer()),
    ('bayes', MultinomialNB()),
])