In [164]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import re
from scipy.sparse import hstack

from sklearn.linear_model import Ridge

In [173]:
# Read the training set and the small test set from the shared data directory.
train, test = map(
    pd.read_csv,
    ('../../../data/salary-train.csv', '../../../data/salary-test-mini.csv'),
)

In [174]:
# Show the first five training rows as a quick check of the loaded columns.
train.head(n=5)

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [175]:
# Show the leading test rows (at most five) to compare their layout with the training frame.
test.head(n=5)

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [176]:
# Frequency of each contract type; missing values are not counted here.
train['ContractTime'].value_counts()

permanent    37169
contract      7249
Name: ContractTime, dtype: int64

In [177]:
# Number of missing values per column of the training frame.
train.isnull().sum()

FullDescription           0
LocationNormalized        0
ContractTime          15582
SalaryNormalized          0
dtype: int64

## Preprocessing

In [178]:
# Separate the regression target from the feature columns.
# The column-presence guard and `errors='ignore'` make this cell safe to re-run:
# on a second execution the column is already gone, so `train`, `test` and the
# previously extracted `y_train` are simply left as they are instead of raising
# a KeyError.
TARGET = 'SalaryNormalized'
if TARGET in train.columns:
    y_train = train[TARGET]
    train = train.drop(columns=TARGET)
# The target is not a feature, so it is removed from the test frame as well
# (it is blank in the rows shown by `test.head()`).
test = test.drop(columns=TARGET, errors='ignore')

In [182]:
# Replace missing categorical values with the literal category 'nan'.
# - Assign the result back to the column instead of calling
#   `fillna(..., inplace=True)` on `train.ContractTime`: that is a chained
#   in-place update on a column selection, which pandas' Copy-on-Write mode
#   does not propagate back to the parent frame.
# - Apply the same fill to `test` and to both categorical columns, because the
#   later cells call string methods on every value of these columns in both
#   frames and a float NaN would make them fail.
for frame in (train, test):
    for col in ('LocationNormalized', 'ContractTime'):
        frame[col] = frame[col].fillna('nan')

In [183]:
def normalize_text_columns(frame):
    """Return a copy of ``frame`` with its text columns normalised.

    * ``LocationNormalized`` and ``ContractTime``: missing values are replaced
      by the string ``'nan'`` and the text is lower-cased.
    * ``FullDescription``: the text is lower-cased and every character that is
      not a letter or a digit is replaced by a single space.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the three columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified, so the cell can be re-run.
    """
    cleaned = frame.copy()
    # Fill *before* lower-casing and assign the result back to the column:
    # a chained `fillna(..., inplace=True)` on a column selection is not
    # guaranteed to reach the parent frame, and a float NaN has no `.lower()`.
    for col in ('LocationNormalized', 'ContractTime'):
        cleaned[col] = cleaned[col].fillna('nan').str.lower()
    cleaned['FullDescription'] = (
        cleaned['FullDescription']
        .str.lower()
        .str.replace('[^a-zA-Z0-9]', ' ', regex=True)
    )
    return cleaned


# Apply the identical normalisation to both frames so that the encoders fitted
# on `train` see the same text conventions in `test`.
train = normalize_text_columns(train)
test = normalize_text_columns(test)

In [184]:
# Encode the two categorical columns: the vectoriser is fitted on the training
# records and then reused unchanged for the test records.
categorical_cols = ['LocationNormalized', 'ContractTime']
train_records = train[categorical_cols].to_dict(orient='records')
test_records = test[categorical_cols].to_dict(orient='records')

enc = DictVectorizer()
X_train_enc = enc.fit_transform(train_records)
X_test_enc = enc.transform(test_records)

In [185]:
# Fit the TF-IDF vocabulary on the training descriptions only, then reuse the
# fitted vectoriser so that the test rows are mapped into the same columns.
train_descriptions, test_descriptions = train['FullDescription'], test['FullDescription']

tfidf = TfidfVectorizer(min_df=5)
train_tfidf_features = tfidf.fit_transform(train_descriptions)
test_tfidf_features = tfidf.transform(test_descriptions)

In [186]:
# Put the TF-IDF block and the encoded categorical block side by side, keeping
# the same block order for train and test so the columns line up.
train_blocks = (train_tfidf_features, X_train_enc)
test_blocks = (test_tfidf_features, X_test_enc)
X_train = hstack(train_blocks)
X_test = hstack(test_blocks)

In [187]:
# Sanity check: both matrices should report the same number of columns.
(X_train.shape, X_test.shape)

((60000, 24627), (2, 24627))

In [188]:
# Ridge regression settings gathered in one place; the seed is pinned so that
# repeated runs use the same configuration.
ridge_params = {'alpha': 1, 'random_state': 241}
ridge = Ridge(**ridge_params)

In [190]:
%%time
ridge.fit(X_train, y_train)

CPU times: user 6.93 s, sys: 768 ms, total: 7.7 s
Wall time: 3.15 s


Ridge(alpha=1, random_state=241)

In [191]:
# Predicted salaries, one value per row of X_test.
ridge.predict(X=X_test)

array([56563.70916865, 37141.85292874])