In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [9]:
# Load the training set (pandas reads the zipped CSV directly) and report its shape.
train = pd.read_csv(filepath_or_buffer="salary-train.zip")
print("train:", train.shape)

# Load the small test set the fitted model is later asked to predict on.
test = pd.read_csv(filepath_or_buffer="salary-test-mini.csv")
print("test:", test.shape)

# Show the column names, then preview the first two training rows as the cell output.
print("columns:", train.columns)
train.head(n=2)

train: (60000, 4)
test: (2, 4)
columns: Index(['FullDescription', 'LocationNormalized', 'ContractTime',
       'SalaryNormalized'],
      dtype='object')


Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000


In [12]:
# Normalise the text and categorical columns of both frames.
# Each column is re-assigned on the frame itself, so `train` and `test` are
# updated for the cells that follow.
for df in [train, test]:
    # Lower-case the description, then replace every character that is not a
    # letter or digit with a space.
    df['FullDescription'] = df['FullDescription'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex = True)
    # Fill missing categories with the literal string 'nan'.
    # Assign the result back instead of calling
    # `df[col].fillna(..., inplace=True)`: that form operates on a temporary
    # column selection, raises a FutureWarning in pandas 2.x and leaves `df`
    # unchanged under Copy-on-Write.
    df['LocationNormalized'] = df['LocationNormalized'].fillna('nan')
    df['ContractTime'] = df['ContractTime'].fillna('nan')

In [20]:
# Regression targets come straight from the salary column.
y_train = train['SalaryNormalized']
y_test = test['SalaryNormalized']

# One-hot encode the two categorical columns: each row becomes a
# {column: value} dict; the encoder is fitted on train only and reused on test.
enc = DictVectorizer()
X_train_categ = enc.fit_transform(
    train[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)
X_test_categ = enc.transform(
    test[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)

# TF-IDF features of the job descriptions; terms that appear in fewer than
# five training documents are dropped (min_df=5).
tfidf = TfidfVectorizer(min_df=5)
X_train_text = tfidf.fit_transform(train['FullDescription'])
X_test_text = tfidf.transform(test['FullDescription'])

# Final design matrices: text features first, categorical features after.
X_train = hstack((X_train_text, X_train_categ))
X_test = hstack((X_test_text, X_test_categ))

In [21]:
# Ridge regression on the stacked text + categorical features.
model = Ridge(random_state=241, alpha=1)
# `fit` is the last expression, so the cell displays the fitted estimator's repr.
model.fit(X=X_train, y=y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [25]:
# Predict salaries for the test rows and print them rounded to two decimals.
print(
    np.round(
        model.predict(X_test),
        decimals=2,
    )
)

[56555.62 37188.32]
