In [127]:
import pandas as pd
import scipy
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [128]:
# Load the training set from the CSV file in the working directory.
data_train = pd.read_csv(filepath_or_buffer='salary-train.csv')
data_train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,"As a result of continued growth, First Class S...",Whitley Bay,contract,26400
59996,PHP / MVC Web Developer MacclesfieldCirca ***...,Macclesfield,permanent,26000
59997,"Staff Nurse, Nursing Home, Baldock White Recru...",Baldock,,24500
59998,This is one of the best agency side opportunit...,The City,permanent,65000


In [129]:
# Load the small test set from the CSV file in the working directory.
data_test = pd.read_csv(filepath_or_buffer='salary-test-mini.csv')
data_test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [130]:
# Normalise the training descriptions in one pass: lowercase the text, then
# turn every character that is not a letter or digit into a space.
data_train['FullDescription'] = (
    data_train['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)
data_train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,as a result of continued growth first class s...,Whitley Bay,contract,26400
59996,php mvc web developer macclesfieldcirca ...,Macclesfield,permanent,26000
59997,staff nurse nursing home baldock white recru...,Baldock,,24500
59998,this is one of the best agency side opportunit...,The City,permanent,65000


In [131]:
# Apply the same text normalisation to the test descriptions: lowercase the
# text, then turn every character that is not a letter or digit into a space.
data_test['FullDescription'] = (
    data_test['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)
data_test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,we currently have a vacancy for an hr project ...,Milton Keynes,contract,
1,a web developer opportunity has arisen with an...,Manchester,permanent,


In [132]:
# Learn the TF-IDF vocabulary on the training descriptions and encode them.
# min_df=5 keeps only terms that occur in at least five documents.
vectorizer = TfidfVectorizer(min_df=5)
X_train_1 = vectorizer.fit_transform(raw_documents=data_train['FullDescription'])
X_train_1

<60000x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 8365759 stored elements in Compressed Sparse Row format>

In [133]:
# Encode the test descriptions with the vocabulary already fitted on the
# training data (transform only, no refit).
X_test_1 = vectorizer.transform(raw_documents=data_test['FullDescription'])
X_test_1

<2x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Row format>

In [134]:
# Replace missing values in the two categorical columns with the literal
# string 'nan' so that every row carries a string category for both fields.
#
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place
# form acts on an intermediate object, emits a FutureWarning on recent pandas
# and leaves the DataFrame untouched when Copy-on-Write is enabled.
# Re-running this cell is harmless (idempotent).
data_train[['LocationNormalized', 'ContractTime']] = (
    data_train[['LocationNormalized', 'ContractTime']].fillna('nan')
)
data_train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,as a result of continued growth first class s...,Whitley Bay,contract,26400
59996,php mvc web developer macclesfieldcirca ...,Macclesfield,permanent,26000
59997,staff nurse nursing home baldock white recru...,Baldock,,24500
59998,this is one of the best agency side opportunit...,The City,permanent,65000


In [135]:
# Replace missing values in the two categorical columns of the test set with
# the literal string 'nan', the same placeholder used for the training set.
#
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place
# form acts on an intermediate object, emits a FutureWarning on recent pandas
# and leaves the DataFrame untouched when Copy-on-Write is enabled.
# Re-running this cell is harmless (idempotent).
data_test[['LocationNormalized', 'ContractTime']] = (
    data_test[['LocationNormalized', 'ContractTime']].fillna('nan')
)
data_test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,we currently have a vacancy for an hr project ...,Milton Keynes,contract,
1,a web developer opportunity has arisen with an...,Manchester,permanent,


In [140]:
# One-hot encode the two categorical columns: each row becomes a
# {column: value} dict, and DictVectorizer learns one feature per pair.
enc = DictVectorizer()
X_train_2 = enc.fit_transform(
    data_train[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)
X_train_2

<60000x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 120000 stored elements in Compressed Sparse Row format>

In [141]:
# Encode the test categoricals with the encoder fitted on the training data
# (transform only, no refit).
X_test_2 = enc.transform(
    data_test[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)
X_test_2

<2x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [144]:
# Build the training design matrix by placing the TF-IDF text features and
# the one-hot categorical features side by side (column-wise stack).
X_train = scipy.sparse.hstack((X_train_1, X_train_2))
X_train

<60000x24627 sparse matrix of type '<class 'numpy.float64'>'
	with 8485759 stored elements in COOrdinate format>

In [145]:
# Build the test design matrix with the same column layout as the training
# matrix: TF-IDF text features followed by one-hot categorical features.
X_test = scipy.sparse.hstack((X_test_1, X_test_2))
X_test

<2x24627 sparse matrix of type '<class 'numpy.float64'>'
	with 304 stored elements in COOrdinate format>

In [146]:
# Regression target: the normalised salary column of the training set.
y_train = data_train.loc[:, 'SalaryNormalized']
y_train

0        33000
1        50000
2        40000
3        22500
4        20355
         ...  
59995    26400
59996    26000
59997    24500
59998    65000
59999    23040
Name: SalaryNormalized, Length: 60000, dtype: int64

In [147]:
# Fit a ridge regression on the stacked sparse features.
# random_state is fixed so that repeated runs use the same seed.
clf = Ridge(random_state=241, alpha=1)
clf.fit(X=X_train, y=y_train)

Ridge(alpha=1, random_state=241)

In [150]:
# Predict salaries for the rows of the test design matrix.
y_pred = clf.predict(X=X_test)

In [151]:
# Write the predictions to q1.txt as a single space-separated line.
# The context manager closes the file even if formatting or writing raises,
# which the manual open()/close() pair did not guarantee.
with open('q1.txt', 'w') as out:
    out.write(' '.join(str(x) for x in y_pred))