In [18]:
import data_io
from features import FeatureMapper, SimpleTransform
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge

In [3]:
def feature_extractor():
    features = [('FullDescription-Bag of Words', 'FullDescription', CountVectorizer(max_features=100)),
                ('Title-Bag of Words', 'Title', CountVectorizer(max_features=100)),
                ('LocationRaw-Bag of Words', 'LocationRaw', CountVectorizer(max_features=100)),
                ('LocationNormalized-Bag of Words', 'LocationNormalized', CountVectorizer(max_features=100))]
    combined = FeatureMapper(features)
    return combined

In [4]:
def get_pipeline():
    features = feature_extractor()
    steps = [("extract_features", features),
             ("classify", BayesianRidge(alpha_1=1e-06, 
                                        alpha_2=1e-06, 
                                        compute_score=False, 
                                        copy_X=True,
                                        fit_intercept=True, 
                                        lambda_1=1e-06, 
                                        lambda_2=1e-06, 
                                        n_iter=300,
                                        tol=0.001, 
                                        verbose=True))]
    return Pipeline(steps)

In [25]:
print("Reading in the training data")
train = data_io.get_train_df()

print("Extracting features and training model")
classifier = get_pipeline()
classifier.fit(train, train["SalaryNormalized"])

print("Making predictions") 
valid = data_io.get_valid_df()
predictions = classifier.predict(valid)   
predictions = predictions.reshape(len(predictions), 1)

print("Writing predictions to file")
data_io.write_submission(predictions)    

Reading in the training data
Extracting features and training model
Convergence after  10  iterations
Making predictions
Writing predictions to file


In [27]:
#10-fold cross-validation for mean error
scores = cross_val_score(classifier, train, train["SalaryNormalized"], cv=10, scoring='neg_mean_absolute_error')
print(scores)

Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
Convergence after  10  iterations
[-9298.87828195 -9844.93023152 -9382.66027581 -9493.33851091
 -9817.59724675 -9715.86945166 -9594.17281322 -9991.65019999
 -9705.29637546 -9782.10277655]


In [14]:
real = pd.read_csv('realsalary.csv')
result = pd.read_csv('BayesianRidge_pre_rev1.csv')

In [15]:
salary_real = real['SalaryNormalized']
salary_result = result['SalaryNormalized']
n = 0
for i in tqdm(range(0,len(salary_real))):
    n += abs(salary_real[i]-salary_result[i])
n = n/len(salary_real)
print('ME of this model is')
print(n)

100%|█████████████████████████████████████████████████████████████████████████| 36750/36750 [00:01<00:00, 27488.05it/s]


ME of this model is
9641.277253591246
