In [1]:
import data_io
from features import FeatureMapper, SimpleTransform
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [2]:
def feature_extractor():
    """Build the bag-of-words feature mapper for the text columns.

    One ``CountVectorizer(max_features=100)`` is created per column, and each
    is registered under the name ``"<column>-Bag of Words"``.

    Returns:
        A ``FeatureMapper`` built from the list of
        ``(feature name, column name, vectorizer)`` triples.
    """
    text_columns = ('FullDescription', 'Title', 'LocationRaw', 'LocationNormalized')
    # A fresh vectorizer per column: each one is handed to FeatureMapper
    # separately, so they must not share an instance.
    specs = [(column + '-Bag of Words', column, CountVectorizer(max_features=100))
             for column in text_columns]
    return FeatureMapper(specs)

In [3]:
def get_pipeline():
    """Assemble the model pipeline: feature extraction followed by Naive Bayes.

    Returns:
        A ``Pipeline`` with two steps, ``"extract_features"`` (the mapper from
        ``feature_extractor()``) and ``"classify"`` (a ``MultinomialNB`` with
        ``alpha=0.1``, ``class_prior=None`` and ``fit_prior=True``).
    """
    extractor = feature_extractor()
    naive_bayes = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
    return Pipeline([
        ("extract_features", extractor),
        ("classify", naive_bayes),
    ])

In [4]:
# End-to-end run: load training data, fit the pipeline, predict on the
# validation set and hand the predictions to data_io for writing.
print("Reading in the training data")
train = data_io.get_train_df()

print("Extracting features and training model")
classifier = get_pipeline()
# The whole training frame is passed as X; the pipeline's first step
# (the FeatureMapper from feature_extractor) is given the column names to use.
# The target is the "SalaryNormalized" column of that same frame.
classifier.fit(train, train["SalaryNormalized"])

print("Making predictions") 
valid = data_io.get_valid_df()
predictions = classifier.predict(valid)   
# Turn the predictions into a single-column 2-D array (one row per prediction)
# before passing them to write_submission.
predictions = predictions.reshape(len(predictions), 1)
    
print("Writing predictions to file")
# NOTE(review): the output location is decided inside data_io — confirm it is
# the file the evaluation cell reads back.
data_io.write_submission(predictions)

Reading in the training data
Extracting features and training model
Making predictions
Writing predictions to file


In [5]:
# Reference salaries: keep both the full frame and the target column.
real = pd.read_csv('realsalary.csv')
salary_real = real['SalaryNormalized']

# Model output: same treatment, read from the predictions CSV.
result = pd.read_csv('MultinomialNB_pre_rev1.csv')
salary_result = result['SalaryNormalized']

In [9]:
# Mean absolute error between the reference salaries and the predictions.
# Rows are compared positionally (row i of one file against row i of the
# other), so both columns must have exactly the same number of rows.
actual = salary_real.to_numpy()
predicted = salary_result.to_numpy()

if actual.shape != predicted.shape:
    # A prediction file with extra or missing rows would otherwise be scored
    # on a partial or misaligned set of rows.
    raise ValueError(
        "salary_real and salary_result differ in length: "
        "{} vs {}".format(len(actual), len(predicted))
    )
if actual.size == 0:
    raise ValueError("cannot compute an error over zero rows")

# Single vectorized pass instead of a per-element Python loop.
mean_abs_error = np.abs(actual - predicted).mean()
n = mean_abs_error  # previous name of the result, kept for any later cells

# The label is kept as originally printed; the value is the mean absolute error.
print('ME of this model is')
print(n)

100%|█████████████████████████████████████████████████████████████████████████| 36750/36750 [00:01<00:00, 24948.40it/s]


ME of this model is
10729.722421768707
