In [198]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [199]:
data = pd.read_csv('traindataset.csv',memory_map=True) #importing cleaned dataset

In [200]:
categorical_cols = ["Housing Situation","Satisfation with employer",
                    "Gender","Country","Profession","University Degree","Hair Color"]
conti_cols = ["Year of Record","Crime Level in the City of Employement",
              "Work Experience in Current Job [years]","Age","Size of City","Wears Glasses",
              "Body Height [cm]","Yearly Income in addition to Salary (e.g. Rental Income)"]

In [201]:
#this function takes a string column name and returns a list
#containing indices of dataframe that have outliers in that column
#Refer: hs://towardsdatascience.com/5-ways-to-detect-outliers-that-every-data-scientist-should-know-python-code-70a54335a623
def OutlierByColumn(colname):
    rows = data.shape[0]
    col_std = np.std(data[colname])
    col_mean = np.mean(data[colname])
    anomaly_cut_off = col_std * 3
    lower_limit  = col_mean - anomaly_cut_off 
    upper_limit = col_mean + anomaly_cut_off
    anomalies_indices = []
    for i in range(rows):
        ele = data[colname][i]
        if ele > upper_limit or ele < lower_limit:
            anomalies_indices.append(i)
    return anomalies_indices        

In [202]:
from sklearn.covariance import EllipticEnvelope
#https://chrisalbon.com/machine_learning/preprocessing_structured_data/detecting_outliers/
def outlierdetect(colname):
    outlier_detector = EllipticEnvelope(contamination=.52)
    val = data[colname].values
    # Fit detector
    outlier_detector.fit(val.reshape(-1, 1))
    # Predict outliers
    res = outlier_detector.predict(val.reshape(-1, 1))
    return np.where(res == -1)[0]

In [203]:
arr1 = outlierdetect('Work Experience in Current Job [years]')
arr2 = outlierdetect('Age')
arr3 = outlierdetect('Body Height [cm]')
#arr4 = outlierdetect('Yearly Income in addition to Salary (e.g. Rental Income)')
arr5 = outlierdetect('Size of City')
arr6 = outlierdetect('Total Yearly Income [EUR]')

In [204]:
arr4 = OutlierByColumn('Yearly Income in addition to Salary (e.g. Rental Income)')#oulierdetect() was not working for this

In [205]:
data.shape #checking no of rows and no of columns for reference

(991712, 18)

In [206]:
#Union of all lists 
union_index = np.union1d(arr1,arr2)
union_index = np.union1d(union_index,arr3)
union_index = np.union1d(union_index,arr4)
union_index = np.union1d(union_index,arr5)
union_index = np.union1d(union_index,arr6)
len(union_index)

944985

In [207]:
data = data.drop(union_index)

In [208]:
#Read somewhere that binary encoders take less space
import category_encoders as ce
encoder = ce.BinaryEncoder(categorical_cols) 

In [209]:
#encoding categorical columns
encoded = encoder.fit_transform(data[categorical_cols])

In [210]:
y = data['Total Yearly Income [EUR]'].values 

In [211]:
#joining encoded columns with continuous columns
encoded = encoded.join(data[conti_cols])

In [212]:
x= encoded.values
x.shape

(46727, 47)

In [213]:
#I was trying to select K-best features but, as it turns out, it is best to include all features. 
'''from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2
skb = SelectKBest(f_regression, k=47)         HERE K DENOTES NUMBER OF FEATURES I TRIED FROM 10 TILL 47, 
                                              THE MODEL GOT BETTER WITH MORE FEATURES, THE BEST WHEN ALL ARE THERE
skb.fit(x, y)
x = skb.transform(x)''' 


'from sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_selection import f_regression\nfrom sklearn.feature_selection import chi2\nskb = SelectKBest(f_regression, k=47)         HERE K DENOTES NUMBER OF FEATURES I TRIED FROM 10 TILL 47, \n                                              THE MODEL GOT BETTER WITH MORE FEATURES, THE BEST WHEN ALL ARE THERE\nskb.fit(x, y)\nx = skb.transform(x)'

In [214]:
#feature scaling https://medium.com/coinmonks/machine-learning-tutorial-1-preprocessing-d90198e37577
from sklearn import preprocessing
scaler_model = preprocessing.StandardScaler().fit(x)
x = scaler_model.transform(x)

In [215]:
#separating data into train and test
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=0)

In [216]:
#ExtraTreesRegressor
#This will take some time to execute 
from sklearn.ensemble import ExtraTreesRegressor
gbr = ExtraTreesRegressor(n_estimators=25,max_features="sqrt")
gbr.fit(xtrain.astype(int),ytrain.astype(int))
ypred = gbr.predict(xtest.astype(int))
np.sqrt(metrics.mean_squared_error(ytest, ypred))#Root Mean Squared Error

3806.733315882951

In [217]:
ypred[:10]

array([ 1314.12,  8535.96,  3628.64,  3166.4 , 14070.32,  2470.88,
        7891.52,  1332.  ,  7708.  ,  5980.72])

In [218]:
ytest[:10]

array([ 1657.12,  2541.81,  3222.58,  1263.43, 14365.6 ,  1655.97,
        6069.35,  1332.77,  7708.66,  5835.44])