In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt

from catboost import CatBoostRegressor

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

#import warnings
#warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Locations of the raw train / test CSV files, relative to the notebook.
TRAIN_CSV = 'Data/tcd-ml-1920-group-income-train.csv'
TEST_CSV = 'Data/tcd-ml-1920-group-income-test.csv'

# low_memory=False makes pandas read each file in one pass instead of chunk by chunk.
data = pd.read_csv(TRAIN_CSV, low_memory=False)
dataTest = pd.read_csv(TEST_CSV, low_memory=False)

In [3]:
# Columns removed from both frames before any feature engineering.
del_col_list = ['Instance','Hair Color', 'Wears Glasses']

# Keep a handle on the untouched test frame: its 'Instance' column is needed
# later to label the rows of the submission file.
data_test_original = dataTest
dataTest = dataTest.drop(columns=del_col_list)

data = data.drop(columns=del_col_list)
# Only strictly positive incomes can be log-transformed. The explicit .copy()
# makes the filtered frame independent, so the assignment below does not write
# into a slice of the previous frame (SettingWithCopyWarning).
data = data[data["Income"] > 0].copy()
# The model is trained on log(Income).
data["Income"] = np.log(data["Income"])



In [4]:
# Report how many exact duplicate rows exist, remove them, and report the new size.
print("Old Size : " , data.shape[0])
print("Duplicates: " , data.duplicated().sum())
# Rebind instead of inplace=True: the frame may be a slice of an earlier one,
# and in-place mutation of a slice triggers chained-assignment warnings.
data = data.drop_duplicates()
print("New Size : " , data.shape[0])

Old Size :  1048574
Duplicates:  144555
New Size :  904019


In [5]:
# Fill missing 'Year of Record' values with the most frequent year of the
# frame being filled (train and test each use their own mode).
for frame in (data, dataTest):
    most_common_year = frame["Year of Record"].mode()[0]
    frame["Year of Record"] = frame["Year of Record"].fillna(most_common_year)


In [6]:
# Min-max scale 'Size of City'.
# The bounds come from the training frame and are applied to BOTH frames, so
# that the same raw city size maps to the same scaled value in train and test.
# (Scaling each frame with its own min/max would put them on different scales.)
city_min = data['Size of City'].min()
city_max = data['Size of City'].max()
city_range = city_max - city_min

data['Size of City'] = (data['Size of City'] - city_min) / city_range
dataTest['Size of City'] = (dataTest['Size of City'] - city_min) / city_range

In [7]:
# Imputation shared by the train and test frames.
for frame in (data, dataTest):
    # Gender: missing values and the placeholders '0' / 'unknown' all become 'unknownG'.
    gender = frame["Gender"].fillna("unknownG")
    gender = gender.replace('0', 'unknownG')
    frame["Gender"] = gender.replace('unknown', 'unknownG')

    # University Degree: most frequent value of the frame being filled.
    frame["University Degree"] = frame["University Degree"].fillna(frame["University Degree"].mode()[0])

    # Profession: explicit 'unknownP' bucket.
    frame["Profession"] = frame["Profession"].fillna("unknownP")

# Satisfaction: both frames are filled with the mode of the TRAINING frame.
train_satisfaction_mode = data["Satisfation with employer"].mode()[0]
data['Satisfation with employer'] = data['Satisfation with employer'].fillna(train_satisfaction_mode)
dataTest['Satisfation with employer'] = dataTest['Satisfation with employer'].fillna(train_satisfaction_mode)

# Country: only the test frame is filled here, with its own mode.
dataTest["Country"] = dataTest["Country"].fillna(dataTest["Country"].mode()[0])

In [8]:
# Stack train on top of test with a fresh 0..n-1 index so the derived features
# are computed over both frames at once.
print("Train size: " , len(data) )
dataFull = pd.concat((data, dataTest), ignore_index=True)


Train size:  904019


In [9]:
def create_cat_con(df,cats,cons,normalize=True):
    """Add frequency-encoding columns to ``df`` (modified in place) and return it.

    For every column name ``cat`` in ``cats``:

    * ``<cat>_FE_FULL`` (float32) holds how often the row's ``cat`` value occurs
      in ``df``; NaN counts as a value of its own. It is a relative frequency
      when ``normalize`` is true and a raw count otherwise.
    * for every column name ``con`` in ``cons``, ``<cat>_<con>`` holds the
      relative frequency of the row's (``cat``, ``con``) value pair divided by
      ``<cat>_FE_FULL``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in ``cats`` and ``cons``.
    cats, cons : iterable of str
        Column names to encode and to pair with, respectively.
    normalize : bool, default True
        Passed to ``value_counts`` for the ``_FE_FULL`` columns only; the pair
        frequency is always computed with ``normalize=True``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    for cat in cats:
        full_col = cat + '_FE_FULL'
        cat_freq = df[cat].value_counts(dropna=False, normalize=normalize).to_dict()
        df[full_col] = df[cat].map(cat_freq).astype('float32')

        for con in cons:
            pair_col = cat + '_' + con
            # Both values are stringified so the pair acts as one combined key.
            pair_key = df[cat].astype(str) + '_' + df[con].astype(str)
            pair_freq = pair_key.value_counts(normalize=True).to_dict()
            df[pair_col] = pair_key.map(pair_freq) / df[full_col]
    return df

In [10]:
# Columns that receive a frequency encoding.
cats = ['Year of Record', 'Gender', 'Country',
        'Profession', 'University Degree','Age','Housing Situation','Satisfation with employer']

# Columns paired with every entry of `cats` for the pair-frequency features.
cons = ['Size of City','Body Height [cm]','Crime Level in the City of Employement','Work Experience in Current Job [years]'
        ,'Yearly Income in addition to Salary (e.g. Rental Income)']


dataFull = create_cat_con(dataFull,cats,cons)

# Label-encode every object-typed column of the combined frame.
# The columns are looked up on dataFull itself (not on the pre-concat `data`),
# so a column that is only object-typed after concatenation is encoded too,
# and the cell does not depend on what `data` currently holds.
for col in dataFull.select_dtypes(include='object').columns.tolist():
    feat_le = LabelEncoder()
    # Cast to str first so NaN and mixed-type values get a well-defined label.
    dataFull[col] = feat_le.fit_transform(dataFull[col].astype(str))

In [11]:
# Split the combined frame back into its train and test parts.
# dataFull was built as [train rows, test rows] with a fresh index, so the
# boundary is the number of training rows. It is derived from `data` instead of
# being hard-coded, so the split stays correct if the input changes.
n_train = len(data)
data = dataFull.iloc[:n_train]
dataTest = dataFull.iloc[n_train:]

In [12]:

# Separate the target column from the feature columns.
y = data['Income']
X = data.drop(columns='Income')

# Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Earlier CatBoost experiment, kept for reference only:
# categorical_features_indices = np.where(X.dtypes != np.float)[0]
# model=CatBoostRegressor(iterations=125, depth=6, learning_rate=0.1, loss_function='MAE', use_best_model=True , eval_metric='MAE')
# model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation),plot=True)

In [None]:
# LightGBM settings: gradient-boosted trees evaluated with MAE.
# NOTE(review): "device": 'gpu' presumably needs a GPU-enabled LightGBM build - confirm.
params = {
    'max_depth': 20,
    'learning_rate': 0.001,
    "boosting": "gbdt",
    "bagging_seed": 11,
    "metric": 'mae',
    "verbosity": -1,
    "device": 'gpu',
    "gpu_use_dp": "true",
}
trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_validation, label=y_validation)
# test_data = lgb.Dataset(X_test)

# Up to 100000 boosting rounds, evaluated on both sets, logging every 1000
# rounds and stopping after 500 rounds without improvement.
# NOTE(review): verbose_eval / early_stopping_rounds were removed from
# lgb.train in LightGBM 4.0; switch to the callbacks API if the library is upgraded.
model = lgb.train(
    params,
    trn_data,
    num_boost_round=100000,
    valid_sets=[trn_data, val_data],
    verbose_eval=1000,
    early_stopping_rounds=500,
)
'done'

In [17]:

# Remove the target column from the test rows before predicting.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
dataTest = dataTest.drop(columns='Income', errors='ignore')

In [18]:
# Predict for the test rows and flatten the result to one dimension.
y_pred = model.predict(dataTest).reshape(-1)

In [19]:
# Build the submission: the original instance ids next to the predicted income.
# np.exp is applied to the predictions before they are written out.
submission = pd.DataFrame({
    'Instance': data_test_original['Instance'].to_numpy(),
    'Total Yearly Income [EUR]': np.exp(y_pred),
})
submission.to_csv('result.csv', index=False)

In [12]:
# Disabled experiment: mean-target encoding of several columns (each value is
# replaced by the mean 'Income' of its group in `data`, and the same mapping is
# applied to `dataTest`). The code sits inside a bare triple-quoted string, so
# none of it executes; the cell only evaluates to that string.
'''groupedProf = data.groupby('Profession', as_index=False)['Income'].mean()
data['Profession'] = data['Profession'].map(groupedProf.set_index('Profession')['Income'])

groupedCountry = data.groupby('Country', as_index=False)['Income'].mean()
data['Country'] = data['Country'].map(groupedCountry.set_index('Country')['Income'])

groupedUD = data.groupby('University Degree', as_index=False)['Income'].mean()
data['University Degree'] = data['University Degree'].map(groupedUD.set_index('University Degree')['Income'])

groupedG = data.groupby('Gender', as_index=False)['Income'].mean()
data['Gender'] = data['Gender'].map(groupedG.set_index('Gender')['Income'])

groupedHS = data.groupby('Housing Situation', as_index=False)['Income'].mean()
data['Housing Situation'] = data['Housing Situation'].map(groupedHS.set_index('Housing Situation')['Income'])

groupedWE = data.groupby('Work Experience in Current Job [years]', as_index=False)['Income'].mean()
data['Work Experience in Current Job [years]'] = data['Work Experience in Current Job [years]'].map(groupedWE.set_index('Work Experience in Current Job [years]')['Income'])

groupedSWE = data.groupby('Satisfation with employer', as_index=False)['Income'].mean()
data['Satisfation with employer'] = data['Satisfation with employer'].map(groupedSWE.set_index('Satisfation with employer')['Income'])

groupedYIA = data.groupby('Yearly Income in addition to Salary (e.g. Rental Income)', as_index=False)['Income'].mean()
data['Yearly Income in addition to Salary (e.g. Rental Income)'] = data['Yearly Income in addition to Salary (e.g. Rental Income)'].map(groupedYIA.set_index('Yearly Income in addition to Salary (e.g. Rental Income)')['Income'])






dataTest['Profession'] = dataTest['Profession'].map(groupedProf.set_index('Profession')['Income'])
dataTest['Country'] = dataTest['Country'].map(groupedCountry.set_index('Country')['Income'])
dataTest['University Degree'] = dataTest['University Degree'].map(groupedUD.set_index('University Degree')['Income'])
dataTest['Gender'] = dataTest['Gender'].map(groupedG.set_index('Gender')['Income'])
dataTest['Housing Situation'] = dataTest['Housing Situation'].map(groupedHS.set_index('Housing Situation')['Income'])
dataTest['Work Experience in Current Job [years]'] = dataTest['Work Experience in Current Job [years]'].map(groupedWE.set_index('Work Experience in Current Job [years]')['Income'])
dataTest['Satisfation with employer'] = dataTest['Satisfation with employer'].map(groupedSWE.set_index('Satisfation with employer')['Income'])
dataTest['Yearly Income in addition to Salary (e.g. Rental Income)'] = dataTest['Yearly Income in addition to Salary (e.g. Rental Income)'].map(groupedYIA.set_index('Yearly Income in addition to Salary (e.g. Rental Income)')['Income'])


#dataTest[["Country"]] = dataTest[["Country"]].fillna(value=dataTest["Country"].mean())
#dataTest[["Profession"]] = dataTest[["Profession"]].fillna(value=dataTest["Profession"].mean())'''

'groupedProf = data.groupby(\'Profession\', as_index=False)[\'Income\'].mean()\ndata[\'Profession\'] = data[\'Profession\'].map(groupedProf.set_index(\'Profession\')[\'Income\'])\n\ngroupedCountry = data.groupby(\'Country\', as_index=False)[\'Income\'].mean()\ndata[\'Country\'] = data[\'Country\'].map(groupedCountry.set_index(\'Country\')[\'Income\'])\n\ngroupedUD = data.groupby(\'University Degree\', as_index=False)[\'Income\'].mean()\ndata[\'University Degree\'] = data[\'University Degree\'].map(groupedUD.set_index(\'University Degree\')[\'Income\'])\n\ngroupedG = data.groupby(\'Gender\', as_index=False)[\'Income\'].mean()\ndata[\'Gender\'] = data[\'Gender\'].map(groupedG.set_index(\'Gender\')[\'Income\'])\n\ngroupedHS = data.groupby(\'Housing Situation\', as_index=False)[\'Income\'].mean()\ndata[\'Housing Situation\'] = data[\'Housing Situation\'].map(groupedHS.set_index(\'Housing Situation\')[\'Income\'])\n\ngroupedWE = data.groupby(\'Work Experience in Current Job [years]\', as_in

In [13]:
# Disabled experiment: an 80/20 split on plain NumPy arrays. The code sits
# inside a bare triple-quoted string, so it never runs; the cell only
# evaluates to that string.
'''X = data.drop('Income', axis = 1).values
y = data['Income'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)'''

"X = data.drop('Income', axis = 1).values\ny = data['Income'].values\n\nX_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)"

In [14]:
# Disabled experiment: a LinearRegression baseline with an RMSE computed after
# applying np.exp to targets and predictions. The code sits inside a bare
# triple-quoted string, so it never runs; the cell only evaluates to that string.
'''
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

y_pred = y_pred.reshape(-1)

print("RMSE: " + str(sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))))'''

'\nmodel = LinearRegression()\nmodel.fit(X_train,y_train)\ny_pred = model.predict(X_test)\n\ny_pred = y_pred.reshape(-1)\n\nprint("RMSE: " + str(sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))))'