In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [7]:
def cleanUp(data):

    print("\nBeginning Cleanup...")
    #Imputing data    
    data["Age"].fillna(round(data["Age"].mean()),inplace=True)                              #replacing missing values for integer columns
    data["Year of Record"].fillna(data["Year of Record"].mode()[0],inplace=True)    
    data["Size of City"].fillna(round(data["Size of City"].mean()),inplace=True)    
    
    data["Gender"].fillna(data["Gender"].mode()[0],inplace=True)                            #replacing missing values for string columns
    data["Country"].fillna(data["Country"].mode()[0],inplace=True)
    data["University Degree"].fillna(data["University Degree"].mode()[0],inplace=True)
    data["Hair Color"].fillna(data["Hair Color"].mode()[0],inplace=True)    
    data[['Profession']]=data[['Profession']].fillna(value='9999')                          #replacing profession missing values with 9999
    
    #data["Work Experience in Current Job [years]"].fillna(data["Work Experience in Current Job [years]"].mean(),inplace=True) 
    #data['Work Experience in Current Job [years]']=data['Work Experience in Current Job [years]'].astype(str)
    data['Housing Situation']=data['Housing Situation'].astype(str)
    data['Work Experience in Current Job [years]'].fillna(round(data['Work Experience in Current Job [years]'].mean()),inplace=True)
    data['Housing Situation']=data['Housing Situation'].replace('0','zero')
    data['Yearly Income in addition to Salary (e.g. Rental Income)'] = data['Yearly Income in addition to Salary (e.g. Rental Income)'].str.replace(r' EUR$', '')
    data['Satisfation with employer'].fillna(data['Satisfation with employer'].mode()[0],inplace=True)
    data['Yearly Income in addition to Salary (e.g. Rental Income)']=data['Yearly Income in addition to Salary (e.g. Rental Income)'].astype(float)
    print("\nCleanup finished...")    
    return data

def create_cat_con(df,cats,cons,normalize=True):
    for i,cat in enumerate(cats):
        vc = df[cat].value_counts(dropna=False, normalize=normalize).to_dict()
        nm = cat + '_FE_FULL'
        df[nm] = df[cat].map(vc)
        df[nm] = df[nm].astype('float32')
        for j,con in enumerate(cons):
#             print("cat %s con %s"%(cat,con))
            new_col = cat +'_'+ con
            #print('timeblock frequency encoding:', new_col)
            df[new_col] = df[cat].astype(str)+'_'+df[con].astype(str)
            temp_df = df[new_col]
            fq_encode = temp_df.value_counts(normalize=True).to_dict()
            df[new_col] = df[new_col].map(fq_encode)
            df[new_col] = df[new_col]/df[cat+'_FE_FULL']
    return df


In [8]:
train = pd.read_csv("Data/tcd-ml-1920-group-income-train.csv", low_memory=False)

test = pd.read_csv("Data/tcd-ml-1920-group-income-test.csv", low_memory=False)

train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)

In [9]:
data = pd.concat([train,test],ignore_index=True)
data = cleanUp(data)
"""
fill_col_dict = {'Year of Record': 1999.0,
 'Gender':'female',
 'Age': 15,
 'Profession': 'principal administrative associate',
 'University Degree': 0,
 'Hair Color': 'Black'}
for col in fill_col_dict.keys():
    data[col] = data[col].fillna(fill_col_dict[col])
"""    



Beginning Cleanup...

Cleanup finished...


"\nfill_col_dict = {'Year of Record': 1999.0,\n 'Gender':'female',\n 'Age': 15,\n 'Profession': 'principal administrative associate',\n 'University Degree': 0,\n 'Hair Color': 'Black'}\nfor col in fill_col_dict.keys():\n    data[col] = data[col].fillna(fill_col_dict[col])\n"

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361147 entries, 0 to 1361146
Data columns (total 17 columns):
Instance                                                    1361147 non-null int64
Year of Record                                              1361147 non-null float64
Housing Situation                                           1361147 non-null object
Crime Level in the City of Employement                      1361147 non-null int64
Work Experience in Current Job [years]                      1361147 non-null float64
Satisfation with employer                                   1361147 non-null object
Gender                                                      1361147 non-null object
Age                                                         1361147 non-null int64
Country                                                     1361147 non-null object
Size of City                                                1361147 non-null int64
Profession                                       

In [11]:
cats = ['Year of Record', 'Gender', 'Country',
        'Profession', 'University Degree','Wears Glasses',
        'Hair Color','Age','Housing Situation','Satisfation with employer']
#,'Housing Situation','Satisfation with employer']
cons = ['Size of City','Body Height [cm]','Crime Level in the City of Employement','Work Experience in Current Job [years]'
        ,'Yearly Income in addition to Salary (e.g. Rental Income)']
#,'Crime Level in the City of Employement','Work Experience in Current Job [years]','Yearly Income in addition to Salary (e.g. Rental Income)']

data = create_cat_con(data,cats,cons)

for col in train.dtypes[train.dtypes == 'object'].index.tolist():
    feat_le = LabelEncoder()
    feat_le.fit(data[col].unique().astype(str))
    data[col] = feat_le.transform(data[col].astype(str))

del_col = set(['Total Yearly Income [EUR]','Instance'])
features_col =  list(set(data) - del_col)

In [12]:
#data[:991709].to_csv('imputedtrain.csv',index=False)
#data[991709:].to_csv('imputertest.csv',index=False)
X_train,X_test = data[features_col].iloc[:991709],data[features_col].iloc[991709:]
Y_train = data['Total Yearly Income [EUR]'].iloc[:991709]
X_test_id = data['Instance'].iloc[991709:]
x_train,x_val,y_train,y_val = train_test_split(X_train,Y_train,test_size=0.2,random_state=1234)

In [18]:
params = {
          'max_depth': 20,
          'learning_rate': 0.001,
          "boosting": "gbdt",
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
             "device":'gpu',
    "gpuusedp":"true"
         }
trn_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, label=y_val)
# test_data = lgb.Dataset(X_test)
clf = lgb.train(params, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
'done'

Training until validation scores don't improve for 500 rounds
[1000]	training's l1: 35261.6	valid_1's l1: 35267.8
[2000]	training's l1: 22524.7	valid_1's l1: 22534.3
[3000]	training's l1: 17851.3	valid_1's l1: 17872.7
[4000]	training's l1: 15521.7	valid_1's l1: 15577.6
[5000]	training's l1: 14172.5	valid_1's l1: 14250.3
[6000]	training's l1: 13428.7	valid_1's l1: 13524.4
[7000]	training's l1: 12968.9	valid_1's l1: 13082.3
[8000]	training's l1: 12622.3	valid_1's l1: 12753.9
[9000]	training's l1: 12347.4	valid_1's l1: 12493.9
[10000]	training's l1: 12144.6	valid_1's l1: 12305.7
[11000]	training's l1: 11949	valid_1's l1: 12120.8
[12000]	training's l1: 11815.4	valid_1's l1: 11999.6
[13000]	training's l1: 11707.9	valid_1's l1: 11904.1
[14000]	training's l1: 11614.6	valid_1's l1: 11821.9
[15000]	training's l1: 11521.6	valid_1's l1: 11740.3
[16000]	training's l1: 11449.7	valid_1's l1: 11679.6
[17000]	training's l1: 11387.5	valid_1's l1: 11629.2
[18000]	training's l1: 11329.7	valid_1's l1: 115

'done'

In [19]:
from sklearn.metrics import mean_absolute_error
pre_test_lgb = clf.predict(x_val)

val_mae = mean_absolute_error(y_val,pre_test_lgb)
val_mae

10569.476636625492

In [20]:
import matplotlib.pyplot as plt
barWidth=0.25
bars1=y_val.iloc[:10]
print(bars1)
bars2=pd.DataFrame(pre_test_lgb).iloc[:5]
print(bars2)
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]

#plt.bar(r1, bars1, color='#7f6d5f', width=barWidth, edgecolor='white', label='actual')
plt.bar(r2, bars2, color='#557f2d', width=barWidth, edgecolor='white', label='calculated')
plt.show()

71122        530.76
378543      1866.95
139206      2334.03
152073      2218.83
929019    174818.58
56828       2277.67
192404      5547.22
950993    118301.24
551194     89414.48
239981      6544.38
Name: Total Yearly Income [EUR], dtype: float64
               0
0    -173.257400
1    4122.163544
2     863.439286
3    1409.336683
4  252877.123381


<Figure size 640x480 with 1 Axes>

In [21]:
plt.savefig('test.png')

<Figure size 432x288 with 0 Axes>

In [22]:
pre_test_lgbdf=pd.DataFrame(pre_test_lgb)
pre_test_lgbdf.to_csv('resultval.csv')

In [None]:
post_test_lgb = clf.predict(X_test)

sub_df = pd.DataFrame({'Instance':X_test_id,
                       'Income':post_test_lgb})
sub_df.head()

In [None]:
sub_df.to_csv("submission1.csv",index=False)
'done'

In [None]:
# libraries
import numpy as np
import matplotlib.pyplot as plt
 
# set width of bar
barWidth = 0.25
 
# set height of bar
bars1 = [12, 30, 1, 8, 22]
bars2 = [28, 6, 16, 5, 10]
bars3 = [29, 3, 24, 25, 17]
 
# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
 
# Make the plot
plt.bar(r1, bars1, color='#7f6d5f', width=barWidth, edgecolor='white', label='var1')
plt.bar(r2, bars2, color='#557f2d', width=barWidth, edgecolor='white', label='var2')
plt.bar(r3, bars3, color='#2d7f5e', width=barWidth, edgecolor='white', label='var3')
 
# Add xticks on the middle of the group bars
plt.xlabel('group', fontweight='bold')
plt.xticks([r + barWidth for r in range(len(bars1))], ['A', 'B', 'C', 'D', 'E'])
 
# Create legend & Show graphic
plt.legend()
plt.show()
