In [12]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [13]:
def cleanUp(data):
    """Impute missing values and normalise column types of the income data.

    The frame is modified in place (columns are reassigned) and also returned.

    Imputation rules:
      * "Age", "Size of City" and "Work Experience in Current Job [years]"
        get the rounded column mean.
      * "Year of Record", "Gender", "Country", "University Degree",
        "Hair Color" and "Satisfation with employer" get the column mode.
      * "Profession" gets the placeholder string '9999'.

    Type normalisation:
      * "Housing Situation" is cast to str and the value '0' becomes 'zero'.
      * "Yearly Income in addition to Salary (e.g. Rental Income)" has a
        trailing ' EUR' removed and is cast to float.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after cleaning.
    """
    print("\nBeginning Cleanup...")

    extra_income = 'Yearly Income in addition to Salary (e.g. Rental Income)'

    # Results are assigned back to the frame instead of calling
    # ``fillna(inplace=True)`` on a column selection: that chained form
    # operates on a temporary object when pandas Copy-on-Write is active and
    # would leave ``data`` unchanged.
    for column in ("Age", "Size of City", 'Work Experience in Current Job [years]'):
        data[column] = data[column].fillna(round(data[column].mean()))

    for column in ("Year of Record", "Gender", "Country", "University Degree",
                   "Hair Color", 'Satisfation with employer'):
        data[column] = data[column].fillna(data[column].mode()[0])

    # Missing professions get a sentinel value instead of a real category.
    data['Profession'] = data['Profession'].fillna('9999')

    data['Housing Situation'] = data['Housing Situation'].astype(str).replace('0', 'zero')

    # ' EUR$' is a regular expression (anchored at the end of the string), so
    # regex=True must be explicit: newer pandas treats the pattern literally by
    # default, which would leave the suffix in place and break astype(float).
    data[extra_income] = (
        data[extra_income].str.replace(r' EUR$', '', regex=True).astype(float)
    )

    print("\nCleanup finished...")
    return data

def frequency_time_blocking(data,categoricals,continuous,normalize=True):
    """Add frequency-encoding features for categorical columns and their pairings.

    For every column ``cat`` in ``categoricals``:
      * ``cat + '_FF'`` holds the frequency of each row's ``cat`` value
        (relative when ``normalize`` is true, absolute counts otherwise;
        missing values are counted as their own value), stored as float32.
      * for every column ``con`` in ``continuous``, ``cat + '_' + con`` holds
        the relative frequency of the string pair "<cat value>_<con value>"
        divided by the row's ``cat + '_FF'`` value.

    The new columns are written into ``data`` itself, which is also returned.
    """
    for category in categoricals:
        freq_col = category + '_FF'
        category_freq = data[category].value_counts(dropna=False, normalize=normalize)
        data[freq_col] = data[category].map(category_freq.to_dict()).astype('float32')

        for measure in continuous:
            pair_col = category + '_' + measure
            # Key identifying the (category value, continuous value) combination.
            pair_key = data[category].astype(str) + '_' + data[measure].astype(str)
            # The pair frequency is always relative, whatever `normalize` says.
            pair_freq = pair_key.value_counts(normalize=True).to_dict()
            data[pair_col] = pair_key.map(pair_freq)
            data[pair_col] = data[pair_col] / data[freq_col]
    return data

def removeRows(data):
    """Drop rows holding an outlier value in selected numeric columns.

    The columns are processed one after another, so each outlier detection
    runs on the rows that survived the previous filters. A row is dropped
    when its value in the current column is one of the values returned by
    ``detect_outlier`` for that column.

    Returns the filtered frame; progress messages are printed per column.
    """
    # Only these columns are filtered. Housing Situation, Satisfation with
    # employer, Work Experience in Current Job [years] and the additional
    # yearly income column are not.
    filtered_columns = (
        'Size of City',
        'Total Yearly Income [EUR]',
        'Age',
        'Body Height [cm]',
        'Crime Level in the City of Employement',
    )

    for column in filtered_columns:
        print('Removing outliers from ' + column)
        flagged_values = detect_outlier(data[column])
        keep_row = ~data[column].isin(flagged_values)
        data = data[keep_row]
        print('Outliers removed from ' + column)

    return data

def detect_outlier(data, threshold=3):
    """Return the values of ``data`` whose absolute z-score exceeds ``threshold``.

    The z-score is ``(value - mean) / std`` with ``np.mean`` and ``np.std``
    evaluated on ``data`` itself (``np.std`` defaults to the population
    standard deviation).

    Parameters
    ----------
    data : sequence, numpy array or pandas Series of numbers
        Values to inspect.
    threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    list
        The outlier values in their original order (duplicates are kept).
        Empty when ``data`` is empty or its standard deviation is zero or
        not finite, because no z-score can be computed in those cases.
    """
    values = np.asarray(data)
    if values.size == 0:
        return []

    # Statistics are taken from `data`, not `values`, so that a pandas Series
    # keeps whatever missing-value handling its own mean/std apply.
    mean_1 = np.mean(data)
    std_1 = np.std(data)
    if not np.isfinite(std_1) or std_1 == 0:
        # Avoids dividing by zero/NaN; no value could exceed the threshold.
        return []

    # Vectorised replacement for a per-element Python loop. NaN entries give
    # NaN z-scores, which compare False and are therefore never reported.
    z_scores = (values - mean_1) / std_1
    return values[np.abs(z_scores) > threshold].tolist()

In [14]:
# NOTE(review): these are absolute, machine-specific Windows paths; the cell
# only runs where the files exist at exactly this location.
# Exact duplicate rows are dropped from both files right after loading.
train = pd.read_csv("C:\\Users\\SIDDHARTHA\\Dropbox\\Trinity Data Science\\ML\\tcd-ml-comp-201920-income-pred-group\\tcd-ml-1920-group-income-train.csv").drop_duplicates()

test = pd.read_csv("C:\\Users\\SIDDHARTHA\\Dropbox\\Trinity Data Science\\ML\\tcd-ml-comp-201920-income-pred-group\\tcd-ml-1920-group-income-test.csv").drop_duplicates()

In [15]:
# Outliers are removed from the training rows only; the test rows are kept
# as loaded.
# NOTE(review): `train` is reassigned here, so running this cell a second time
# filters the already-filtered frame again.
print('Train shape orig: ', train.shape)
train = removeRows(train)
print('Train shape new: ', train.shape)

# Number of training rows: after the concatenation below, positions before
# this index come from `train` and the remaining ones from `test`.
splitter_index = len(train)

# Train and test are stacked so that cleaning is applied to both at once.
data = pd.concat([train, test], ignore_index=True)
print(data.shape)
data = cleanUp(data)

Train shape orig:  (991709, 17)
Removing outliers from Size of City
Outliers removed from Size of City
Removing outliers from Total Yearly Income [EUR]
Outliers removed from Total Yearly Income [EUR]
Removing outliers from Age
Outliers removed from Age
Removing outliers from Body Height [cm]
Outliers removed from Body Height [cm]
Removing outliers from Crime Level in the City of Employement
Outliers removed from Crime Level in the City of Employement
Train shape new:  (958423, 17)
(1327861, 17)

Beginning Cleanup...

Cleanup finished...


In [16]:
# Columns treated as categorical for frequency encoding.
# The comma after 'Age' matters: without it Python joins the adjacent string
# literals into the single, non-existent column name 'AgeHair Color'.
cats = ['Year of Record', 'Gender', 'Country',
        'Profession', 'University Degree','Wears Glasses','Age',
        'Hair Color','Housing Situation','Satisfation with employer']

# Columns paired with every categorical column above.
cons = ['Size of City','Body Height [cm]','Crime Level in the City of Employement','Work Experience in Current Job [years]'
        ,'Yearly Income in addition to Salary (e.g. Rental Income)']

data = frequency_time_blocking(data,cats,cons)

# Label-encode every column that was read as `object` in the raw training
# file, using the values present in the combined train+test frame.
# NOTE(review): the selection uses `train.dtypes`, not `data.dtypes`, so a
# column that cleanUp converted to a numeric type is still encoded through its
# string representation here - confirm this is intended.
for col in train.dtypes[train.dtypes == 'object'].index.tolist():
    feat_le = LabelEncoder()
    feat_le.fit(data[col].unique().astype(str))
    data[col] = feat_le.transform(data[col].astype(str))

# Everything except the target and the row identifier is a model feature.
# The list follows the frame's column order so that the feature order is the
# same on every run (iterating over a set of strings gives no such guarantee).
del_col = set(['Total Yearly Income [EUR]','Instance'])
features_col = [name for name in data.columns if name not in del_col]

In [17]:
# Positions before `splitter_index` are training rows, the rest are test rows.
train_rows = slice(None, splitter_index)
test_rows = slice(splitter_index, None)

X_train = data[features_col].iloc[train_rows]
X_test = data[features_col].iloc[test_rows]

# The model is fitted on the natural log of the income.
Y_train = np.log(data['Total Yearly Income [EUR]'].iloc[train_rows])
X_test_id = data['Instance'].iloc[test_rows]

# Hold out 20% of the training rows for validation (fixed seed).
x_train,x_val,y_train,y_val = train_test_split(X_train,Y_train,test_size=0.2,random_state=42)

In [22]:
# Upper bound on boosting rounds; early stopping normally ends training sooner.
iterations=100000
# Stop when the validation metric has not improved for this many rounds.
stopping_round=500
# Print the evaluation metrics every `log_period` rounds.
log_period=1000
params = {
          'max_depth': 20,
          'learning_rate': 0.001,
          "boosting": "gbdt",
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1
         }
trn_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, label=y_val)

# Early stopping and periodic logging are requested through callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
# are no longer accepted by current LightGBM releases. The logging callback is
# named `log_evaluation` in newer releases and `print_evaluation` in older
# ones, so whichever exists is used.
log_callback = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')
clf = lgb.train(
    params,
    trn_data,
    iterations,
    valid_sets=[trn_data, val_data],
    callbacks=[lgb.early_stopping(stopping_round), log_callback(log_period)],
)

Training until validation scores don't improve for 500 rounds
[1000]	training's l1: 0.635086	valid_1's l1: 0.634495
[2000]	training's l1: 0.352945	valid_1's l1: 0.352668
[3000]	training's l1: 0.261987	valid_1's l1: 0.262202
[4000]	training's l1: 0.225478	valid_1's l1: 0.225975
[5000]	training's l1: 0.205993	valid_1's l1: 0.206647
[6000]	training's l1: 0.19462	valid_1's l1: 0.195416
[7000]	training's l1: 0.18717	valid_1's l1: 0.188037
[8000]	training's l1: 0.181857	valid_1's l1: 0.182761
[9000]	training's l1: 0.17791	valid_1's l1: 0.178851
[10000]	training's l1: 0.174896	valid_1's l1: 0.175885
[11000]	training's l1: 0.17251	valid_1's l1: 0.173556
[12000]	training's l1: 0.17068	valid_1's l1: 0.171773
[13000]	training's l1: 0.169247	valid_1's l1: 0.170382
[14000]	training's l1: 0.167878	valid_1's l1: 0.169045
[15000]	training's l1: 0.166843	valid_1's l1: 0.168047
[16000]	training's l1: 0.165931	valid_1's l1: 0.16718
[17000]	training's l1: 0.165101	valid_1's l1: 0.166402
[18000]	training's

In [23]:
from sklearn.metrics import mean_absolute_error

# Predictions for the held-out validation rows (in log space, like y_val).
pre_test_lgb = clf.predict(x_val)

# Undo the log transform on both sides so the error is reported in the
# original income units.
actual_income = np.exp(y_val)
predicted_income = np.exp(pre_test_lgb)
val_mae = mean_absolute_error(actual_income, predicted_income)
val_mae

8880.67379615857

In [28]:
# Persist the validation predictions (still in log space) for inspection.
pre_test_lgbdf = pd.DataFrame(data=pre_test_lgb)
pre_test_lgbdf.to_csv(path_or_buf='resultval.csv')

In [24]:
# Predict the test rows; the model was fitted on log(income), so these values
# are in log space until np.exp is applied.
post_test_lgb = clf.predict(X_test)

Unnamed: 0,Instance,Income
958423,1,10.40705
958424,2,8.848854
958425,3,8.159023
958426,4,11.215184
958427,5,8.138199


In [26]:
# Submission frame: row identifier plus the prediction converted back from
# log space to income.
submission_columns = {
    'Instance': X_test_id,
    'Income': np.exp(post_test_lgb),
}
sub_df = pd.DataFrame(submission_columns)
sub_df.head()

Unnamed: 0,Instance,Income
958423,1,33092.117573
958424,2,6966.400973
958425,3,3494.771931
958426,4,74249.34235
958427,5,3422.749436


In [27]:
# Write the submission without the frame index; the trailing literal is shown
# as the cell output to signal that the file was written.
sub_df.to_csv(path_or_buf="submission2.csv", index=False)
'done'

'done'

In [None]:
# libraries
import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart built from hard-coded example values.

# Width of each bar; also the horizontal offset between neighbouring series
barWidth = 0.25

# Bar heights: one list per series, one entry per group
bars1 = [12, 30, 1, 8, 22]
bars2 = [28, 6, 16, 5, 10]
bars3 = [29, 3, 24, 25, 17]

# X positions: each series sits one bar width to the right of the previous one
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]

# Draw the three series side by side
series = (
    (r1, bars1, '#7f6d5f', 'var1'),
    (r2, bars2, '#557f2d', 'var2'),
    (r3, bars3, '#2d7f5e', 'var3'),
)
for positions, heights, colour, label in series:
    plt.bar(positions, heights, color=colour, width=barWidth, edgecolor='white', label=label)

# Put the group labels under the middle bar of each group
plt.xlabel('group', fontweight='bold')
tick_positions = [r + barWidth for r in range(len(bars1))]
plt.xticks(tick_positions, ['A', 'B', 'C', 'D', 'E'])

# Create legend & show graphic
plt.legend()
plt.show()
