In [3]:
import os
from math import sqrt

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
def cleanUp(data):
    """Impute missing values and fix column types of the income data set.

    The frame is modified in place and the same object is returned, so both
    ``cleanUp(df)`` and ``df = cleanUp(df)`` work.

    Rules applied:
      * 'Age', 'Size of City', 'Work Experience in Current Job [years]':
        missing values become the rounded column mean.
      * 'Year of Record', 'Gender', 'Country', 'University Degree',
        'Hair Color', 'Satisfation with employer': missing values become the
        most frequent value of the column.
      * 'Profession': missing values become the placeholder string '9999'.
      * 'Housing Situation': cast to ``str``; the value '0' becomes 'zero'.
      * 'Yearly Income in addition to Salary (e.g. Rental Income)': a trailing
        ' EUR' is stripped and the column is converted to ``float``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all of the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, cleaned.
    """
    print("\nBeginning Cleanup...")

    # Results are assigned back to the column instead of calling
    # ``data[col].fillna(..., inplace=True)``: the in-place form operates on an
    # intermediate object and is a no-op under pandas copy-on-write.
    mean_imputed = (
        "Age",
        "Size of City",
        "Work Experience in Current Job [years]",
    )
    for col in mean_imputed:
        data[col] = data[col].fillna(round(data[col].mean()))

    mode_imputed = (
        "Year of Record",
        "Gender",
        "Country",
        "University Degree",
        "Hair Color",
        "Satisfation with employer",
    )
    for col in mode_imputed:
        data[col] = data[col].fillna(data[col].mode()[0])

    # Missing professions get an explicit placeholder category.
    data["Profession"] = data["Profession"].fillna("9999")

    # The column mixes the number 0 with text labels; make it uniformly text.
    data["Housing Situation"] = data["Housing Situation"].astype(str).replace("0", "zero")

    extra_income_col = "Yearly Income in addition to Salary (e.g. Rental Income)"
    extra_income = data[extra_income_col]
    if not pd.api.types.is_numeric_dtype(extra_income):
        # regex=True is required: since pandas 2.0 the default is a literal
        # match, which would leave the ' EUR' suffix in place.
        extra_income = extra_income.str.replace(r" EUR$", "", regex=True)
    data[extra_income_col] = extra_income.astype(float)

    print("\nCleanup finished...")
    return data

In [5]:
def getDataDummies(data):
    """One-hot encode the categorical columns of ``data``.

    Every column named in ``categorical_cols`` is replaced by one indicator
    column per distinct value; all other columns are passed through. The
    result is the new frame produced by ``pd.get_dummies``.
    """
    categorical_cols = [
        'Profession',
        'Year of Record',
        'Housing Situation',
        'Gender',
        'Country',
        'University Degree',
        'Wears Glasses',
        'Hair Color',
        'Satisfation with employer',
    ]
    encoded = pd.get_dummies(data, columns=categorical_cols)
    return encoded

In [6]:
def equalizeColTrain(data1,data2):
    """Give two frames the same columns in the same order.

    A column found in only one frame is added to the other one filled with 0.
    Those additions are written into the frames that were passed in. The
    second returned frame is a selection of ``data2`` re-ordered to follow the
    column order of ``data1``.

    Returns
    -------
    tuple
        ``(data1, data2_reordered)``.
    """
    # Columns the second frame lacks.
    only_in_first = data1.columns.difference(data2.columns)
    for col in only_in_first:
        data2[col] = 0

    # Columns the first frame lacks.
    only_in_second = data2.columns.difference(data1.columns)
    for col in only_in_second:
        data1[col] = 0

    # Same column sequence in both frames.
    aligned_second = data2[data1.columns]
    return data1, aligned_second

In [7]:

def normaliseData(data, feature, max_value=None, min_value=None):
    """Min-max scale one column of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; the column is overwritten with its scaled
        values.
    feature : str
        Name of the column to scale.
    max_value, min_value : scalar, optional
        Bounds to scale with. When omitted they are taken from the column
        itself. Pass the values returned for the training set to scale
        another frame on the same scale.

    Returns
    -------
    tuple
        ``(data, max_value, min_value)`` with the bounds that were used, so
        they can be handed to ``denormaliseData``.
    """
    if max_value is None:
        max_value = data[feature].max()
    if min_value is None:
        min_value = data[feature].min()

    value_range = max_value - min_value
    if value_range == 0:
        # Constant column: dividing by zero would turn every value into NaN.
        # Scaling by 1 maps the whole column to 0.0 instead.
        value_range = 1

    data[feature] = (data[feature] - min_value) / value_range
    return data, max_value, min_value


In [8]:
def denormaliseData(data,feature,max_value,min_value):
    """Undo min-max scaling of ``data[feature]`` in place and return ``data``.

    ``max_value`` and ``min_value`` are the bounds that were used to scale the
    column.
    """
    value_range = max_value - min_value
    stretched = data[feature] * value_range
    data[feature] = stretched + min_value
    return data
  

In [9]:
def removeRows(data, income_col='Income in EUR'):
    """Drop rows whose income is an outlier or negative.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the income column.
    income_col : str, optional
        Name of the income column. Defaults to 'Income in EUR'; pass the
        label column of the data set in use when it is named differently.

    Returns
    -------
    pandas.DataFrame
        A filtered frame; the input frame is not modified.
    """
    # Remove rows whose income value was reported by detect_outlier().
    # (Equivalent filters on other columns were disabled by the author.)
    outlier_values = detect_outlier(data[income_col])
    data = data[~data[income_col].isin(outlier_values)]

    # Keep only non-negative incomes; rows with a missing income fail this
    # comparison and are dropped as well.
    print("\tRemoving rows with negatives income..")
    data = data[data[income_col] >= 0]

    return data

In [10]:
def detect_outlier(data, threshold=3):
    """Return the values of ``data`` whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : sequence or pandas.Series of numbers
        Values to inspect.
    threshold : float, optional
        Z-score cut-off; defaults to 3.

    Returns
    -------
    list
        The outlying values in their original order. Empty when ``data`` is
        empty or has no spread.
    """
    values = np.asarray(data)
    if values.size == 0:
        return []

    # Mean and standard deviation are taken from the original object, as before.
    mean_1 = np.mean(data)
    std_1 = np.std(data)
    if not std_1 > 0:
        # Zero (constant data) or NaN spread: no z-score can exceed the
        # threshold, and skipping the division avoids a runtime warning.
        return []

    z_scores = (values - mean_1) / std_1
    return values[np.abs(z_scores) > threshold].tolist()


In [13]:
# Folder holding the competition CSV files. Set the INCOME_DATA_DIR
# environment variable to point somewhere else; the default is the folder the
# notebook was originally run from.
DATA_DIR = os.environ.get(
    "INCOME_DATA_DIR",
    "C:\\Users\\SIDDHARTHA\\Dropbox\\Trinity Data Science\\ML\\tcd-ml-comp-201920-income-pred-group",
)
TRAIN_CSV = os.path.join(DATA_DIR, "tcd-ml-1920-group-income-train.csv")
#TEST_CSV = os.path.join(DATA_DIR, "tcd-ml-1920-group-income-test.csv")

data = pd.read_csv(TRAIN_CSV)
#dataPred = pd.read_csv(TEST_CSV)                                                        # prediction set, currently disabled

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Impute missing values in the training frame. cleanUp() writes into the frame
# it is given and returns that same frame, so `data` is cleaned after this line.
print("Cleaning up...")
data=cleanUp(data)                                                                      # impute the training data set
# Disabled by the author: the same clean-up for the prediction set, and the
# outlier / negative-income row removal on the training set.
#dataPred=cleanUp(dataPred)
#print("Removing rows...")
#data=removeRows(data)

Cleaning up...

Beginning Cleanup...

Cleanup finished...


In [15]:
# Scale 'Size of City', log-transform the label and one-hot encode the
# categorical columns of the training frame.
# NOTE(review): this cell overwrites `data` with its own output, so running it
# twice scales and log-transforms the already transformed columns again.
print("Normalising...")
data,maxSizeCityT,minSizeCityT=normaliseData(data,'Size of City')                       # min-max scale 'Size of City'; the bounds are kept for later use
# NOTE(review): np.log yields -inf for 0 and NaN for negative values; the row
# filter that drops negative incomes is disabled in the previous cell.
data['Total Yearly Income [EUR]']=np.log(data['Total Yearly Income [EUR]'])                                     # label is modelled in log space from here on
#dataPred,maxSizeCityP,minSizeCityP=normaliseData(dataPred,'Size of City')               # disabled: scaling of the prediction set
print("Performing one-hot encoding...")
data=getDataDummies(data)                                                               # one-hot encode the training data
#dataPred=getDataDummies(dataPred)                                                       # disabled: encoding of the prediction set
print("One-hot encoding done...")

Normalising...
Performing one-hot encoding...


In [None]:
#print("Equalizing columns...")
#data,dataPred=equalizeColTrain(data,dataPred)                                           #equalizing columns in training and prediction data sets
 
#print ('\tColumns in training :'+str(len(data.columns)))
#print ('\tColumns in prediction :'+str(len(dataPred.columns)))

In [16]:
# Feature matrix: every column except the label and the 'Instance' column.
excluded_cols = ['Total Yearly Income [EUR]', 'Instance']
train_X = data[data.columns.difference(excluded_cols)]
# Label: the (log-transformed) yearly income.
train_y = data['Total Yearly Income [EUR]']
#pred_X=dataPred[dataPred.columns.difference(['Income in EUR','Income','Instance'])]     # disabled: prediction features

print("Splitting into training and validation data")
train_size = 0.70
# Fixed random_state keeps the 70/30 split reproducible between runs.
X, X_test, y, y_test = train_test_split(
    train_X,
    train_y,
    train_size=train_size,
    random_state=42,
)
train_pct = train_size*100
print('Splitting done..\n\tTraining: '+str(train_pct)+"%\n\tValidation: "+str(100-train_pct)+"%")
    

Splitting into training and validation data
Splitting done..
	Training: 70.0%
	Validation: 30.0%


In [None]:
# Gradient-boosted trees (LightGBM) trained on the log-transformed income.
params = {
          'max_depth': 20,
          'learning_rate': 0.001,
          "boosting": "gbdt",
          "bagging_seed": 11,
          "metric": 'mse',
          "verbosity": -1,
         }
trn_data = lgb.Dataset(X, label=y)
val_data = lgb.Dataset(X_test, label=y_test)

# Up to 100000 rounds, stopping once the validation score has not improved
# for 500 rounds; progress is logged every 1000 rounds.
if hasattr(lgb, "log_evaluation"):
    # Newer LightGBM: early stopping and logging are configured through
    # callbacks (the keyword arguments below were removed from lgb.train).
    clf = lgb.train(
        params,
        trn_data,
        100000,
        valid_sets=[trn_data, val_data],
        callbacks=[lgb.early_stopping(500), lgb.log_evaluation(1000)],
    )
else:
    clf = lgb.train(params, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)

pre_val_lgb = clf.predict(X_test)
# Labels and predictions are in log space; exponentiate both so the MAE is
# reported in EUR, on the same scale as the linear-regression error below.
val_mae = mean_absolute_error(np.exp(y_test), np.exp(pre_val_lgb))
print(val_mae)

Training until validation scores don't improve for 500 rounds
[1000]	training's l2: 0.7648	valid_1's l2: 0.765531
[2000]	training's l2: 0.411984	valid_1's l2: 0.413958
[3000]	training's l2: 0.318424	valid_1's l2: 0.320752
[4000]	training's l2: 0.261286	valid_1's l2: 0.263668
[5000]	training's l2: 0.218176	valid_1's l2: 0.220474
[6000]	training's l2: 0.185035	valid_1's l2: 0.187256
[7000]	training's l2: 0.161711	valid_1's l2: 0.163932
[8000]	training's l2: 0.145666	valid_1's l2: 0.147828
[9000]	training's l2: 0.133675	valid_1's l2: 0.135785
[10000]	training's l2: 0.125113	valid_1's l2: 0.127163
[11000]	training's l2: 0.118525	valid_1's l2: 0.12053
[12000]	training's l2: 0.113514	valid_1's l2: 0.115474
[13000]	training's l2: 0.109424	valid_1's l2: 0.111362
[14000]	training's l2: 0.105806	valid_1's l2: 0.107729
[15000]	training's l2: 0.10201	valid_1's l2: 0.103913
[16000]	training's l2: 0.0978548	valid_1's l2: 0.0997527
[17000]	training's l2: 0.0947629	valid_1's l2: 0.0966612
[18000]	trai

In [None]:
# Baseline: linear regression on the same training / validation split.
regressor = LinearRegression().fit(X,y)                                                 # fit() returns the fitted estimator
y_pred = regressor.predict(X_test)                                                      # predictions for the validation features

# Labels and predictions are in log space, so both are exponentiated before
# the error is computed.
mse_validation = mean_squared_error(np.exp(y_test), np.exp(y_pred))
rms = sqrt(mse_validation)                                                              # RMSE on the validation set
#rms = sqrt(mean_squared_error(y_test, y_pred))
print("Error: "+str(rms))

In [None]:
# Predict the prediction-set income with the linear-regression model and write
# it into the 'Income' column of the submission CSV.
# NOTE(review): `pred_X` is only assigned in a commented-out line and `args` is
# not defined in any visible cell, so this cell raises NameError on a fresh
# kernel unless both are defined elsewhere. Confirm before running.
y_pred=regressor.predict(pred_X)                                                        # predictions are in log space
#predinc=pd.DataFrame(np.exp(y_pred))                                                   # disabled: exponentiated predictions in a data frame
#predinc=pd.DataFrame(y_pred)
predsubfile = pd.read_csv(args.outputfile)
predsubfile['Income']=np.exp(y_pred)                                                    # undo the log transform before export
predsubfile.to_csv(args.outputfile,index=False)                                                     # overwrite the same CSV with the predictions filled in
print("Predictions stored at : "+args.outputfile)