In [0]:
print("Imports")
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn import linear_model
from sklearn import neural_network as nnet
from sklearn import svm
!pip install category_encoders
import category_encoders as ce
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D as a3d
from google.colab import files
from datetime import datetime as dt
print(dt.now())

In [0]:
print("Data loads")
# Both CSVs are fetched from the same GitHub repository. The "known" file is
# the one whose Income column is later used as the training target; the
# "unknown" file is the set that predictions are produced for.
training_path = "https://raw.githubusercontent.com/johnsl01/income/master/incomeknown.csv"
test_path = "https://raw.githubusercontent.com/johnsl01/income/master/incomeunknown.csv"
train_df, test_df = (pd.read_csv(csv_url) for csv_url in (training_path, test_path))
print(dt.now())

In [0]:
print("Rename columns to not contain spaces")

# Verbose CSV header -> short identifier that can be used as an attribute
# (e.g. train_df.Year) in the cells that follow.
newnames = dict([
    ("Year of Record", "Year"),
    ("Size of City", "CitySize"),
    ("University Degree", "Degree"),
    ("Wears Glasses", "Glasses"),
    ("Hair Color", "Hair"),
    ("Body Height [cm]", "Height"),
    ("Income in EUR", "Income"),
])

# Apply the identical renaming to both frames so their columns stay aligned.
for frame in (train_df, test_df):
  frame.rename(columns=newnames, inplace=True)
print(dt.now())

In [0]:
print("hot-one encoding gender")
# Missing genders become their own sentinel category "u".
# The result is assigned back to the frame instead of calling
# `train_df.Gender.fillna(..., inplace=True)`: an in-place method on a column
# accessor is chained assignment, which pandas warns about and which does not
# modify the frame once Copy-on-Write is active.
train_df["Gender"] = train_df["Gender"].fillna("u")
test_df["Gender"] = test_df["Gender"].fillna("u")

# cols[k] is the indicator column for the raw value genders[k].
cols = ["GenderFemale", "GenderMale", "GenderOther", "GenderUnknown", "Gender0", "GenderNull"]
genders = ["female", "male", "other", "unknown", "0", "u"]

# One 0/1 indicator column per gender value, built directly from the boolean
# comparison (1 where the row has that value, 0 everywhere else).
# NOTE(review): "GenderOther" is created here but the normalisation cell does
# not copy it into the model matrix - confirm that omission is intentional.
for col, gender in zip(cols, genders):
  train_df[col] = (train_df["Gender"] == gender).astype("int64")
  test_df[col] = (test_df["Gender"] == gender).astype("int64")

#print(test_df["GenderNull"].describe())
print(dt.now())

In [0]:
print("hot-one encoding hair colour")
# Missing hair colours become their own sentinel category "U".
# Assign the filled column back rather than using a chained
# `.Hair.fillna(..., inplace=True)`, which pandas warns about and which is a
# no-op on the frame under Copy-on-Write.
train_df["Hair"] = train_df["Hair"].fillna("U")
test_df["Hair"] = test_df["Hair"].fillna("U")

# cols[k] is the indicator column for the raw value colours[k].
cols = ["HairBlack", "HairBrown", "HairRed", "HairBlonde", "HairUnknown", "Hair0", "HairNull"]
colours = ["Black", "Brown", "Red", "Blonde", "Unknown", "0", "U"]

# One 0/1 indicator column per hair colour, built directly from the boolean
# comparison (1 where the row has that colour, 0 everywhere else).
for col, colour in zip(cols, colours):
  train_df[col] = (train_df["Hair"] == colour).astype("int64")
  test_df[col] = (test_df["Hair"] == colour).astype("int64")

#print(train_df["HairBlack"].describe())
print(dt.now())

In [0]:
print("renaming Profession values")
# Missing professions are folded into a single "Unknown" category so that the
# later per-profession income statistics treat them as one group.
# The filled column is assigned back explicitly; a chained
# `.Profession.fillna(..., inplace=True)` does not update the frame when
# pandas Copy-on-Write is active.
train_df["Profession"] = train_df["Profession"].fillna("Unknown")
test_df["Profession"] = test_df["Profession"].fillna("Unknown")
print(dt.now())

In [0]:
print("categorically encoding degree values")

# Ordinal codes for the degree labels.
# new Degree values
# 4 (was PhD), 3 (was Master), 2 (was Bachelor), 1 (was 0,Null), 0 (was No)
# The key "0" is the *string* found in the raw data, so it cannot collide with
# the integer 0 that "No" is mapped to.
degree_codes = {"PhD": 4, "Master": 3, "Bachelor": 2, "No": 0, "0": 1}

# Replace and fill in one expression and assign the result back. The original
# chained `.Degree.replace(..., inplace=True)` calls are chained assignment,
# which pandas warns about and which leaves the frame untouched under
# Copy-on-Write.
train_df["Degree"] = train_df["Degree"].replace(degree_codes).fillna(1)
test_df["Degree"] = test_df["Degree"].replace(degree_codes).fillna(1)

print(dt.now())

In [0]:
print("replacing Null ages and years with median")
# The fill value for each column is the mean of the train median and the test
# median, so the same constant is used in both frames (note that this lets the
# test set influence how training rows are imputed).

# median age
train_med_age = train_df["Age"].median()
print("train median age = ", train_med_age)
test_med_age = test_df["Age"].median()
print("test median age = ", test_med_age)
mean_of_age_medians = (train_med_age + test_med_age)/2
print("mean = ", mean_of_age_medians)

print("replacing age...")
# Assign the filled column back: a chained `.Age.fillna(..., inplace=True)`
# is not guaranteed to modify the frame (no-op under Copy-on-Write).
train_df["Age"] = train_df["Age"].fillna(mean_of_age_medians)
test_df["Age"] = test_df["Age"].fillna(mean_of_age_medians)

# median year
train_med_yr = train_df["Year"].median()
print("train median year = ", train_med_yr)
test_med_yr = test_df["Year"].median()
print("test median year = ", test_med_yr)
mean_of_yr_medians = (train_med_yr + test_med_yr)/2
print("mean = ", mean_of_yr_medians)

print("replacing year...")
train_df["Year"] = train_df["Year"].fillna(mean_of_yr_medians)
test_df["Year"] = test_df["Year"].fillna(mean_of_yr_medians)

print(dt.now())

In [0]:
print("Capping city size at 2mil")
# only 19 people live in cities above 2mil in train_df

twomil = 2000000
# Use label-based assignment on the frame itself. Writing through
# `train_df["CitySize"].values[...]` mutates whatever array pandas happens to
# hand back, which is not a supported way to update a frame and fails on a
# read-only array when Copy-on-Write is active.
# Rows with a missing CitySize compare False and are left unchanged.
train_df.loc[train_df["CitySize"] > twomil, "CitySize"] = twomil
test_df.loc[test_df["CitySize"] > twomil, "CitySize"] = twomil

print(dt.now())

In [0]:
print("Adding column that labels instances where citysize<=100k")

hunk = 100000
# LivesInTown is 1 when CitySize <= 100k and 0 otherwise.
# Building the flag from the boolean comparison fixes two problems with the
# previous "copy Instance, then overwrite via .values" approach:
#   * rows whose CitySize is missing matched neither `<=` nor `>` and so kept
#     their Instance id as the flag value; they now get 0;
#   * writing through `.values` is not a supported way to update a frame and
#     fails on a read-only array when Copy-on-Write is active.
train_df["LivesInTown"] = (train_df["CitySize"] <= hunk).astype("int64")
test_df["LivesInTown"] = (test_df["CitySize"] <= hunk).astype("int64")

print(dt.now())

In [0]:
print("encoding country data")

overall_md = train_df["Income"].median()
overall_mean = train_df["Income"].mean()

# if not particularly accurate, try using country_count too,
# since smaller sample sizes seem to have much higher wages

# Target-encode Country with two new columns:
#   CountryMed  = (median training income of the country) - (overall median)
#   CountryMean = (mean training income of the country)   - (overall mean)
# Both statistics come from the training frame only and are then looked up
# for the rows of both frames. Countries that appear in Test but not in
# Train, and rows with a missing Country, get 0 (i.e. "no offset from the
# overall training value").
#
# The per-country statistics are computed once with groupby rather than by
# re-scanning the whole frame for every country. The columns are created as
# floats from the start; the previous version created integer 0 columns
# (seeded, for test_df, from train_df["Instance"] by mistake) and then wrote
# floats into them, which pandas flags as an incompatible-dtype assignment.
country_income = train_df.groupby("Country")["Income"]
country_med_diff = country_income.median() - overall_md
country_mean_diff = country_income.mean() - overall_mean

train_df = train_df.assign(
    CountryMed=train_df["Country"].map(country_med_diff).fillna(0.0),
    CountryMean=train_df["Country"].map(country_mean_diff).fillna(0.0),
)
test_df = test_df.assign(
    CountryMed=test_df["Country"].map(country_med_diff).fillna(0.0),
    CountryMean=test_df["Country"].map(country_mean_diff).fillna(0.0),
)
print(dt.now())

In [0]:
print("encoding profession data")

overall_md = train_df["Income"].median()
overall_mean = train_df["Income"].mean()

# if not particularly accurate, try using prof_count too?

# Target-encode Profession with two new columns:
#   ProfMed  = (median training income of the profession) - (overall median)
#   ProfMean = (mean training income of the profession)   - (overall mean)
# Both statistics come from the training frame only and are then looked up
# for the rows of both frames. Professions that appear in Test but not in
# Train get 0 (i.e. "no offset from the overall training value").
#
# The per-profession statistics are computed once with groupby rather than by
# re-scanning the whole frame for every profession. The columns are created
# as floats from the start; the previous version created integer 0 columns
# (seeded, for test_df, from train_df["Instance"] by mistake) and then wrote
# floats into them, which pandas flags as an incompatible-dtype assignment.
prof_income = train_df.groupby("Profession")["Income"]
prof_med_diff = prof_income.median() - overall_md
prof_mean_diff = prof_income.mean() - overall_mean

train_df = train_df.assign(
    ProfMed=train_df["Profession"].map(prof_med_diff).fillna(0.0),
    ProfMean=train_df["Profession"].map(prof_mean_diff).fillna(0.0),
)
test_df = test_df.assign(
    ProfMed=test_df["Profession"].map(prof_med_diff).fillna(0.0),
    ProfMean=test_df["Profession"].map(prof_mean_diff).fillna(0.0),
)
print(dt.now())

In [0]:
print("normalise data")
# Stack train on top of test so min/max scaling uses the range of both sets;
# remember how many rows belong to train so they can be separated afterwards.
split_point = train_df.shape[0]
all_df = pd.concat([train_df, test_df], ignore_index=True)
#print(all_df.describe())

# Columns that get min-max scaled into [0, 1].
numeric_cols = [
    "Year", "Age", "CountryMed",
    "CountryMean", "ProfMed", "ProfMean",
    "CitySize", "Degree", "Height",
]
all_num = all_df[numeric_cols]
col_min = all_num.min()
col_span = all_num.max() - col_min
all_norm = (all_num - col_min) / col_span

#print(all_norm.describe())

# Constant column of ones in first position.
all_norm.insert(0, "Ones", np.ones(len(all_norm)))

# Columns that are copied across unscaled (already 0/1 indicators).
binary_cols = [
    "Glasses", "LivesInTown",
    "GenderMale", "GenderFemale", "GenderUnknown", "Gender0", "GenderNull",
    "HairBlack", "HairBrown", "HairBlonde", "HairRed",
    "HairUnknown", "Hair0", "HairNull",
]
for name in binary_cols:
  all_norm[name] = all_df[name]

# Undo the stacking: the first split_point rows are train, the rest are test.
train_norm = all_norm.iloc[:split_point]
test_norm = all_norm.iloc[split_point:]
#print(train_norm.describe())
#print(test_norm.describe())
print(dt.now())

In [0]:
print("Fitting model and checking score on Known data")
Known = train_norm.copy()
y = train_df["Income"].copy()

# Fixed seed so that weight initialisation and the internal validation split
# used by early_stopping are the same on every Restart & Run All; without it
# each run produces a different model and a different submission.
RANDOM_SEED = 42

#mod = linear_model.BayesianRidge(copy_X=False, n_iter=1000)
# NOTE(review): per the scikit-learn docs, `learning_rate` is only used when
# solver="sgd"; with the default solver it has no effect - confirm intent.
mod = nnet.MLPRegressor(
    warm_start=True, early_stopping=True,
    learning_rate="adaptive", learning_rate_init=0.01,
    max_iter=500, random_state=RANDOM_SEED)
print("model created ", dt.now())
mod.fit(Known, y)
print("model fit ", dt.now())
# Score is computed on the same rows the model was fitted on, so it measures
# fit to the training data rather than generalisation.
print(mod.score(Known, y))
print(dt.now())

In [0]:
print("Predicting unknown data")
Unknown = test_norm.copy()
u_income = mod.predict(Unknown)

# Sanity check: compare the distribution of the predictions with the
# distribution of the known incomes.
u_mean = u_income.mean()
# numpy's std defaults to ddof=0 whereas pandas' Series.std defaults to
# ddof=1; pass ddof=1 so both standard deviations use the same estimator and
# the comparison printed below is like-for-like.
u_std = u_income.std(ddof=1)
k_mean = train_df.Income.mean()
k_std = train_df.Income.std()
print("MEAN: k=", k_mean, "\n      u=",u_mean)
print("STDD: k=", k_std, "\n      u=", u_std)
print(dt.now())

In [0]:
# Pair every test Instance id with its predicted income, write the result to
# submission.csv (header row, no index column) and hand the file to the
# Colab download helper.
submission = pd.DataFrame({
    "Instance": test_df["Instance"].copy(),
    "Income": u_income.copy(),
})
submission.to_csv(r"submission.csv", index=None, header=True)
files.download("submission.csv")

In [0]:
# Disabled experiment: everything below is a bare string literal, so none of
# it is executed. The text holds a hand-written linear model trained by batch
# gradient descent (predict, computeCost, computeGradient, gradientDescent)
# plus one training run over train_norm / train_df["Income"] with
# alpha = 0.73 and 100 iterations.
# NOTE(review): this appears to be superseded by the MLPRegressor cell -
# confirm whether it should be kept or removed.
"""
print("def predict(X, theta)")
def predict(X, theta):
  # takes m by n matrix X as input and returns an m by 1 vector containing the
  # predictions h_theta(x^i) for each row x^i, i=1,...,m in X
  n=len(theta)
  pred = X.iloc[:,0]*theta[0]
  for i in range(1,n):
    newpred = pred + (X.iloc[:,i]*theta[i])
    pred = newpred
  # end for
  return pred
# end predict
print(dt.now())

print("def computeCost(X, y, theta)")
def computeCost(X, y, theta):
  # function calculates the cost J(theta) and return its value
  pred = predict(X, theta)
  diffsq = (pred - y) **2
  cost = (diffsq.sum())/(2*len(y))
  return cost
# end computeCost
print(dt.now())

print("def computeGradient(X, y, theta)")
def computeGradient(X, y, theta):
  # function calulate the gradient of J(theta) and returns its value
  n=len(theta)
  grad = np.zeros(n)
  pred = predict(X, theta)
  diff = pred - y
  for i in range(0,n):
    prod = diff * X.iloc[:,i]
    prodsum = prod.sum()
    grad[i] = prodsum/(len(y))
  return grad
# end computeGradient
print(dt.now())

print("def gradientDescent(X, y, alpha, iters, theta)")
def gradientDescent(X, y, alpha, iters, theta):

  cost = np.zeros(iters)
  for i in range(iters):
    theta = theta - alpha * computeGradient(X,y,theta)
    cost[i] = computeCost(X, y, theta)

  return theta, cost
print(dt.now())

print("Finding Theta [Zero]")
label_vector = train_df["Income"]
# train_norm is the input to which label_vector corresponds
# test_norm is the input for which we are trying to find the labels
(_, cols) = train_norm.shape
theta = np.zeros(cols)
theta[0] = label_vector.mean()
alpha = 0.73
iters = 100
(theta_out, cost) = gradientDescent(train_norm, label_vector, alpha, iters, theta)
plt.plot(range(iters), cost)
print(theta_out)
print(dt.now())
"""

In [0]:
# Disabled experiment: a bare string literal, so none of it is executed.
# The text continues gradient descent from a previous theta_out
# (alpha = 0.6, 50 iterations), predicts labels for test_norm, and sums the
# absolute errors of the first 200 training rows.
# NOTE(review): if this is ever re-enabled it depends on predict,
# gradientDescent and theta_out from the preceding disabled cell, and
# `old_costs = costs` reads `costs` before this cell assigns it, so the first
# run would fail unless `costs` already exists in the kernel.
"""
print("Finding Theta [Non-Zero]")
label_vector = train_df["Income"]
# train_norm is the input to which label_vector corresponds
# test_norm is the input for which we are trying to find the labels
(_, cols) = train_norm.shape
theta = theta_out
alpha = 0.6
iters = 50
(theta_out, cost) = gradientDescent(train_norm, label_vector, alpha, iters, theta)
plt.plot(range(iters), cost)
print(theta_out)
print(dt.now())

labels = predict(test_norm, theta_out)
plt.hist(labels)
#print("compare mean and std dev of known and unknown outputs")
print(dt.now())

y = label_vector.copy()
yhat = predict(train_norm, theta_out)
old_costs = costs
costs = np.empty(200)
for i in range(200):
  costrtsq = ((y[i]-yhat[i])**2)**(0.5)
  #print("",y[i], "-", yhat[i], "=", costrtsq)
  costs[i] = costrtsq
print(costs.sum())
print("diff = ", (old_costs.sum() - costs.sum()))
print(dt.now())
"""

In [0]:
# Disabled scratch cell: a bare string literal, so none of it is executed.
# The text prints the mean and median training income for one hard-coded
# country next to the overall mean and median, followed by that country's
# individual incomes.
"""
# looking at mean/median for different countries
c = "Solomon Islands"
ggg = train_df.loc[(train_df.Country == c)].Income.mean()
print(ggg)
hhh = train_df.loc[(train_df.Country == c)].Income.median()
print(hhh)
print(train_df.Income.mean())
print(train_df.Income.median())
print(train_df.loc[(train_df.Country == c)].Income)
"""