<a href="https://colab.research.google.com/github/ktlait/Kaggle-Competition/blob/master/Machine_learning_assignement2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neural_network as nnet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

import os
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt



NEURAL_NET = 0
LINEAR_REGRESSION = 1
GRADIENT_BOOST = 2
LASSO_REGRESSION = 3
XGBOOST = 4

# Constant int values representing columns which are associated with
# their value in the COLUMN array
INSTANCE=0
YEAR=1
HOUSING=2
CRIME=3
WORK_EXPERIENCE=4
SATISFACTION=5
GENDER=6
AGE=7
COUNTRY=8
CITY_SIZE=9
PROFESSION=10
DEGREE=11
GLASSES=12
HAIR=13
HEIGHT=14
ADD_INCOME=15
INCOME=16

NUM_PLOT = [CRIME, WORK_EXPERIENCE, CITY_SIZE, ADD_INCOME]
STR_PLOT = [HOUSING, SATISFACTION,
            COUNTRY, PROFESSION, DEGREE]

CLEAN_DATA_DIR = os.path.abspath('data/clean_data')
DATA_DIR = os.path.abspath('data')

TRAINING_DATA = os.path.join(DATA_DIR, "training_data.csv")
TEST_DATA = os.path.join(DATA_DIR, "test_data.csv")

NA_COLUMNS = [YEAR, SATISFACTION, GENDER, COUNTRY, PROFESSION, DEGREE, HOUSING, WORK_EXPERIENCE]
TARGET_COLUMNS = [INCOME, ADD_INCOME]
CATEGORICAL_COLS = [SATISFACTION, GENDER, COUNTRY, PROFESSION, DEGREE, HOUSING]
OH_COLS = [GENDER, DEGREE, SATISFACTION, HOUSING]
ENCODING_COLS = [COUNTRY, PROFESSION]
COLS_TO_TRANSFORM = [INCOME]
LOW_FREQUENCY_THRESHOLD = 0
DROPPED_COLUMNS = [GLASSES, HAIR]


COLUMNS = ['Instance', 'Year', 'Housing', 'Crime','Work Experience', 'Satisfaction',
       'Gender', 'Age', 'Country', 'Size', 'Profession',
       'Degree', 'Glasses', 'Hair', 'Height',
       'Additional Income', 'Income']

MISSING_VALUES = ['#N/A', 'nA', '#NUM!']

INCOME_OUTLIER_THRESHOLD = np.log(4000000)
NUM_FOLDS = 3


def rename_columns(df):
    newnames = {
        'Instance': 'Instance',
        'Year of Record': 'Year',
        'Housing Situation': 'Housing',
        'Crime Level in the City of Employement': 'Crime',
        'Work Experience in Current Job [years]': 'Work Experience',
        'Satisfation with employer': 'Satisfaction',
        'Size of City': 'Size',
        'University Degree': 'Degree',
        'Wears Glasses': 'Glasses',
        'Hair Color': 'Hair',
        'Body Height [cm]': 'Height',
        'Yearly Income in addition to Salary (e.g. Rental Income)': 'Additional Income',
        'Total Yearly Income [EUR]': 'Income'
    }
    df.rename(columns=newnames, inplace=True)
    return df

def get_df_from_csv(filename, training):
    """
    Extracts and cleans the pandas datafram from a csv file
    :param filename: string representation of file name
    :param training: boolean indicating if the data is training or not
    :return: pandas cleaned dataframe
    """
    df = pd.read_csv(TRAINING_DATA, na_values=MISSING_VALUES, low_memory=False)
    df = clean_data(df, training)
    return df


def clean_values(df):
    print("Cleaning values")
    df[COLUMNS[GENDER]] = df[COLUMNS[GENDER]].replace(to_replace ="f", value ="female")
    df[COLUMNS[GENDER]] = df[COLUMNS[GENDER]].replace(to_replace ="m", value ="male")
    df[COLUMNS[GENDER]] = df[COLUMNS[GENDER]].replace(to_replace='0', value='unknown_Gender')
    df[COLUMNS[DEGREE]] = df[COLUMNS[DEGREE]].replace(to_replace='0', value='No')
    df[COLUMNS[HOUSING]] = df[COLUMNS[HOUSING]].replace(to_replace='0', value='unknown_Housing')
    for col in df.columns:
      val = "uknown_" + col
      df[col] = df[col].replace(to_replace='unknown', value=val)
    return df


def oh_encode(df):
    """
    One hot encodes columns in the dataframe
    :param dataframe: string representation of file name
    :return: pandas cleaned dataframe
    """
    for col in OH_COLS:
        df = pd.concat((df.drop(columns=COLUMNS[col]), pd.get_dummies(df[COLUMNS[col]], drop_first=True)), axis=1)
        print("One hot encoding " + COLUMNS[col])
        print(df.shape)
    return df

def clean_str_cols(df, col):
    """
    Replaces the NA values in the categorical columns with 'unknown'
    :param df:
    :param col:
    :return df:

    """
    print("filling unknown " + COLUMNS[col])
    replacement = 'unknown_' + COLUMNS[col]
    df[COLUMNS[col]].fillna(replacement, inplace=True)
    if df[COLUMNS[col]].isnull().values.any():
        print(COLUMNS[col] + " still has nans!")
    return df

def clean_num_cols(df, col):
    """
    Replaces the NA values in the numerical columsn with the mean of the column
    :param df:
    :param col:
    :return df:
    """
    df[COLUMNS[col]].fillna(df[COLUMNS[col]].mean(), inplace=True)
    return df



def create_target_mappings(df, target_column, encoding_columns, mean_smoothing_weight=0.3):
        """
        Creates target mappings for columns in the provided dataframe
        :param df:
        :param target_column:
        :param encoding_columns:
        :param mean_smoothing_weight:
        :return: target_maps
        """
        target_mappings = {}
        mean = df[COLUMNS[target_column]].mean()
        target_mappings[COLUMNS[target_column]] = mean
        for enc_col in encoding_columns:
            agg = df.groupby(COLUMNS[enc_col])[COLUMNS[target_column]].agg(['count', 'mean'])
            counts = agg['count']
            means = agg['mean']

            target_mappings[COLUMNS[enc_col]] = ((counts * means + mean_smoothing_weight * means)/(counts + mean_smoothing_weight))
        return target_mappings

def target_map_columns(df, target_maps, encoding_cols):
    """
    For every target mapping in the provided target maps, it will map the values
    of the corresponding columns in the df to the smoothed mean value
    :param df:
    :param target_maps:
    :return df:
    """
    for col in encoding_cols:
        df[COLUMNS[col]] = df[COLUMNS[col]].map(target_maps[COLUMNS[col]]).fillna(target_maps[COLUMNS[INCOME]])
    return df

def encode_labels(df, encoding_cols):
    """
    Label encodes the categorical cols passed in
    :param df:
    :param encoding_cols:
    :return df:
    """

    for col in encoding_cols:
        label_encoder = LabelEncoder()
        print("Encoding " + COLUMNS[col])
        print(df[COLUMNS[col]].head())
        if df[COLUMNS[col]].isnull().values.any():
            print("Found nulls!")
        df[COLUMNS[col]] = label_encoder.fit_transform(df[COLUMNS[col]])
    return df


def remove_unknowns(df, col, training):
    """
    Removes all the unknowns from training data
    :param df: pandas dataframe
    :param col: column in which to remove rows with unknowns
    :param training: boolean indicating if the data is training or not
    :return: pandas cleaned dataframe
    """
    if training:
        df[COLUMNS[col]].fillna('nan', inplace=True)
        df = df[df[COLUMNS[col]]!='nan']
    else:
        if col in CATEGORICAL_COLS:
            clean_str_cols(df, col)
        else:
            clean_num_cols(df, col)
    return df

def log_transform(df, cols):
    """
    Log transforms a column
    :param df:
    :param col:
    :return: dataframe
    """
    for col in cols:
        df[COLUMNS[col]] = df[COLUMNS[col]].apply(np.log)
    return df

def untransform_col(df, cols):
    """
    Reverse the log transform
    :param df:
    :param col:
    :return: dataframe
    """
    for col in cols:
        df[COLUMNS[col]] = df[COLUMNS[col]].apply(np.exp)
    return df

def remove_outliers(df, training, col):
    """
    Get rid of outliers in the column data
    :param df:
    :param training:
    :param col:
    :return: df
    """
    if training:
        z = np.abs(stats.zscore(df[COLUMNS[col]]))
        df_w_o_outliers = df[(z < 5)]
        return df_w_o_outliers
    else:
        return df

def gradient_boosted_target_estimator(df):
    pass

def convert_sparse_values(df, threshold, cols, replacement='other'):
    """
    Take a list of categorical columns in which to replace sparse values to
    be represented as 'other'
    :param df: dataframe
    :param threshold: threshold value
    :param cols: list of cols
    :param replacement: value to convert sparse values to
    :return: df
    """
    for col in cols:
        counts = df[COLUMNS[col]].value_counts()
        sparse_val_indeces = counts[counts <= threshold].index
        df[COLUMNS[col]] = df[COLUMNS[col]].replace(sparse_val_indeces, replacement)
    return df



#-----------------------Algorithms----------------------------------------
def linear_regression(x, y, x_test, y_test, final):
    """
    Basic linear regression algorithm
    :param x:
    :param y:
    :param x_test:
    :param y_test:
    :param final:
    :return y_pred:
    """
    model = LinearRegression()
    print("Creating model and fitting ...")
    model.fit(x, y)
    print("Created model, predicting ...")

    y_pred = model.predict(final)
    return y_pred




def gradient_boost(x, y, x_test, y_test, submission_data):
    """
    Create model using sklearn Gradient Boost Regressor.
    :param x: training feature set
    :param y: training target set
    :param x_test: testing set from training data
    :param y_test: testing target set from training data
    :param submission_data: final test data set
    :param iter_decreasing_change: number of iterations where MAE decreasing
    :return y_pred: predicted targets
    """
    lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
    best_model = ""
    min_score = 999999999999.9
    for learning_rate in lr_list:
        print("Learning rate: ", learning_rate)
        n_estimators = 800
        max_depth = 3
        gb_clf = GradientBoostingRegressor(n_estimators=n_estimators,
                    learning_rate=learning_rate, min_samples_split=20,
                    max_depth=max_depth, random_state=0)
        gb_clf.fit(x, y.ravel())
        mae = mean_absolute_error(gb_clf.predict(x_test), y_test)
        print("Mean Absolute Error (validation): {0:.3f}".format(mae))
        if mae < min_score:
            best_model = gb_clf
            min_score = mae
        else:
            iter_decreasing_change -= 1
            if iter_decreasing_change==0:
                break
    return best_model.predict(submission_data)

def lasso_regression(x_train, y_train, x_test, y_test, submission_data):
    """
    Runs a lasso regression model on the input data
    :param x_train:
    :param y_train:
    :param x_test:
    :param y_test:
    :param submission_data:
    :return y_pred:
    """

    lasso = Lasso()
    #parameters = {'alpha':[1e-15, 1e-10, a1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
    #lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_absolute_error', cv=5)
    #lasso_regressor.fit(x_train, y_train)
    lasso.fit(x_train, y_train)
    print(lasso.score(x_test, y_test))

    return lasso.predict(submission_data)

#-----------------------Helpers-------------------------------------------
def create_n_net():
    return nnet.MLPRegressor(
        hidden_layer_sizes= (100,100,100,100,100),
        max_iter= 6000,
        tol=0.0000005,
        n_iter_no_change=15,
        warm_start=False,
        early_stopping=True,
        learning_rate="adaptive",
        learning_rate_init=0.000005)




def scale_model(x_train, x_test, submission_data):
    """
    Scales data with minmaxscaler. Used for NN
    :param x_train:
    :param x_test:
    :param submission_data:
    :return x_train, x_test, submission_data:

    """
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.fit_transform(x_test)
    submission_data = scaler.fit_transform(submission_data)
    return x_train, x_test, submission_data


# prepend_drive = 'content/drive/My Drive/Colab Notebooks/data/'
  
# df = pd.read_csv(prepend_drive + 'training_data.csv', na_values=MISSING_VALUES, low_memory=False)
# tf = pd.read_csv(prepend_drive + 'test_data.csv', na_values=MISSING_VALUES, low_memory=False)
# df, tf = clean_data(df, tf)
# model = train.split_and_train(df, TARGET_COLUMNS[0], 0.2, LASSO_REGRESSION, tf)

Create DFs from the files



In [0]:
import io
from google.colab import drive, files
drive.mount('/content/gdrive')
#uploaded = files.upload()
!ls "/content/gdrive/My Drive/"
training_file = pd.read_csv("/content/gdrive/My Drive/training_data.csv", na_values=MISSING_VALUES, low_memory=False)


training_file.shape

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
'Alex Dimiziani.gdoc'
 Doc.gdoc
'Economics presentation.gslides'
'encrypted_Bank statement.pdf'
'encrypted_Colman Kinane'
'encrypted_Colman Kinane (1).png'
'encrypted_Colman Kinane.jpg'
 encrypted_Colman_Kinane.jpg.jpg
'encrypted_Colman Kinane.png'
 encrypted_contract2.pdf
 encrypted_contract.pdf
 encrypted_error_test.png
 encrypted_example_file_name.jpg.jpg
 encrypted_example.png
 encrypted_Kate.jpg
 encrypted_Kate.png
 encrypted_sinead.png
'encrypted_small_city (1).png'
 encrypted_small_city2
 encrypted_small_city.pdf
 encrypted_small_city.png
 encrypted_test1.png
 encrypted_test2.png
 encrypted_test3.png
 encrypted_vetting.pdf
'Getting started.pdf'
 HubSpot.gdoc
'Imagery in King Lear.gdoc'
 IMG_1185.MOV
 Interview.gdoc
'Interview Transcript.gdoc'
'Izzy Wheels.gdoc'
'my Experiments.gsheet'
'New List of Parts.gdoc'
 Phoebe.gdoc
'Scientific Approach to Runn

(1048574, 17)

In [0]:


test_file = pd.read_csv("/content/gdrive/My Drive/test_data.csv", na_values=MISSING_VALUES, low_memory=False)



In [0]:
def clean_data(df, test):
    """
    Cleans the dataframe
    :param filename: string representation of file name
    :param training: boolean indicating if the data is training or not
    :return: pandas cleaned dataframe
    """
    print(df.shape)
    df['train'] = 1
    test['train'] = 0

    df = rename_columns(df)
    test = rename_columns(test)

    for col in DROPPED_COLUMNS:
        df = df.drop(COLUMNS[col], axis=1)
        test = test.drop(COLUMNS[col], axis=1)

    for col in NA_COLUMNS:
        df = remove_unknowns(df, col, True)
        test = remove_unknowns(test, col, False)

    total = pd.concat([df, test])
    total = clean_values(total)
    convert_sparse_values(total, LOW_FREQUENCY_THRESHOLD, CATEGORICAL_COLS)
    total = oh_encode(total)
    # Convert the additional income column to be numeric
    total[COLUMNS[ADD_INCOME]] = total[COLUMNS[ADD_INCOME]].str.split(" ", n=1, expand=True)[0]
    total[COLUMNS[ADD_INCOME]] = pd.to_numeric(total[COLUMNS[ADD_INCOME]])

    df = total[total['train']==1]
    test = total[total['train']==0]

    df = df.drop(['train'], axis=1)
    test = test.drop(['train'], axis=1)


    print("Mapping targets")
    df = remove_outliers(df, True, INCOME)
    target_maps = create_target_mappings(df, INCOME, ENCODING_COLS)

    df = target_map_columns(df, target_maps, ENCODING_COLS)
    test = target_map_columns(test, target_maps, ENCODING_COLS)

    # Make sure they have the same number of columns
    print(df.shape)
    print(test.shape)
    return df, test


In [0]:
df, tf = clean_data(training_file, test_file)
tf.describe()

(1048574, 17)
filling unknown Satisfaction
filling unknown Gender
filling unknown Country
filling unknown Profession
filling unknown Degree
filling unknown Housing


  result = method(y)


Cleaning values
One hot encoding Gender
(1131917, 19)
One hot encoding Degree
(1131917, 22)
One hot encoding Satisfaction
(1131917, 25)
One hot encoding Housing
(1131917, 31)
Mapping targets
(758464, 30)
(369438, 30)


Unnamed: 0,Instance,Year,Crime,Work Experience,Age,Country,Size,Profession,Height,Additional Income,Income,male,other,uknown_Gender,unknown_Gender,Master,No,PhD,unknown_Degree,Happy,Somewhat Happy,Unhappy,unknown_Satisfaction,Large Apartment,Large House,Medium Apartment,Medium House,Small Apartment,Small House,unknown_Housing
count,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,0.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0,369438.0
mean,184719.5,1979.511986,76.227822,16.136541,37.317133,75716.214369,829871.3,73369.972106,175.215446,6708.241283,,0.381585,0.240062,0.060744,0.077539,0.239632,0.244436,0.059918,0.077788,0.335052,0.147524,0.01714,0.036358,0.125277,0.124267,0.099096,0.125009,0.007384,0.124716,0.268722
std,106647.708712,23.050585,46.800595,5.575693,15.980679,24749.491934,2118961.0,8623.417382,19.951533,24048.123229,,0.485776,0.427121,0.23886,0.267446,0.426859,0.429753,0.237335,0.267839,0.472009,0.354628,0.129791,0.187179,0.331033,0.329887,0.298792,0.330729,0.085614,0.330398,0.443295
min,1.0,1940.0,0.0,1.4,14.0,26053.575,22.0,21676.333438,87.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,92360.25,1960.0,36.0,12.0,24.0,62048.43056,72774.0,67748.567735,160.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,184719.5,1979.511986,75.0,15.0,35.0,63224.065645,502965.5,73271.255425,174.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,277078.75,1999.0,114.0,20.0,48.0,79599.824278,1183434.0,78352.66008,190.0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,369438.0,2019.0,204.0,47.0,126.0,298698.567143,49970780.0,182189.6676,272.0,162007.99,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [0]:
def format_to_csv(col_two, col_two_name, y_pred, filename, y_name='Total Yearly Income [EUR]'):
    """
    Generates file with predicted values
    :param instance_col: array of instance values
    :param y_pred: array of predicted values
    :return:
    """
    filename += ".csv"
    predicted = pd.DataFrame({col_two_name: col_two, y_name: y_pred})
    predicted.to_csv(filename, header=[col_two_name, y_name], index=False)
    print('Created csv!')

def neural_net(x, y, x_test, y_test, submission_data):
    """
    Create model using sklearn Neural Net Regressor
    :param x: training feature set from training data
    :param y: training target set from training data
    :param x_test: testing feature set from training data
    :param y_test: testing target set from training data
    :param submission_data: testing data set
    :return y_pred: predicted target values
    """
    model_net = create_n_net()
    print("Fitting model ...")
    model_net.fit(x, y)
    print("Model fitted ...")
    print("Model training score: {0:.3f}".format(model_net.score(x, y)))
    print("Model testing score: {0:.3f}".format(model_net.score(x_test, y_test)))
    test_pred = model_net.predict(x_test)
    format_to_csv(y_test, 'Actual', test_pred, 'test_predictions', 'Test Predictions')
    return model_net.predict(submission_data)

def split_and_train(df, target_col, split, algorithm, tf, file_end):
    """
    Splits the data into training and test data based on the split provided
    and calls the training algorithm to create a model
    :param df: cleaned dataframe
    :param split: split represented as a decimal
    :param algorithm: index representing which algorithm to use
    :return model: to be used for further testing
    """
    print("Splitting data ...")
    scaler = MinMaxScaler()
    x = df.drop([COLUMNS[target_col], COLUMNS[INSTANCE]], axis=1)
    scaler.fit(x)
    x = scaler.transform(x)
    y = df[COLUMNS[target_col]]
    submission_data = tf.drop([COLUMNS[target_col], COLUMNS[INSTANCE]], axis=1)
    submission_data = scaler.transform(submission_data)
    print("Submission data shape: " + str(submission_data.shape))
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split)
    print("X train: " + str(x_train.shape))
    print("X test: " + str(x_test.shape))
    model = ""
    print("Running training algorithm ...")
    if algorithm==NEURAL_NET:
      x_train, x_test, submission_data = scale_model(x_train, x_test, submission_data)
      model = neural_net(x_train, y_train, x_test, y_test, submission_data)
    elif algorithm==LINEAR_REGRESSION:
      model = linear_regression(x_train, y_train, x_test, y_test, submission_data)
    elif algorithm==GRADIENT_BOOST:
      model = gradient_boost(x_train, y_train, x_test, y_test, submission_data, 2)
    elif algorithm==LASSO_REGRESSION:
      model = lasso_regression(x_train, y_train, x_test, y_test, submission_data)
    elif algorithm==XGBOOST:
      model = XGBoostTrain(x, y, submission_data)
    elif algorithm==RANDOM_FOREST:
      model = random_forest(x_train, y_train, x, y, submission_data)

    model = pd.DataFrame({'Income': model.flatten()})
    print("Shape of final prediction: " + str(model.shape))
    # Untransform the data
    filename = "submission_" + str(file_end)
    format_to_csv(tf[COLUMNS[INSTANCE]], COLUMNS[INSTANCE], model['Income'], filename)

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
RANDOM_FOREST = 5

target_col = INCOME
scaler = MinMaxScaler()
x = df.drop([COLUMNS[target_col], COLUMNS[INSTANCE]], axis=1)
scaler.fit(x)
x = scaler.transform(x)
y = df[COLUMNS[target_col]]
submission_data = tf.drop([COLUMNS[target_col], COLUMNS[INSTANCE]], axis=1)
submission_data = scaler.transform(submission_data)
print("Submission data shape: " + str(submission_data.shape))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print("X train: " + str(x_train.shape))
feature_list = list(df.columns)
print(feature_list)
rf = RandomForestRegressor(n_estimators = 170, criterion="mae", random_state = 42)
rf.fit(x_train, y_train)
predictions = rf.predict(x_test)
errors = abs(predictions - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
  #important_indices = [feature_list.index('temp_1'), feature_list.index('average')]
  #train_important = train_features[:, important_indices]
  #test_important = test_features[:, important_indices]


Submission data shape: (369438, 28)
X train: (606771, 28)
['Instance', 'Year', 'Crime', 'Work Experience', 'Age', 'Country', 'Size', 'Profession', 'Height', 'Additional Income', 'Income', 'male', 'other', 'uknown_Gender', 'unknown_Gender', 'Master', 'No', 'PhD', 'unknown_Degree', 'Happy', 'Somewhat Happy', 'Unhappy', 'unknown_Satisfaction', 'Large Apartment', 'Large House', 'Medium Apartment', 'Medium House', 'Small Apartment', 'Small House', 'unknown_Housing']


In [0]:
y_pred = rf.predict(submission_data)

In [0]:
model = pd.DataFrame({'Income': y_pred.flatten()})
format_to_csv(tf[COLUMNS[INSTANCE]], COLUMNS[INSTANCE], model['Income'], "prediction")

Created csv!


In [0]:
files.download('prediction.csv')

In [0]:
XGBOOST=4
import xgboost as xgb

def XGBoostTrain(x, y, tf):
  xgtrain = xgb.DMatrix(data=x.values,label=y.values)
  xgtest = xgb.DMatrix(tf.values)

  X_train, X_test, y_train, y_test = train_test_split(x.values, y.values, test_size=0.2, random_state=123)
  xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.5, 
                            max_depth = 6, n_estimators = 500, 
                            subsample=0.8)
  xg_reg.fit(X_train, y_train)
  preds = xg_reg.predict(X_test)
  print("Model training score: {0:.3f}".format(mean_absolute_error(y_test, preds)))
  print(tf.shape)
  y_pred = xg_reg.predict(tf.values)
  print(y_pred.shape)

  # params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
  #               'max_depth': 5, 'alpha': 10}

  # cv_results = xgb.cv(dtrain=xgtrain, params=params, nfold=3, 
  #                     num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)
  # print((cv_results["test-rmse-mean"]).tail(1))

  xgb.plot_importance(xg_reg)
  plt.rcParams['figure.figsize'] = [5, 5]
  plt.show()
  return y_pred

split_and_train(df, TARGET_COLUMNS[0], 0.2, RANDOM_FOREST, tf, 4)



Splitting data ...
Submission data shape: (369438, 30)
X train: (606771, 30)
X test: (151693, 30)
Running training algorithm ...
['Instance', 'Year', 'Crime', 'Work Experience', 'Age', 'Country', 'Size', 'Profession', 'Height', 'Additional Income', 'Income', 'female', 'male', 'other', 'unknown', 'unknown_gender', 'Master', 'No', 'PhD', 'Unknown_Degree', 'Happy', 'Somewhat Happy', 'Unhappy', 'Unknown_Satisfaction', 'Large Apartment', 'Large House', 'Medium Apartment', 'Medium House', 'Small Apartment', 'Small House', 'Unknown_Housing', 'unknown_housing']


In [0]:
files.download('submission_4.csv')
