# Include required libs

from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics
import random
from sklearn import preprocessing
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
%matplotlib inline

# # function to load training and testing data

In [1]:
# function to load training and testing data
def load_train_and_test_data():
    """Read the three competition CSV files from the current working directory.

    Returns:
        A 3-tuple ``(train_features, train_labels, to_predict_features)`` of
        DataFrames read from ``TrainingSetValues.csv``,
        ``TrainingSetLabels.csv`` and ``TestSetValues.csv`` respectively.
        Features and labels are returned separately; they are not merged.
    """
    features_for_training = pd.read_csv('TrainingSetValues.csv', parse_dates=True)
    labels_for_training = pd.read_csv('TrainingSetLabels.csv')
    features_to_predict = pd.read_csv('TestSetValues.csv', parse_dates=True)
    return features_for_training, labels_for_training, features_to_predict

# impute median values for 0 amount_tsh

In [2]:
# impute median values for 0 amount_tsh
def impute_missing_amount_tsh(df):
    """Replace non-positive ``amount_tsh`` values with the median of the positive ones.

    The column is reassigned on ``df`` itself (no chained indexing), so the
    imputation still takes effect when pandas Copy-on-Write is enabled.
    Missing (NaN) entries are left untouched because they are not ``<= 0``.

    Args:
        df: DataFrame with a numeric ``amount_tsh`` column; modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    amount = df['amount_tsh']
    positive_median = np.median(amount[amount > 0])
    df['amount_tsh'] = amount.mask(amount <= 0, positive_median)
    return df

# impute median values for 0 gps height

In [3]:
# impute median values for 0 gps height
def impute_missing_gps_height(df):
    """Replace non-positive ``gps_height`` values with the median of the positive ones.

    The column is reassigned on ``df`` itself (no chained indexing), so the
    imputation still takes effect when pandas Copy-on-Write is enabled.
    Missing (NaN) entries are left untouched because they are not ``<= 0``.

    Args:
        df: DataFrame with a numeric ``gps_height`` column; modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    height = df['gps_height']
    positive_median = np.median(height[height > 0])
    df['gps_height'] = height.mask(height <= 0, positive_median)
    return df

# impute median values for 0 population

In [4]:
# impute median values for 0 population
def impute_missing_population(df):
    """Replace non-positive ``population`` values with the median of the positive ones.

    The column is reassigned on ``df`` itself (no chained indexing), so the
    imputation still takes effect when pandas Copy-on-Write is enabled.
    Missing (NaN) entries are left untouched because they are not ``<= 0``.

    Args:
        df: DataFrame with a numeric ``population`` column; modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    population = df['population']
    positive_median = np.median(population[population > 0])
    df['population'] = population.mask(population <= 0, positive_median)
    return df

# transform population into categories

In [5]:
# transform population into categories
def label_population(row):
    """Map the ``population`` value of one row to a band label.

    Non-positive values give ``'NA'``; the inclusive bands 1-40, 41-67, ...,
    599-1290 give ``'A'`` .. ``'K'``; 1291 and above gives ``'L'``. Values that
    fall between two bands (e.g. 40.5) or are NaN match nothing and give None.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``'population'`` key.

    Returns:
        The band label as a string, or None when no band matches.
    """
    population = row['population']
    if population <= 0:
        return 'NA'

    # (inclusive lower bound, inclusive upper bound, label)
    bands = (
        (1, 40, 'A'),
        (41, 67, 'B'),
        (68, 99, 'C'),
        (100, 131, 'D'),
        (132, 175, 'E'),
        (176, 219, 'F'),
        (220, 259, 'G'),
        (260, 349, 'H'),
        (350, 448, 'I'),
        (449, 598, 'J'),
        (599, 1290, 'K'),
    )
    for lower, upper, label in bands:
        if population >= lower and population <= upper:
            return label

    if population >= 1291:
        return 'L'
    return None

def transform_population_into_categories(df):
    """Add a ``population_cat`` column with the band label of every row.

    Each row is passed to ``label_population``; the resulting labels are
    stored in the new column on ``df`` itself, which is then returned.
    """
    band_per_row = df.apply(label_population, axis=1)
    df['population_cat'] = band_per_row
    return df

# impute median values for construction year

In [6]:
# impute median values for construction year
def impute_missing_construction_year(df):
    """Replace non-positive ``construction_year`` values with the median of the positive ones.

    The column is reassigned on ``df`` itself (no chained indexing), so the
    imputation still takes effect when pandas Copy-on-Write is enabled.
    Missing (NaN) entries are left untouched because they are not ``<= 0``.

    Args:
        df: DataFrame with a numeric ``construction_year`` column; modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    year = df['construction_year']
    positive_median = np.median(year[year > 0])
    df['construction_year'] = year.mask(year <= 0, positive_median)
    return df

# impute missing booleans with false and convert each value to float or integer

In [7]:
# impute missing booleans with false and convert each value to float or integer
def impute_missing_booleans(df, colname):
    """Convert a boolean column with gaps into floats, treating missing as False.

    Missing entries become ``0.0``; every other entry is converted with
    ``float()`` (so ``True`` -> ``1.0`` and ``False`` -> ``0.0``). The column is
    reassigned on ``df`` directly rather than filled through a chained
    ``inplace=True`` call, which would only modify a temporary under pandas
    Copy-on-Write.

    Args:
        df: DataFrame to modify in place.
        colname: name of the boolean-like column to convert.

    Returns:
        The same ``df`` object, for call chaining.
    """
    df[colname] = df[colname].map(
        lambda value: 0.0 if pd.isna(value) else float(value)
    )
    return df

# Since random forest doesn't work on datetime values, we break the recorded date down into month and year.
# Simply converting month and year to numbers doesn't work well either, because that implies a large distance between, for example, January and December, or between 1970 and 2010.
# So it is better to one-hot encode the month and year after extracting them from the date.

In [8]:
# Since random forest doesnt work on datetime, we will break them down to month and year.
# But, also if we simply convert month into numerical values, it doesnt work well because 
# there may be big distance between Jan and December and also between 1970 to  2010, to take an example.
# So its better to one hot encode them after transforming the date to month and year
def transform_date_recorded_to_month_and_year(df):
    """Split ``date_recorded`` into year/month columns and make the date ordinal.

    Adds integer ``year_recorded`` and ``month_recorded`` columns and
    overwrites ``date_recorded`` with the proleptic Gregorian ordinal of each
    date. ``df`` is modified in place and returned.
    """
    recorded = pd.to_datetime(df['date_recorded'])
    df['year_recorded'] = recorded.apply(lambda stamp: stamp.year)
    df['month_recorded'] = recorded.apply(lambda stamp: stamp.month)
    df['date_recorded'] = recorded.apply(lambda stamp: stamp.toordinal())
    return df

# One-hot encode the recorded year and month (first convert the month and year to strings before one-hot encoding).
# Also delete the original columns.
def ohe_month_and_year_recorded(df_train, df_test):
    """One-hot encode the recorded month and year on both frames.

    Both frames are first passed through
    ``transform_date_recorded_to_month_and_year``. For month and year in turn,
    the values are converted to strings and dummy columns are added only for
    values that occur in both train and test, so the two frames end up with
    the same dummy columns. The intermediate ``month_recorded`` /
    ``year_recorded`` columns are removed afterwards.

    Returns:
        ``(df_train, df_test)`` with the dummy columns appended.
    """
    df_train = transform_date_recorded_to_month_and_year(df_train)
    df_test = transform_date_recorded_to_month_and_year(df_test)

    for part in ('month_recorded', 'year_recorded'):
        df_train[part] = df_train[part].apply(lambda raw: str(raw))
        df_test[part] = df_test[part].apply(lambda raw: str(raw))

        # Keep only the levels seen in both frames, in train order of appearance.
        shared_dummy_cols = [
            part + '_' + level
            for level in df_train[part].unique()
            if level in df_test[part].unique()
        ]
        train_dummies = pd.get_dummies(df_train[part], prefix=part)[shared_dummy_cols]
        test_dummies = pd.get_dummies(df_test[part], prefix=part)[shared_dummy_cols]

        df_train = pd.concat((df_train, train_dummies), axis=1)
        df_test = pd.concat((df_test, test_dummies), axis=1)

        del df_test[part]
        del df_train[part]

    return df_train, df_test

# If latitude and/or longitude is set to 0 or 1, it is a junk value: the latitude and longitude of Tanzania don't fall in this range.

In [9]:
# if latitude and/or lngitude is set to 0 or 1, it means that its a junk
# value. The latitude and longitude of Tanzania don't fall in this range
def cleanup_missing_latitude_and_longitude(df):
    """Zero out the partner coordinate wherever one coordinate is a junk 0 or 1.

    If ``longitude`` is 0 or 1 the row's ``latitude`` is set to 0, and then if
    ``latitude`` is 0 or 1 the row's ``longitude`` is set to 0. The original
    code tested ``latitude == 1`` twice and never handled ``longitude == 1``;
    both directions are now symmetric.

    Args:
        df: DataFrame with ``longitude`` and ``latitude`` columns; modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    junk_markers = [0, 1]

    df.loc[df.longitude.isin(junk_markers), 'latitude'] = 0
    # Evaluated after the step above on purpose, so latitudes that were just
    # zeroed also clear their longitude (same cascade as the sequential original).
    df.loc[df.latitude.isin(junk_markers), 'longitude'] = 0

    return df

# helper function to drop columns from a dataframe df is the dataframe from where we need to drop a column cols_to_drop is the list of column names that have to be dropped

In [10]:
# helper function to drop columns from a dataframe
# df is the dataframe from where we need to drop a column
# cols_to_drop is the list of column names that have to be dropped
def drop_columns(df, cols_to_drop):
    """Remove each column named in ``cols_to_drop`` from ``df`` in place.

    Columns are removed one at a time in the given order; a name that is not
    present raises ``KeyError``. Returns the same ``df`` object.
    """
    for unwanted in cols_to_drop:
        df.drop(columns=[unwanted], inplace=True)
    return df

In [11]:
#def locs1(X_train, X_test):
#    """
#    fill in the nulls for ['longitude', 'latitude', 'gps_height', 'population'] by using means from 
#    ['subvillage', 'district_code', 'basin'], and lastly the overall mean
#    """
#    trans = ['longitude', 'latitude', 'gps_height', 'population']
#    for i in [X_train, X_test]:
#        i.loc[i.longitude == 0, 'latitude'] = 0
#    for z in trans:
#        for i in [X_train, X_test]:
#            i[z].replace(0., np.NaN, inplace = True)
#            i[z].replace(1., np.NaN, inplace = True)
#        
#        #for j in ['subvillage', 'district_code', 'basin']:
#        for j in ['subvillage']:
#            X_train['mean'] = X_train.groupby([j])[z].transform('mean')
#            X_train[z] = X_train[z].fillna(X_train['mean'])
#            o = X_train.groupby([j])[z].mean()
#            fill = pd.merge(X_test, pd.DataFrame(o), left_on=[j], right_index=True, how='left').iloc[:,-1]
#            X_test[z] = X_test[z].fillna(fill)
#        
#        X_train[z] = X_train[z].fillna(X_train[z].mean())
#        X_test[z] = X_test[z].fillna(X_train[z].mean())
#        del X_train['mean']
#    return X_train, X_test

In [12]:
#def locs(X_train, X_test):
#    """
#    fill in the nulls for ['longitude', 'latitude', 'gps_height', 'population'] by using means from 
#    ['subvillage', 'district_code', 'basin'], and lastly the overall mean
#    """
#    for i in [X_train, X_test]:
#        i.loc[i.longitude == 0, 'latitude'] = 0
#    for i in [X_train, X_test]:
#        i.loc[i.latitude == 1, 'longitude'] = 0
#
#    for i in [X_train, X_test]:
#        i.loc[i.latitude == 0, 'longitude'] = 0
#    for i in [X_train, X_test]:
#        i.loc[i.latitude == 1, 'longitude'] = 0
#        
#    trans = ['longitude', 'latitude', 'gps_height', 'population']
#    for z in trans:
#        for i in [X_train, X_test]:
#            i[z].replace(0., np.NaN, inplace = True)
#            i[z].replace(1., np.NaN, inplace = True)
#        
#        #for j in ['subvillage', 'district_code', 'basin']:
#        for j in ['subvillage']:
#            X_train['mean'] = X_train.groupby([j])[z].transform('mean')
#            X_train[z] = X_train[z].fillna(X_train['mean'])
#            o = X_train.groupby([j])[z].mean()
#            fill = pd.merge(X_test, pd.DataFrame(o), left_on=[j], right_index=True, how='left').iloc[:,-1]
#            X_test[z] = X_test[z].fillna(fill)
#        
#        X_train[z] = X_train[z].fillna(X_train[z].mean())
#        X_test[z] = X_test[z].fillna(X_train[z].mean())
#        del X_train['mean']
#    return X_train, X_test

In [13]:
#def locs2(df_train, df_test):
#    """
#    fill in the nulls for ['longitude', 'latitude', 'gps_height', 'population'] by using means from 
#    ['subvillage', 'district_code', 'basin'], and lastly the overall mean
##    """
##    for i in [df_train, df_test]:
##        i.loc[i.longitude == 0, 'latitude'] = 0
##    for i in [df_train, df_test]:
##        i.loc[i.latitude == 1, 'longitude'] = 0
##
##    for i in [df_train, df_test]:
##        i.loc[i.latitude == 0, 'longitude'] = 0
##    for i in [df_train, df_test]:
##        i.loc[i.latitude == 1, 'longitude'] = 0
#        
#    trans = ['longitude', 'latitude', 'gps_height', 'population']
#    for z in trans:
#        for i in [df_train, df_test]:
#            i[z].replace(0., np.NaN, inplace = True)
#            i[z].replace(1., np.NaN, inplace = True)
#        
#        #for j in ['subvillage', 'district_code', 'basin']:
#        for j in ['subvillage']:
#            df_train['mean'] = df_train.groupby([j])[z].transform('mean')
#            df_train[z] = df_train[z].fillna(df_train['mean'])
#            o = df_train.groupby([j])[z].mean()
#            fill = pd.merge(df_test, pd.DataFrame(o), left_on=[j], right_index=True, how='left').iloc[:,-1]
#            df_test[z] = df_test[z].fillna(fill)
#        
#        df_train[z] = df_train[z].fillna(df_train[z].mean())
#        df_test[z] = df_test[z].fillna(df_train[z].mean())
#        del df_train['mean']
#    return df_train, df_test

# This function imputes missing/junk/illegal values in numerical columns with the mean of the respective fields grouped by subvillage

In [14]:
# This function imputes missing/junk/illegal values in numerical columns with the mean of the respective 
# fields grouped by subvillage.
def fill_col_vals_with_col_mean_grp_by_subvilage(df_train, df_test, columns):
    """Impute junk values (0 and 1) in numeric columns from training means.

    For every column in ``columns``:

    1. values equal to 0 or 1 are treated as missing in both frames;
    2. missing values are filled with the training mean of the row's
       ``subvillage``;
    3. anything still missing (unknown subvillage, or a subvillage with no
       valid training value) is filled with the overall training mean.

    Only training data is used to compute the fill values. Columns are
    reassigned on the frames directly: no chained ``inplace=True`` calls (which
    only touch a temporary under pandas Copy-on-Write), no ``np.NaN`` alias
    (removed in NumPy 2.0) and no temporary ``'mean'`` column on ``df_train``.

    Args:
        df_train: training frame with a ``subvillage`` column; modified in place.
        df_test: frame to predict on, same layout; modified in place.
        columns: iterable of numeric column names to clean.

    Returns:
        ``(df_train, df_test)``.
    """
    junk_values = [0, 1]
    for column in columns:
        df_train[column] = df_train[column].mask(df_train[column].isin(junk_values))
        df_test[column] = df_test[column].mask(df_test[column].isin(junk_values))

        # Per-subvillage mean of the valid training values; filling a group's
        # gaps with its own mean does not change that mean, so it is computed once.
        subvillage_means = df_train.groupby('subvillage')[column].mean()
        df_train[column] = df_train[column].fillna(
            df_train['subvillage'].map(subvillage_means)
        )
        df_test[column] = df_test[column].fillna(
            df_test['subvillage'].map(subvillage_means)
        )

        # Last resort: the overall training mean.
        df_train[column] = df_train[column].fillna(df_train[column].mean())
        df_test[column] = df_test[column].fillna(df_train[column].mean())
    return df_train, df_test

# There are many columns that have many different values. Since our motive is to one hot encode the categorical columns, we need to reduce the no. of categories for each column. For this purpose we are putting together all the values that have counts less than 100 as "other" category for each column that contains string values

In [15]:
# There are many columns that have many different values. Since our motive is to one hot encode the categorical columns,
# we need to reduce the no. of categories for each column. For this purpose we are putting together all the values that have 
# counts less than 100 as "other" category for each column that contains string values
def shrink_categories_for_columns(X_train, X_test, min_count=100):
    """Collapse rare categories of every string column into ``"other"``.

    A column is treated as categorical when its first training value is a
    ``str``. In the training frame, every value that occurs ``min_count`` times
    or fewer (including missing values) becomes ``"other"``. In the test
    frame, every value that is not one of the surviving training values
    becomes ``"other"``.

    Columns are reassigned on the frames directly instead of through a chained
    ``replace(..., inplace=True)``, which only modifies a temporary under
    pandas Copy-on-Write. The per-value Python loop is replaced by a single
    ``isin``.

    Args:
        X_train: training features; modified in place.
        X_test: features to predict on; modified in place.
        min_count: a training value must occur strictly more often than this
            to be kept. Defaults to 100, the previously hard-coded threshold.

    Returns:
        ``(X_train, X_test)``.
    """
    cols = [name for name in X_train.columns
            if isinstance(X_train[name].iloc[0], str)]
    for column in cols:
        frequency = X_train[column].map(X_train[column].value_counts())
        X_train[column] = X_train[column].where(frequency > min_count, "other")

        kept_values = X_train[column].unique()
        X_test[column] = X_test[column].where(
            X_test[column].isin(kept_values), "other"
        )
    return X_train, X_test

# This function makes use of LDA to reduce the no. of dimensions. We will apply these on population, gps_height, latitude longitude because these have many different values and hence they are perfect candidates

In [16]:
# This function makes use of LDA to reduce the no. of dimensions. We will apply these on population, gps_height, latitude
# longitude because these have many different values and hence they are perfect candidates
def reduce_dimensions_using_lda(X_train, X_test, y_train, cols=('population', 'gps_height', 'latitude', 'longitude')):
    """Replace ``cols`` with their Linear Discriminant Analysis projection.

    The selected columns are standardised (scaler fitted on the training data
    only), projected with an LDA fitted on the training data and labels, and
    the projected components are placed in front of the remaining columns.
    The component columns keep the integer labels ``0, 1, ...``.

    The projected frames are built with the index of their source frame, so
    the column-wise ``concat`` lines rows up even when the inputs do not have a
    default ``0..n-1`` index; previously that produced misaligned rows padded
    with NaN.

    Args:
        X_train: training features.
        X_test: features to predict on.
        y_train: training labels (Series, one-column DataFrame or array).
        cols: names of the numeric columns to project. Any iterable of names.

    Returns:
        ``(X_train, X_test)`` as new DataFrames; the inputs are not modified.
    """
    # A tuple key would be read by pandas as a single (multi-level) label.
    cols = list(cols)

    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train[cols])
    X_test_std = sc.transform(X_test[cols])

    lda = LDA(n_components=None)
    X_train_lda = lda.fit_transform(X_train_std, np.ravel(y_train))
    X_test_lda = lda.transform(X_test_std)

    X_train = pd.concat(
        (pd.DataFrame(X_train_lda, index=X_train.index), X_train.drop(columns=cols)),
        axis=1,
    )
    X_test = pd.concat(
        (pd.DataFrame(X_test_lda, index=X_test.index), X_test.drop(columns=cols)),
        axis=1,
    )
    return X_train, X_test

# function for dummy-encoding i.e. one-hot-encoding of categorical columns

In [17]:
# function for dummy-encoding, i.e. one-hot encoding, of categorical columns
def one_hot_encode(df_train, df_test):
    """One-hot encode every string column, keeping only levels shared by both frames.

    A column is treated as categorical when its first training value is a
    ``str``. Missing training values are labelled ``'NULL'`` first (the test
    frame is not filled). Dummy columns are created only for levels that occur
    in both frames, so train and test end up with identical dummy columns; the
    original categorical columns are removed.

    The ``'NULL'`` fill is assigned back to the column instead of using a
    chained ``fillna(..., inplace=True)``, which only modifies a temporary
    under pandas Copy-on-Write and would leave NaN to break the string
    concatenation below.

    Args:
        df_train: training features; its categorical columns are filled in place.
        df_test: features to predict on.

    Returns:
        ``(df_train, df_test)`` as new frames with the dummy columns appended.
    """
    columns = [name for name in df_train.columns
               if isinstance(df_train[name].iloc[0], str)]
    for column in columns:
        df_train[column] = df_train[column].fillna('NULL')

        # Compute the test levels once rather than once per training level.
        test_levels = set(df_test[column].unique())
        ohe_cols = [column + '_' + level
                    for level in df_train[column].unique()
                    if level in test_levels]

        train_dummies = pd.get_dummies(df_train[column], prefix=column)[ohe_cols]
        test_dummies = pd.get_dummies(df_test[column], prefix=column)[ohe_cols]
        df_train = pd.concat((df_train, train_dummies), axis=1)
        df_test = pd.concat((df_test, test_dummies), axis=1)

        del df_train[column]
        del df_test[column]
    return df_train, df_test

# A quick and dirty data analysis

In [18]:
# A quick and dirty data analysis
def do_some_data_analysis(training_data, training_labels):
    """Print and plot how many pumps fall into each ``status_group``.

    Prints the sorted distinct labels and the row counts for 'functional',
    'functional needs repair' and 'non functional' (in that order), then shows
    a bar chart of those counts. ``training_data`` is accepted but not used.
    """
    status = training_labels['status_group']

    # distinct labels, sorted in place
    unique_labels = status.unique()
    unique_labels.sort()
    print("Unique labels are:", unique_labels)

    # number of rows carrying each of the three known labels
    known_labels = ('functional', 'functional needs repair', 'non functional')
    unique_labels_cnt = [status[status == label].count() for label in known_labels]
    print("count of each type of pump(functional/functional-needs-repair/non functional)")
    print(unique_labels_cnt)

    # bar chart: one bar per distinct label
    bar_positions = np.arange(len(unique_labels))
    plt.bar(bar_positions, unique_labels_cnt, align='center', alpha=0.5)
    plt.xticks(bar_positions, unique_labels)
    plt.ylabel('Count')
    plt.title('Distribution of status of pumps') 
    plt.show()
    
    # year-wise distribution of pumps in different conditions
#   training_data.construction_year=pd.to_numeric(training_data.construction_year)
#   training_data.loc[training_data.construction_year <= 0, training_data.columns=='construction_year'] = 1950
#   hist1=training_data[training_data.status_group == 'functional'].construction_year
#   hist2=training_data[training_data.status_group == 'functional needs repair'].construction_year
#   hist3=training_data[training_data.status_group == 'non functional'].construction_year
#   n,b,p=plt.hist([hist1, hist2, hist3], stacked=True,range=[1950,2010])
#   plt.legend(['functional', 'functional needs repair','non functional'],loc=0)
#   plt.text(1952, 15000,'NO DATA',fontsize=20,rotation=90,color='white')
#   plt.xlabel('Construction Year', fontsize=18)
#   plt.show()

# Helper function for column values analysis

In [19]:
# Helper function for column values analysis
def analyze_unique_values_for_column(df, colname):
    """Print the number of distinct values in ``df[colname]`` between two rules."""
    distinct_total = df[colname].unique().size
    rule = "****************************"
    print(rule)
    print("Unique " + colname + "s:", distinct_total)
    print(rule)

# Helper function for column values analysis. This function helped us in detailed analysis for the purpose of feature engineering and helped us in finding out what function to keep/ what too drop/ what values to put under one single category and so on.

In [20]:
# Helper function for column values analysis. This function helped us in detailed analysis for the purpose of feature
# engineering and helped us in finding out what function to keep/ what too drop/ what values to put under one single category
# and so on.
def analyze_in_detail_unique_values_for_column(df, colname, print_all_counts=False):
    """Print how many distinct values of a column are rare.

    Prints the number of distinct values, then each value that occurs fewer
    than 10 times together with its count, then how many distinct values fall
    in the count bands ``<10``, ``10-19``, ``20-29`` and ``30-49``. A missing
    (NaN) entry among the distinct values is reported with a count of 0.

    Counts come from a single ``value_counts`` pass instead of filtering the
    whole frame once per distinct value. The stray debug print that made every
    rare value appear twice is now opt-in, and the unused ``lessthan100``
    counter is gone.

    Args:
        df: DataFrame to inspect.
        colname: column whose values are analysed.
        print_all_counts: when True, also print the count of every distinct
            value, not only the rare ones.
    """
    unique_col_vals = df[colname].unique()
    counts_by_value = df[colname].value_counts().to_dict()

    tmp_str = "Unique " + colname + "s:"
    print("****************************")
    print(tmp_str, unique_col_vals.size)
    print("****************************")

    lessthan10 = 0
    lessthan20 = 0
    lessthan30 = 0
    lessthan50 = 0
    for val in unique_col_vals:
        # NaN is absent from value_counts, so it falls back to 0.
        cnt = counts_by_value.get(val, 0)
        if print_all_counts:
            print(val, cnt)
        if cnt < 10:
            lessthan10 += 1
            print(val, cnt)
        elif cnt < 20:
            lessthan20 += 1
        elif cnt < 30:
            lessthan30 += 1
        elif cnt < 50:
            lessthan50 += 1

    print("lessthan50: ", lessthan50 )
    print("lessthan30: ", lessthan30 )
    print("lessthan20: ", lessthan20 )
    print("lessthan10: ", lessthan10 )
    print("****************************")

# load the training and testing (to be predicted) data

In [21]:
# load the training and testing(to be predcicted) data
df_training_data, df_training_labels, df_topredict_data = load_train_and_test_data()

NameError: name 'pd' is not defined

# some quick analysis about the no. of functional/needs-repair/non-functional pumps

In [None]:
# some quick analysis about the no. of functional/needs-repair/non-functional pumps
do_some_data_analysis(df_training_data, df_training_labels)

# Keep a copy of the "to be predicted data" from where we can extract the id for the submission file

In [None]:
# Keep an independent copy of the "to be predicted" data so that its id column
# survives the in-place preprocessing below and can be used for the submission file.
df_to_predict_data1 = df_topredict_data.copy()

# Brief Stats of each column

In [None]:
print("*****************************************\nBrief Stats of each column\n*****************************************")
print(df_training_data.describe())

# getting number of nonzeros in each column

In [None]:
#getting number of nonzeros in each column
print("\n*****************************************\nnumber of nonzeros in each column\n*****************************************")
print(df_training_data.astype(bool).sum(axis=0))

# getting no. of nulls in each column

In [None]:
# getting no. of nulls in each column
print("\n*****************************************\nno. of nulls in each column\n*****************************************")
print(df_training_data.isnull().sum())

In [None]:
# function to briefly analyze each column
def briefly_analyze_each_column(df_training_data):
    """Print the distinct-value count of every categorical/location column.

    Calls ``analyze_unique_values_for_column`` once per column. The original
    hand-written call list analysed ``management_group`` three times and
    ``source_class`` twice; each column is now reported exactly once, in the
    original order of first appearance.
    """
    columns_to_analyze = (
        "funder", "installer", "wpt_name", "basin", "subvillage", "region",
        "region_code", "district_code", "lga", "ward", "recorded_by",
        "scheme_management", "scheme_name", "extraction_type",
        "extraction_type_group", "extraction_type_class", "management",
        "management_group", "payment", "payment_type", "water_quality",
        "quality_group", "quantity", "quantity_group", "source", "source_type",
        "source_class", "waterpoint_type", "waterpoint_type_group",
    )
    for colname in columns_to_analyze:
        analyze_unique_values_for_column(df_training_data, colname)


# uncomment the following function if we need to analyze the columns in brief

In [None]:
#briefly_analyze_each_column(df_training_data)

# The next few function calls help us do a detailed analysis of each column

In [None]:
analyze_in_detail_unique_values_for_column(df_training_data, "funder")

In [None]:
analyze_in_detail_unique_values_for_column(df_training_data, "installer")

In [None]:
analyze_in_detail_unique_values_for_column(df_training_data, "wpt_name")

In [None]:
analyze_in_detail_unique_values_for_column(df_training_data, "management")

In [None]:
analyze_in_detail_unique_values_for_column(df_training_data, "management_group")

In [None]:
analyze_in_detail_unique_values_for_column(df_training_data, "extraction_type_class")

In [None]:
# We dont need the id field in trainign labels, delete this
del df_training_labels['id']

# Break down the date_recorded into month and year and one hot encode them
# Since random forest doesnt work on datetime, we will break them down to month and year.
# But, also if we simply convert month into numerical values, it doesnt work well because there may be big distance between Jan and December and also between 1970 to  2010, to take an example.
# So its better to one hot encode them after transforming the date to month and year

In [None]:
# Break down the date_recorded into month and year and one hot encode them
# Since random forest doesnt work on datetime, we will break them down to month and year.
# But, also if we simply convert month into numerical values, it doesnt work well because 
# there may be big distance between Jan and December and also between 1970 to  2010, to take an example.
# So its better to one hot encode them after transforming the date to month and year
df_training_data, df_topredict_data = ohe_month_and_year_recorded(df_training_data, df_topredict_data)

# impute missing construction year with median construction year

In [None]:
# impute missing construction year with median construction year
df_training_data = impute_missing_construction_year(df_training_data)
df_topredict_data = impute_missing_construction_year(df_topredict_data)

# the fields public_meeting and permit are boolean, but there are many missing values (3334 in public_meeting and 3056 in permit) 
# impute these missing values with FALSE

In [None]:
# the fields public_meeting and permit are boolean, but there are many missing values 
# (3334 in public_meeting and 3056 in permit)
# impute these missing values with FALSE
df_training_data = impute_missing_booleans(df_training_data, "public_meeting")
df_topredict_data = impute_missing_booleans(df_topredict_data, "public_meeting")

df_training_data = impute_missing_booleans(df_training_data, "permit")
df_topredict_data = impute_missing_booleans(df_topredict_data, "permit")

# if latitude and/or lngitude is set to 0 or 1, it means that its a junk value. The latitude and longitude of Tanzania don't fall in this range

In [None]:
# if latitude and/or lngitude is set to 0 or 1, it means that its a junk
# value. The latitude and longitude of Tanzania don't fall in this range
df_training_data = cleanup_missing_latitude_and_longitude(df_training_data)
df_topredict_data = cleanup_missing_latitude_and_longitude(df_topredict_data)

# fill in the nulls for ['longitude', 'latitude', 'gps_height'] by using the mean of the respective columns
# grouped by subvillage; subvillage has the highest granularity compared to region_code, district_code, ward, basin etc.
# We could also use the overalll mean, but more granularity is better
location_columns_to_clean = ['longitude', 'latitude', 'gps_height']
df_training_data, df_topredict_data = fill_col_vals_with_col_mean_grp_by_subvilage(df_training_data, df_topredict_data, location_columns_to_clean)

# If the population field contains 0 or 1, it indicates a junk value.
# Fill these with the mean population of the respective subvillage and then log-transform the column. The log transformation gave no improvement in accuracy, but it did no harm either.

In [None]:
location_columns_to_clean = ['population']
df_training_data, df_topredict_data = fill_col_vals_with_col_mean_grp_by_subvilage(df_training_data, df_topredict_data, location_columns_to_clean)
df_training_data['population'] = np.log(df_training_data['population'])
df_topredict_data['population'] = np.log(df_topredict_data['population'])

# drop unwanted columns 

# We will drop the following columns because of the reasons mentioned below:
# 1. id - Not a feature 
# 2. amount_tsh - 
# 3. num_private - too many different values(approx 58000), so holds no significance
# 4. region - almost perfect correlation with region_code
# 5. quantity - very strong correlation with quantity_group
# 6. quality_group - very strong correlation with water_quality
# 7. source_type - very strong correlation with source
# 8. waterpoint_type_group - very strong correlation with waterpoint_type
# 9. payment - very strong correlation with payment_type
# 10. extraction_type_group - strong correlation with extraction_type
# 11. recorded_by - same values in all the rows, so its not a discriminant
# 12. subvillage/district_code/lga/ward - all these denote region, as such we can drop these. region is already being represented 
#     by lat/long and also by region_code
# 13. scheme_name - 2697 different values and 28166 empty values; so this field is almost useless


In [None]:
# drop unwanted columns 
columns_to_drop = ['id','amount_tsh', 'num_private', 'region', 'quantity', 'quality_group', 'source_type', 'payment', 
'waterpoint_type_group', 'extraction_type_group', 'recorded_by', 'subvillage', 'district_code', 'lga', 'ward', 'scheme_name']
df_training_data = drop_columns(df_training_data, columns_to_drop)
df_topredict_data = drop_columns(df_topredict_data, columns_to_drop)

In [None]:
# There are many columns that have many different values. Since our motive is to one hot encode the categorical columns,
# we need to reduce the no. of categories for each column. For this purpose we are putting together all the values that have 
# counts less than 100 as "other" category for each column that contains string values
df_training_data, df_topredict_data = shrink_categories_for_columns(df_training_data, df_topredict_data)

In [None]:
# Make use of LDA to reduce the no. of dimensions
df_training_data, df_topredict_data = reduce_dimensions_using_lda(
    df_training_data, df_topredict_data, df_training_labels, cols = ['gps_height', 'latitude', 'longitude'])

In [None]:
# One hot encode the columns that have categorical values
df_training_data,df_topredict_data = one_hot_encode(df_training_data, df_topredict_data)
#df_topredict_data = one_hot_encode1(df_topredict_data)

In [None]:
# Check the final no. of columns on which RandomForestClassifier is going to run; just for diagnostic purposes
print(len(df_training_data.columns))

# The following 2 functions help us with hyperparameter tuning of RandomForestClassifier (parameters such as n_estimators, max_features, max_depth, criterion (gini/entropy) and min_samples_split).
# Using the output of these functions we tuned the respective parameters for RandomForestClassifier.

In [None]:
# The following 2 functions help us in hyperparameter tuning of RandomForestCalssifier fucntion(the variables such as)
# n_estimators, max_features, max_depth, criterion(gini/entropy), min_samples_split
# Using the output of this function we tuned the respective parameters for RandomForestClassifier

# function to evaluate the performance of a model
def evaluate_model_performance(model, test_features, test_labels):
    """Print and return the classification accuracy of ``model`` in percent.

    The previous body used a regression formula (absolute error divided by the
    label), which raises ``TypeError`` for the string class labels this
    notebook predicts. Accuracy is now the share of predictions that equal
    the expected label.

    Args:
        model: fitted estimator exposing ``predict(features)``.
        test_features: features passed to ``model.predict``.
        test_labels: expected labels (array, Series or one-column DataFrame),
            one per row of ``test_features``.

    Returns:
        Accuracy as a percentage in the range 0-100.
    """
    predictions = np.asarray(model.predict(test_features)).ravel()
    expected = np.asarray(test_labels).ravel()
    error_rate = 100 * np.mean(predictions != expected)
    accuracy = 100 - error_rate
    print('Model Performance')
    print('Misclassification rate: {:0.2f}%.'.format(error_rate))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

# This function is used for finding the best suitable value for hyperparameter tuning.
def find_best_values_for_hyperparameters(X_train, y_train):
    """Grid-search RandomForestClassifier hyperparameters with 2-fold CV.

    Prints the best cross-validated accuracy, the best parameter combination
    and the full ``cv_results_`` table.

    Changes from the previous version:
    * the trailing call to ``evaluate(best_grid, test_features, test_labels)``
      referenced four undefined names and always raised ``NameError``; it is
      removed and the fitted search object is returned instead, so the caller
      can evaluate ``best_estimator_`` on its own hold-out data;
    * ``grid_scores_`` no longer exists in scikit-learn and is replaced by
      ``cv_results_``;
    * ``max_features='auto'`` (an alias of ``'sqrt'`` for classifiers, since
      removed from scikit-learn) is replaced by ``'sqrt'``.

    Args:
        X_train: training features.
        y_train: training labels (array, Series or one-column DataFrame).

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    rf = RandomForestClassifier(criterion='gini', max_features='sqrt', min_samples_split=6,
                                oob_score=True, random_state=1, n_jobs=-1)

    param_grid = {
        'n_estimators': [200, 500, 750, 1000],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7, 8],
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [4, 6, 8],
    }

    gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=2, n_jobs=-1)

    # Flatten the labels: scikit-learn expects a 1-d target.
    gs = gs.fit(X_train, np.ravel(y_train))
    print(gs.best_score_)
    print(gs.best_params_)
    print(gs.cv_results_)
    return gs

In [None]:
# Hold out 25% of the labelled data to estimate accuracy on unseen rows.
X_train, X_test, y_train, y_test = train_test_split(df_training_data, df_training_labels, test_size = 0.25, random_state = 42)
## Train and Test dataset size details
print ("X_train Shape :: ", X_train.shape)
print ("y_train Shape :: ", y_train.shape)
print ("X_test Shape :: ", X_test.shape)
print ("y_test Shape :: ", y_test.shape)

# Uncomment this function for finding the best values for hyper-parameters
#find_best_values_for_hyperparameters(X_train, y_train)

## Create random forest classifier instance
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself has
# been removed from scikit-learn.
rf = RandomForestClassifier(criterion='gini', min_samples_split=6, n_estimators=1000, max_features='sqrt', 
                            oob_score=True, random_state=1, n_jobs=-1)
# Flatten the one-column label frame to the 1-d target scikit-learn expects,
# as the full-data training cell does.
rf.fit(X_train, y_train.values.ravel())
predictions = rf.predict(X_test)

In [None]:
# Report accuracy on the fitted rows and on the held-out rows, plus the confusion matrix.
print("Train Accuracy :: ", accuracy_score(y_train, rf.predict(X_train)))
print("Test Accuracy  :: ", accuracy_score(y_test, predictions))
print("Confusion matrix ", confusion_matrix(y_test, predictions))

# Predict the unlabeled rows with the split-trained model (reuses the `predictions` name).
predictions = rf.predict(df_topredict_data)
df_predictions = pd.DataFrame(predictions).reset_index(drop=True)

# Output the predictions into a csv file
# Both frames get a fresh 0..n-1 index so ids and predictions line up row by row.
columns = ['id', 'status_group']
df_to_predict_data1 = df_to_predict_data1.reset_index(drop=True)
df_submission = pd.DataFrame(columns=columns).reset_index(drop=True)
df_submission['id'] = df_to_predict_data1['id']
df_submission['status_group'] = df_predictions[0]
df_submission.to_csv("submission_ab_kn_on_split_data.csv", sep=",", index=False)

In [None]:
#find_best_values_for_hyperparameters(df_training_data, df_training_labels.values.ravel())

# Train the random forest classifier with the whole of training data

In [None]:
# Train the random forest classifier with the whole of training data
# (same hyperparameters as the train/test-split run; this rebinds `rf`).
rf = RandomForestClassifier(criterion='gini', min_samples_split=6, n_estimators=1000, max_features='auto',
     oob_score=True, random_state=1, n_jobs=-1)
# .values.ravel() hands the labels to fit() as a flat 1-D array.
rf.fit(df_training_data, df_training_labels.values.ravel())

# Out-of-bag score (enabled by oob_score=True): an estimate computed from the
# samples each tree did not see in its bootstrap, not plain accuracy on the
# training rows.
print ("%.4f" % rf.oob_score_)

# Predict the values for which we don't have the labels

In [None]:
# Predict the values for which we don't have the labels
# NOTE(review): df_topredict_data is defined outside this section; presumably
# the preprocessed test-set features - confirm. This overwrites any earlier
# `predictions` value.
predictions = rf.predict(df_topredict_data)

In [None]:
# convert the predicted values into a dataframe; the values are read back
# later through column label 0
df_predictions = pd.DataFrame(predictions)
# Bare expression: shows the frame when it is the last line of a notebook cell.
df_predictions

# Output the predictions into a csv file

In [None]:
# Output the predictions into a csv file
# Re-index both inputs to 0..n-1 first so ids and predictions pair up by row position.
df_to_predict_data1 = df_to_predict_data1.reset_index(drop=True)
df_predictions = df_predictions.reset_index(drop=True)

# Start from an empty two-column frame, then fill it column by column.
columns = ['id', 'status_group']
df_submission = pd.DataFrame(columns=columns).reset_index(drop=True)
df_submission['id'] = df_to_predict_data1['id']
df_submission['status_group'] = df_predictions[0]

df_submission.to_csv("submission_ab_kn.csv", sep=",", index=False)

In [None]:
# Sanity-check the submission: its dimensions, then its first rows.
print(df_submission.shape)
# head must be called; without the parentheses print() shows the bound
# method's repr instead of the first five rows.
print(df_submission.head())

In [None]:
# Convert string labels to numerics
#label_map = {"functional": 1, "functional needs repair": 2, "non functional": 3}
#df_training_data['status_group_num']= df_training_data['status_group'].map(label_map).astype(int)

# do sanity check
#df_training_data[['id', 'amount_tsh', 'status_group', 'status_group_num']].head(25)

In [None]:
# save the training labels into a df
#df_training_labels_str = df_training_data['status_group']
#df_training_labels_num = np.array(df_training_data['status_group_num'])

In [None]:
# drop the training labels from the training set
#df_training_data= df_training_data.drop('status_group', axis = 1)
#df_training_data= df_training_data.drop('status_group_num', axis = 1)

In [None]:
#def one_hot_encode(X_train, X_test):
#    columns = [i for i in X_train.columns if type(X_train[i].iloc[0]) == str]
#    for column in columns:
#        X_train[column].fillna('NULL', inplace = True)
#        good_cols = [column+'_'+i for i in X_train[column].unique() if i in X_test[column].unique()]
#        X_train = pd.concat((X_train, pd.get_dummies(X_train[column], prefix = column)[good_cols]), axis = 1)
#        X_test = pd.concat((X_test, pd.get_dummies(X_test[column], prefix = column)[good_cols]), axis = 1)
#        del X_train[column]
#        del X_test[column]
#    return X_train, X_test
#

In [None]:
# impute missing population 
#df_training_data = impute_missing_population1(df_training_data)
#df_topredict_data = impute_missing_population1(df_topredict_data)

In [None]:
#analyze_in_detail_unique_values_for_column(df_training_data, 'population_cat')

In [None]:
#"""
#X_train, X_test, y_train, y_test = 
#    train_test_split(df_training_data, df_training_labels_num, test_size = 0.25, random_state = 42)
#"""

In [None]:
#"""
#features_to_consider =
#    ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'construction_year']
#"""

In [None]:
#"""
##def run_random_forest_predictor(X_train1, y_train1, X_test1, y_test1, features_to_consider):
#rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
## Train the model on training data
#df1 = X_train[features_to_consider]
#rf.fit(df1, y_train)
#predictions = rf.predict(X_test[features_to_consider])
## Calculate the absolute errors
#errors = abs(predictions - y_test)
## Print out the mean absolute error (mae)
#print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
#"""