In [1]:
# Include required libs
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics
import random
from sklearn import preprocessing
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [2]:
def load_train_and_test_data():
    train_features=pd.read_csv('TrainingSetValues.csv',parse_dates=True)
    train_labels=pd.read_csv('TrainingSetLabels.csv')
    to_predict_features=pd.read_csv('TestSetValues.csv',parse_dates=True)
    
    # merge training features and labels
    #training_data = pd.merge(train_features, train_labels, how='inner', on=['id'])
    
    return train_features, train_labels, to_predict_features

In [3]:
# impute median values for 0 amount_tsh
def impute_missing_amount_tsh(df):
    df.amount_tsh[df.amount_tsh <= 0] = np.median(df.amount_tsh[df.amount_tsh > 0])
    return df

In [4]:
# impute median values for 0 gps height
def impute_missing_gps_height(df):
    df.gps_height[df.gps_height <= 0] = np.median(df.gps_height[df.gps_height > 0])
    return df

In [5]:
# impute median values for 0 population
def impute_missing_population(df):
    df.population[df.population <= 0] = np.median(df.population[df.population > 0])
    return df

In [6]:
# transform population into categories
def label_population(row):
    if row['population'] <=0:
        return 'NA'
    elif row['population'] >= 1 and row['population'] <= 40:
        return 'A'
    elif row['population'] >= 41 and row['population'] <= 67:
        return 'B'
    elif row['population'] >= 68 and row['population'] <= 99:
        return 'C'
    elif row['population'] >= 100 and row['population'] <= 131:
        return 'D'
    elif row['population'] >= 132 and row['population'] <= 175:
        return 'E'
    elif row['population'] >= 176 and row['population'] <= 219:
        return 'F'
    elif row['population'] >= 220 and row['population'] <= 259:
        return 'G'
    elif row['population'] >= 260 and row['population'] <= 349:
        return 'H'
    elif row['population'] >= 350 and row['population'] <= 448:
        return 'I'
    elif row['population'] >= 449 and row['population'] <= 598:
        return 'J'
    elif row['population'] >= 599 and row['population'] <= 1290:
        return 'K'
    elif row['population'] >= 1291:
        return 'L'

def transform_population_into_categories(df):
    df['population_cat'] = df.apply(label_population, axis=1)
    return df

In [7]:
# impute median values for construction year
def impute_missing_construction_year(df):
    df.construction_year[df.construction_year <= 0] = np.median(df.construction_year[df.construction_year > 0])
    return df

In [8]:
# impute missing booleans with false and convert each value to float or integer
def impute_missing_booleans(df, colname):
    df[colname].fillna(False, inplace = True)
    df[colname] = df[colname].apply(lambda x: float(x))
    return df

In [9]:
# Since random forest doesnt work on datetime, we will break them down to month and year.
# But, also if we simply convert month into numerical values, it doesnt work well because 
# there may be big distance between Jan and December and also between 1970 to  2010, to take an example.
# So its better to one hot encode them after transforming the date to month and year
def transform_date_recorded_to_month_and_year(df):
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])
    df['year_recorded'] = df['date_recorded'].apply(lambda x: x.year)
    df['month_recorded'] = df['date_recorded'].apply(lambda x: x.month)
    df['date_recorded'] = (pd.to_datetime(df['date_recorded'])).apply(lambda x: x.toordinal())
    return df

# One Hot encode year and month recorded (first convert the month and year to string beafore OHEing)
# Also delete the original ones
def ohe_month_and_year_recorded(df_train, df_test):
    df_train = transform_date_recorded_to_month_and_year(df_train)
    df_test = transform_date_recorded_to_month_and_year(df_test)
    for col in ['month_recorded', 'year_recorded']:
        df_train[col] = df_train[col].apply(lambda x: str(x))
        df_test[col] = df_test[col].apply(lambda x: str(x))
        ohe_cols_postfix = [col + '_' + i for i in df_train[col].unique() if i in df_test[col].unique()]
        df_train = pd.concat((df_train, pd.get_dummies(df_train[col], prefix = col)[ohe_cols_postfix]), axis = 1)
        df_test = pd.concat((df_test, pd.get_dummies(df_test[col], prefix = col)[ohe_cols_postfix]), axis = 1)
        del df_test[col]
        del df_train[col]
    return df_train, df_test

In [10]:
def locs(X_train, X_test):
    """
    fill in the nulls for ['longitude', 'latitude', 'gps_height', 'population'] by using means from 
    ['subvillage', 'district_code', 'basin'], and lastly the overall mean
    """
    trans = ['longitude', 'latitude', 'gps_height', 'population']
    for i in [X_train, X_test]:
        i.loc[i.longitude == 0, 'latitude'] = 0
    for z in trans:
        for i in [X_train, X_test]:
            i[z].replace(0., np.NaN, inplace = True)
            i[z].replace(1., np.NaN, inplace = True)
        
        for j in ['subvillage', 'district_code', 'basin']:
        
            X_train['mean'] = X_train.groupby([j])[z].transform('mean')
            X_train[z] = X_train[z].fillna(X_train['mean'])
            o = X_train.groupby([j])[z].mean()
            fill = pd.merge(X_test, pd.DataFrame(o), left_on=[j], right_index=True, how='left').iloc[:,-1]
            X_test[z] = X_test[z].fillna(fill)
        
        X_train[z] = X_train[z].fillna(X_train[z].mean())
        X_test[z] = X_test[z].fillna(X_train[z].mean())
        del X_train['mean']
    return X_train, X_test

In [11]:
# function to drop columns from a dataframe
def drop_columns(df, cols_to_drop):
    for col in cols_to_drop:
        del df[col]
    return df

In [12]:
#def gini(p):
#    return 1-(p**2 + (1-p)**2)

In [13]:
def small_n2(X_train, X_test):
    cols = [i for i in X_train.columns if type(X_train[i].iloc[0]) == str]
    X_train[cols] = X_train[cols].where(X_train[cols].apply(lambda x: x.map(x.value_counts())) > 100, "other")
    for column in cols:
        for i in X_test[column].unique():
            if i not in X_train[column].unique():
                X_test[column].replace(i, 'other', inplace=True)
    return X_train, X_test

In [14]:
def lda(X_train, X_test, y_train, cols=['population', 'gps_height', 'latitude', 'longitude']):
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train[cols])
    X_test_std = sc.transform(X_test[cols])
    lda = LDA(n_components=None)
    X_train_lda = lda.fit_transform(X_train_std, y_train.values.ravel())
    X_test_lda = lda.transform(X_test_std)
    X_train = pd.concat((pd.DataFrame(X_train_lda), X_train), axis=1)
    X_test = pd.concat((pd.DataFrame(X_test_lda), X_test), axis=1)
    for i in cols:
        del X_train[i]
        del X_test[i]
    return X_train, X_test

In [15]:
def one_hot_encode(df_train, df_test):
    columns = [i for i in df_train.columns if type(df_train[i].iloc[0]) == str]
    for column in columns:
        df_train[column].fillna('NULL', inplace = True)
        ohe_cols = [column+'_'+i for i in df_train[column].unique() if i in df_test[column].unique()]
        df_train = pd.concat((df_train, pd.get_dummies(df_train[column], prefix = column)[ohe_cols]), axis = 1)
        df_test = pd.concat((df_test, pd.get_dummies(df_test[column], prefix = column)[ohe_cols]), axis = 1)
        del df_train[column]
        del df_test[column]
    return df_train, df_test

In [16]:
#def one_hot_encode1(df):
#    columns = [x for x in df.columns if type(df[x].iloc[0]) == str]
#    for column in columns:
#        df[column].fillna('NULL', inplace = True)
#        ohe_cols = [column + '_' + x for x in df[column].unique() if x in df[column].unique()]
#        df = pd.concat((df, pd.get_dummies(df[column], prefix = column)[ohe_cols]), axis = 1)
#        del df[column]
#    return df

In [17]:
# Helper function for column values analysis
def analyze_unique_values_for_column(df, colname):
    unique_col_vals = df[colname].unique()
    tmp_str = "Unique " + colname + "s:"
    print("****************************")
    print(tmp_str, unique_col_vals.size)
    print("****************************")

In [18]:
# Helper function for column values analysis
def analyze_in_detail_unique_values_for_column(df, colname):
    unique_col_vals = df[colname].unique()
    tmp_str = "Unique " + colname + "s:"
    print("****************************")
    print(tmp_str, unique_col_vals.size)
    print("****************************")
    lessthan10 = 0
    lessthan20 = 0
    lessthan30 = 0
    lessthan50 = 0
    lessthan100 = 0
    for val in unique_col_vals:
        cnt = df[df[colname] == val][colname].count()
        print(val, cnt) # uncomment this line if you want to see the count of each colname-value
        if(cnt < 10):
            lessthan10 +=1     
            print(val, cnt)
        elif(cnt < 20):
            lessthan20 +=1
        elif(cnt < 30):
            lessthan30 +=1
        elif(cnt < 50):
            lessthan50 +=1

    print("lessthan50: ", lessthan50 )
    print("lessthan30: ", lessthan30 )
    print("lessthan20: ", lessthan20 )
    print("lessthan10: ", lessthan10 )
    print("****************************")

In [19]:
df_training_data, df_training_labels, df_topredict_data = load_train_and_test_data()

In [20]:
# Keep a copy of this for the output file
df_to_predict_data1 = df_topredict_data

In [21]:
print("*****************************************\nBrief Stats of each column\n*****************************************")
print(df_training_data.describe())

#getting number of nonzeros in each column
print("\n*****************************************\nnumber of nonzeros in each column\n*****************************************")
print(df_training_data.astype(bool).sum(axis=0))

# getting no. of nulls in each column
print("\n*****************************************\nno. of nulls in each column\n*****************************************")
print(df_training_data.isnull().sum())

*****************************************
Brief Stats of each column
*****************************************
                 id     amount_tsh    gps_height     longitude      latitude  \
count  59400.000000   59400.000000  59400.000000  59400.000000  5.940000e+04   
mean   37115.131768     317.650385    668.297239     34.077427 -5.706033e+00   
std    21453.128371    2997.574558    693.116350      6.567432  2.946019e+00   
min        0.000000       0.000000    -90.000000      0.000000 -1.164944e+01   
25%    18519.750000       0.000000      0.000000     33.090347 -8.540621e+00   
50%    37061.500000       0.000000    369.000000     34.908743 -5.021597e+00   
75%    55656.500000      20.000000   1319.250000     37.178387 -3.326156e+00   
max    74247.000000  350000.000000   2770.000000     40.345193 -2.000000e-08   

        num_private   region_code  district_code    population  \
count  59400.000000  59400.000000   59400.000000  59400.000000   
mean       0.474141     15.297003   

In [22]:
#analyze_unique_values_for_column(df_training_data, "funder")
#analyze_unique_values_for_column(df_training_data, "installer")
#analyze_unique_values_for_column(df_training_data, "wpt_name")
#analyze_unique_values_for_column(df_training_data, "basin")
#analyze_unique_values_for_column(df_training_data, "subvillage")
#analyze_unique_values_for_column(df_training_data, "region")
#analyze_unique_values_for_column(df_training_data, "region_code")
#analyze_unique_values_for_column(df_training_data, "district_code")
#analyze_unique_values_for_column(df_training_data, "lga")
#analyze_unique_values_for_column(df_training_data, "ward")
#analyze_unique_values_for_column(df_training_data, "recorded_by")
#analyze_unique_values_for_column(df_training_data, "scheme_management")
#analyze_unique_values_for_column(df_training_data, "scheme_name")
#analyze_unique_values_for_column(df_training_data, "extraction_type")
#analyze_unique_values_for_column(df_training_data, "extraction_type_group")
#analyze_unique_values_for_column(df_training_data, "extraction_type_class")
#analyze_unique_values_for_column(df_training_data, "management")
#analyze_unique_values_for_column(df_training_data, "management_group")
#analyze_unique_values_for_column(df_training_data, "management_group")
#analyze_unique_values_for_column(df_training_data, "payment")
#analyze_unique_values_for_column(df_training_data, "payment_type")
#analyze_unique_values_for_column(df_training_data, "management_group")
#analyze_unique_values_for_column(df_training_data, "water_quality")
#analyze_unique_values_for_column(df_training_data, "quality_group")
#analyze_unique_values_for_column(df_training_data, "quantity")
#analyze_unique_values_for_column(df_training_data, "quantity_group")
#analyze_unique_values_for_column(df_training_data, "source")
#analyze_unique_values_for_column(df_training_data, "source_type")
#analyze_unique_values_for_column(df_training_data, "source_class")
#analyze_unique_values_for_column(df_training_data, "source_class")
#analyze_unique_values_for_column(df_training_data, "waterpoint_type")
#analyze_unique_values_for_column(df_training_data, "waterpoint_type_group")

In [23]:
#analyze_in_detail_unique_values_for_column(df_training_data, "population")

In [24]:
#analyze_in_detail_unique_values_for_column(df_training_data, "management")

In [25]:
#analyze_in_detail_unique_values_for_column(df_training_data, "management_group")

In [26]:
#analyze_in_detail_unique_values_for_column(df_training_data, "extraction_type_class")

In [27]:
del df_training_labels['id']

#df_training_data, df_topredict_data = dates(df_training_data, df_topredict_data)
#df_training_data = transform_date_recorded_to_month_and_year(df_training_data)
#df_topredict_data = transform_date_recorded_to_month_and_year(df_topredict_data)

df_training_data, df_topredict_data = ohe_month_and_year_recorded(df_training_data, df_topredict_data)

In [28]:
# impute missing construction year with median construction year
df_training_data = impute_missing_construction_year(df_training_data)
df_topredict_data = impute_missing_construction_year(df_topredict_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
# the fields public_meeting and permit are boolean, but there are many missing values 
# (3334 in public_meeting and 3056 in permit)
# impute these missing values with FALSE
df_training_data = impute_missing_booleans(df_training_data, "public_meeting")
df_topredict_data = impute_missing_booleans(df_topredict_data, "public_meeting")

df_training_data = impute_missing_booleans(df_training_data, "permit")
df_topredict_data = impute_missing_booleans(df_topredict_data, "permit")

In [30]:
"""
fill in the nulls for ['longitude', 'latitude', 'gps_height', 'population'] by using means from 
['subvillage', 'district_code', 'basin'], and lastly the overall mean
"""
df_training_data, df_topredict_data = locs(df_training_data, df_topredict_data)

In [31]:
df_training_data['population'] = np.log(df_training_data['population'])
df_topredict_data['population'] = np.log(df_topredict_data['population'])

In [32]:
# drop unwanted columns 
columns_to_drop = ['id','amount_tsh',  'num_private', 'region', 'quantity', 'quality_group', 'source_type', 'payment', 
'waterpoint_type_group', 'extraction_type_group', 'recorded_by', 'subvillage', 'district_code', 'lga', 'ward', 'scheme_name',
'wpt_name', 'funder', 'installer']
df_training_data = drop_columns(df_training_data, columns_to_drop)
df_topredict_data = drop_columns(df_topredict_data, columns_to_drop)

KeyError: 'scheme_namewpt_name'

In [None]:
df_training_data, df_topredict_data = small_n2(df_training_data, df_topredict_data)

In [None]:
df_training_data, df_topredict_data = lda(df_training_data, df_topredict_data, df_training_labels, cols = ['gps_height', 'latitude', 'longitude'])

In [None]:
# One hot encode the columns that have string values
df_training_data,df_topredict_data = one_hot_encode(df_training_data, df_topredict_data)
#df_topredict_data = one_hot_encode1(df_topredict_data)

In [None]:
print(len(df_training_data.columns))

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(df_training_data, df_training_labels, test_size = 0.25, random_state = 42)
#    # Create random forest classifier instance
## Train and Test dataset size details
#print ("X_train Shape :: ", X_train.shape)
#print ("y_train Shape :: ", y_train.shape)
#print ("X_test Shape :: ", X_test.shape)
#print ("y_test Shape :: ", y_test.shape)
#
#rf = RandomForestClassifier(criterion='gini', min_samples_split=6, n_estimators=1000, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
#rf.fit(X_train, y_train)
#predictions = rf.predict(X_test)
#

In [None]:
##for i in range(0, 5):
##    print(y_test[i], predictions[i])    
##    #print ("Actual outcome :: {} and Predicted outcome :: {}".format(list(y_test)[i], predictions[i]))
#
#print ("Train Accuracy :: ", accuracy_score(y_train, rf.predict(X_train)))
#print ("Test Accuracy  :: ", accuracy_score(y_test, predictions))
#print ("Confusion matrix ", confusion_matrix(y_test, predictions))
#
#predictions = rf.predict(df_topredict_data)
#df_predictions = pd.DataFrame(predictions)
## Output the predictions into a csv file
#columns = ['id','status_group']
#df_submission = pd.DataFrame(columns=columns)
##to_predict_features=pd.read_csv('TestSetValues.csv',parse_dates=True)
#df_to_predict_data1 = df_to_predict_data1.reset_index(drop=True)
#df_predictions = df_predictions.reset_index(drop=True)
#df_submission = df_submission.reset_index(drop=True)
#df_submission['id'] = df_to_predict_data1['id']
#df_submission['status_group'] = df_predictions[0]
#df_submission.to_csv("submission_ab_kn.csv", sep=",", index = False)

In [None]:
rf = RandomForestClassifier(criterion='gini', min_samples_split=6, n_estimators=1000, max_features='auto',
     oob_score=True, random_state=1, n_jobs=-1)
rf.fit(df_training_data, df_training_labels.values.ravel())
print ("%.4f" % rf.oob_score_)

In [None]:
predictions = rf.predict(df_topredict_data)

In [None]:
df_predictions = pd.DataFrame(predictions)
df_predictions

In [None]:
# Output the predictions into a csv file
columns = ['id','status_group']
df_submission = pd.DataFrame(columns=columns)
#to_predict_features=pd.read_csv('TestSetValues.csv',parse_dates=True)
df_to_predict_data1 = df_to_predict_data1.reset_index(drop=True)
df_predictions = df_predictions.reset_index(drop=True)
df_submission = df_submission.reset_index(drop=True)
df_submission['id'] = df_to_predict_data1['id']
df_submission['status_group'] = df_predictions[0]
df_submission.to_csv("submission_ab_kn.csv", sep=",", index = False)

In [None]:
print(df_submission.shape)
print(df_submission.head)

In [None]:
# Convert string labels to numerics
#label_map = {"functional": 1, "functional needs repair": 2, "non functional": 3}
#df_training_data['status_group_num']= df_training_data['status_group'].map(label_map).astype(int)

# do sanity check
#df_training_data[['id', 'amount_tsh', 'status_group', 'status_group_num']].head(25)

In [None]:
# save the training labels into a df
#df_training_labels_str = df_training_data['status_group']
#df_training_labels_num = np.array(df_training_data['status_group_num'])

In [None]:
# drop the training labels from the training set
#df_training_data= df_training_data.drop('status_group', axis = 1)
#df_training_data= df_training_data.drop('status_group_num', axis = 1)

In [None]:
#def one_hot_encode(X_train, X_test):
#    columns = [i for i in X_train.columns if type(X_train[i].iloc[0]) == str]
#    for column in columns:
#        X_train[column].fillna('NULL', inplace = True)
#        good_cols = [column+'_'+i for i in X_train[column].unique() if i in X_test[column].unique()]
#        X_train = pd.concat((X_train, pd.get_dummies(X_train[column], prefix = column)[good_cols]), axis = 1)
#        X_test = pd.concat((X_test, pd.get_dummies(X_test[column], prefix = column)[good_cols]), axis = 1)
#        del X_train[column]
#        del X_test[column]
#    return X_train, X_test
#

In [None]:
# impute missing population 
#df_training_data = impute_missing_population1(df_training_data)
#df_topredict_data = impute_missing_population1(df_topredict_data)

In [None]:
#analyze_in_detail_unique_values_for_column(df_training_data, 'population_cat')

In [None]:
#"""
#X_train, X_test, y_train, y_test = 
#    train_test_split(df_training_data, df_training_labels_num, test_size = 0.25, random_state = 42)
#"""

In [None]:
#"""
#features_to_consider =
#    ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'construction_year']
#"""

In [None]:
#"""
##def run_random_forest_predictor(X_train1, y_train1, X_test1, y_test1, features_to_consider):
#rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
## Train the model on training data
#df1 = X_train[features_to_consider]
#rf.fit(df1, y_train)
#predictions = rf.predict(X_test[features_to_consider])
## Calculate the absolute errors
#errors = abs(predictions - y_test)
## Print out the mean absolute error (mae)
#print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
#"""