In [1]:
#import the libraries needed
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from scipy import stats
import datetime

In [2]:
# Load the training feature table from train_values.csv.
df = pd.read_csv("train_values.csv")

# Load the training targets from train_labels.csv (columns: id, status_group).
label = pd.read_csv("train_labels.csv")

In [3]:
# Preview the first 2 rows of the training features.
df.head(2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
# Preview the first 10 rows of the training labels.
label.head(10)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional
5,9944,functional
6,19816,non functional
7,54551,non functional
8,53934,non functional
9,46144,functional


In [5]:
# Turn the string status_group classes into integer codes, written back onto `label`.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(label['status_group'])
label['status_group'] = label_encoder.transform(label['status_group'])
# Resulting codes: functional -> 0, functional needs repair -> 1, non functional -> 2

In [6]:
# Check the labels after encoding: status_group now holds integer codes.
label.head(8)

Unnamed: 0,id,status_group
0,69572,0
1,8776,0
2,34310,0
3,67743,2
4,19728,0
5,9944,0
6,19816,2
7,54551,2


In [7]:
# Count the missing values in each column of the training features.
df.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [8]:
# Fill gaps only in these two columns: a missing permit becomes 1 and a
# missing scheme_management becomes 'Other'. Other columns are left as they are.
values = dict(permit=1, scheme_management='Other')
df.fillna(values, inplace=True)

In [9]:
# Columns that are not fed to the model. Each name is listed once
# ('recorded_by' previously appeared twice).
drop_list = [
    'funder',
    'installer',
    'recorded_by',
    'wpt_name',
    'scheme_name',
    'payment_type',
    'waterpoint_type_group',
    'extraction_type_group',
    'public_meeting',
    'subvillage',
    'region',
    'region_code',
    'district_code',
    'quality_group',
    'quantity_group',
    'id',
    'source',
    'source_class',
]
# Note: 'id' is dropped here, so features and labels are matched by row position from now on.
df = df.drop(columns=drop_list)

In [10]:
# Re-check missing values now that gaps are filled and unused columns are gone.
df.isna().sum()

amount_tsh               0
date_recorded            0
gps_height               0
longitude                0
latitude                 0
num_private              0
basin                    0
lga                      0
ward                     0
population               0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_class    0
management               0
management_group         0
payment                  0
water_quality            0
quantity                 0
source_type              0
waterpoint_type          0
dtype: int64

In [11]:
# Parse the recording date; values that cannot be parsed become NaT instead of raising.
df['date_recorded'] = pd.to_datetime(df['date_recorded'], errors='coerce')

# Age of each record in whole years, measured from the moment this cell runs
# (so the feature shifts if the notebook is executed at a later date).
now = pd.to_datetime('now')
seconds_per_year = 60 * 60 * 24 * 365.25
record_age_seconds = (now - df['date_recorded']).dt.total_seconds()
df['time_for_record'] = (record_age_seconds / seconds_per_year).round(0)

In [12]:
# Years between the current calendar year and the recorded construction year.
current_year = datetime.datetime.now().year
df['time_to_construction_year'] = current_year - df['construction_year']

In [42]:
# Rows whose construction_year is 0 end up with an "age" equal to the current
# year. Those rows get the record age instead. The comparison uses the actual
# current year rather than the literal 2019, which only matched when the
# notebook was executed in 2019.
current_year = datetime.datetime.now().year
unknown_construction = df['time_to_construction_year'] == current_year
df['time_to_construction_year'] = np.where(
    unknown_construction,
    df['time_for_record'],
    df['time_to_construction_year'],
)

In [14]:
# Remove the raw date/year columns now that the derived age features exist.
columns_to_delete = ["date_recorded", "construction_year"]
df.drop(columns=columns_to_delete, inplace=True)

In [15]:
#df['target'] = label['status_group']

In [16]:
#plt.figure(figsize=(12,12))
#sns.scatterplot(x='longitude', y='latitude',data=df,hue='target',style='target')
#plt.ylim(-12, 0)
#plt.xlim(27, 42)
#plt.show()

In [17]:
#sns.set(style="whitegrid")
#sns.barplot(x="target", y="population", data=df)

In [18]:
# Keep five scheme_management categories unchanged and fold the listed
# remaining ones into 'Other'. Series.map yields NaN for any value that is not
# a key of the mapping.
kept_schemes = ['VWC', 'WUG', 'Water Board', 'WUA', 'Water authority']
folded_schemes = ['Private operator', 'Company', 'Trust', 'Parastatal', 'SWC', 'None', 'Other']

dataMapper = {scheme: scheme for scheme in kept_schemes}
dataMapper.update({scheme: 'Other' for scheme in folded_schemes})

df['scheme_management'] = df['scheme_management'].map(dataMapper)

In [19]:
# Fold the listed management_group categories into 'other'.
# The result is assigned back to the column: replace(..., inplace=True) on a
# `df[...]` selection is chained assignment, which pandas warns about and which
# does not update `df` when copy-on-write is enabled.
value_replacement = {'commercial':'other', 'parastatal':'other', 'unknown':'other'}
df['management_group'] = df['management_group'].replace(value_replacement)

In [20]:
# Fold the listed source_type categories into 'other'.
# Assigned back to the column rather than using replace(..., inplace=True) on a
# `df[...]` selection, which is chained assignment and does not update `df`
# when pandas copy-on-write is enabled.
value_replacement = {'dam':'other', 'rainwater harvesting':'other'}
df['source_type'] = df['source_type'].replace(value_replacement)

In [21]:
# Fold every listed water_quality category into 'other'; categories not
# listed here are left unchanged.
# Assigned back to the column rather than using replace(..., inplace=True) on a
# `df[...]` selection, which is chained assignment and does not update `df`
# when pandas copy-on-write is enabled.
value_replacement = {'salty':'other',
                     'milky':'other',
                     'unknown':'other',
                     'fluoride':'other',
                     'coloured':'other',
                     'salty abandoned':'other',
                     'fluoride abandoned':'other'
                    }
df['water_quality'] = df['water_quality'].replace(value_replacement)

In [22]:
# Inspect column dtypes; object columns still hold non-numeric values and are encoded next.
df.dtypes

amount_tsh                   float64
gps_height                     int64
longitude                    float64
latitude                     float64
num_private                    int64
basin                         object
lga                           object
ward                          object
population                     int64
scheme_management             object
permit                        object
extraction_type               object
extraction_type_class         object
management                    object
management_group              object
payment                       object
water_quality                 object
quantity                      object
source_type                   object
waterpoint_type               object
time_for_record              float64
time_to_construction_year    float64
dtype: object

In [23]:
# Integer-encode every remaining object-dtype column.
# Each column gets its own fitted encoder, kept in `feature_encoders`, so the
# category -> code mapping learned here stays available for other data and the
# target's `label_encoder` is no longer overwritten.
df_objects = df.select_dtypes(include=['object'])
feature_encoders = {}
for col in df_objects.columns:
    encoder = preprocessing.LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    feature_encoders[col] = encoder

In [45]:
# Turn both frames into NumPy arrays. At this point Y still contains every
# column of `label` (id and status_group).
X = np.asarray(df)
Y = np.asarray(label)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [46]:
# Keep only the encoded status_group column as the target vector (drops id).
# It is selected by name from `label` instead of slicing Y[:, 1], so the cell
# can be re-run: slicing the already 1-D Y a second time raises IndexError.
Y = label['status_group'].values

In [47]:
#X_train = X_train
#Y_train = Y_train[:,1]
#X_test = X_test
#Y_test = Y_test[:,1]

In [27]:
# Random forest with 80 trees, each limited to depth 20. A fixed random_state
# makes the fitted forest, and therefore the scores and predictions below,
# repeatable between runs.
randomForestModel = RandomForestClassifier(n_estimators=80, max_depth=20, random_state=42)

In [28]:
def getScore(model, X_train, Y_train, X_test, Y_test):
    """Fit `model` on the training split and return its score on the test split.

    `model` is fitted in place via ``model.fit(X_train, Y_train)``; the value
    returned is whatever ``model.score(X_test, Y_test)`` produces.
    """
    model.fit(X_train, Y_train)
    held_out_score = model.score(X_test, Y_test)
    return held_out_score

In [29]:
# 10-fold stratified cross-validation splitter, otherwise default settings (no shuffling).
kfold = StratifiedKFold(n_splits=10)
kfold

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

In [30]:
# Score of the random forest on each held-out fold.
RF_score=[]

# The same model object is re-fitted on every fold, so once the loop ends
# `randomForestModel` holds the fit from the last fold, and X_train / Y_train /
# X_test / Y_test hold that last fold's split (later cells use both).
for train_index,test_index in kfold.split(X,Y):
    X_train,X_test,Y_train,Y_test = X[train_index],X[test_index],Y[train_index],Y[test_index]
    RF_score.append(getScore(randomForestModel,X_train,Y_train,X_test,Y_test))

In [31]:
# Per-fold scores collected by the cross-validation loop.
RF_score

[0.8198956404645683,
 0.818212422151153,
 0.8119845143915165,
 0.8069348594512709,
 0.8267676767676768,
 0.8084175084175084,
 0.8164983164983165,
 0.8026603805354436,
 0.8035022731099511,
 0.8159312899966319]

In [32]:
# Mean score across the cross-validation folds.
fold_count = len(RF_score)
avg_rf_classification_score = sum(RF_score) / fold_count
print(avg_rf_classification_score)

0.8130804881784037


In [33]:
# Load the unlabeled test features from Test_set_values.csv; predictions are made for these rows.
test_for_pred = pd.read_csv("Test_set_values.csv")

In [34]:
from sklearn.metrics import accuracy_score
# Accuracy on the training split of the last CV fold, scored with the model
# that was fitted on that same split: a training-set score, not a held-out one.
predictions = randomForestModel.predict(X_train)
accuracy_score(y_true=Y_train, y_pred=predictions)

0.9457932737271333

In [35]:
def preprocesing(df):
    """Apply the notebook's training-time preprocessing to `df` and predict.

    Steps, in the same order as the training cells: fill missing values, drop
    unused columns, derive the two age features, fold rare categories,
    integer-encode object columns, then call ``randomForestModel.predict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature table with the columns used below (same layout as the
        frame loaded for training). The caller's frame is not modified.

    Returns
    -------
    The output of ``randomForestModel.predict`` for the encoded rows
    (integer status_group codes).
    """
    # Fill into a new frame instead of mutating the caller's DataFrame.
    values = {'permit': 1, 'scheme_management': 'Other'}
    df = df.fillna(value=values)

    # Drop columns not used by the model (each name listed once).
    drop_list = [
        'funder',
        'installer',
        'recorded_by',
        'wpt_name',
        'scheme_name',
        'payment_type',
        'waterpoint_type_group',
        'extraction_type_group',
        'public_meeting',
        'subvillage',
        'region',
        'region_code',
        'district_code',
        'quality_group',
        'quantity_group',
        'id',
        'source',
        'source_class',
    ]
    df = df.drop(columns=drop_list)

    # Record age in whole years, relative to the time of this call.
    df['date_recorded'] = pd.to_datetime(df['date_recorded'], errors='coerce')
    now = pd.to_datetime('now')
    df['time_for_record'] = ((now - df['date_recorded']).dt.total_seconds() / (60*60*24*365.25)).round(0)

    # Rows with construction_year == 0 would get an age equal to the current
    # year, so they fall back to the record age. Testing the raw column
    # replaces the old `== 2019` check, which only matched during 2019.
    unknown_construction = df['construction_year'] == 0
    df['time_to_construction_year'] = datetime.datetime.now().year - df['construction_year']
    df['time_to_construction_year'] = np.where(
        unknown_construction,
        df['time_for_record'],
        df['time_to_construction_year'],
    )

    df = df.drop(columns=["date_recorded", "construction_year"])

    # Series.map yields NaN for any scheme_management value that is not a key here.
    dataMapper = {'VWC':'VWC',
                  'WUG':'WUG',
                  'Water Board':'Water Board',
                  'WUA':'WUA',
                  'Water authority':'Water authority',
                  'Private operator':'Other',
                  'Company':'Other',
                  'Trust':'Other',
                  'Parastatal':'Other',
                  'SWC':'Other',
                  'None':'Other',
                  'Other':'Other'
                 }
    df['scheme_management'] = df['scheme_management'].map(dataMapper)

    # Category folding is assigned back to each column; replace(inplace=True)
    # on a `df[...]` selection does not update `df` under pandas copy-on-write.
    management_replacement = {'commercial':'other', 'parastatal':'other', 'unknown':'other'}
    df['management_group'] = df['management_group'].replace(management_replacement)

    # The training cells fold these source_type values too; this step was
    # missing here, so source_type codes differed between training and prediction.
    source_replacement = {'dam':'other', 'rainwater harvesting':'other'}
    df['source_type'] = df['source_type'].replace(source_replacement)

    quality_replacement = {'salty':'other',
                           'milky':'other',
                           'unknown':'other',
                           'fluoride':'other',
                           'coloured':'other',
                           'salty abandoned':'other',
                           'fluoride abandoned':'other'
                          }
    df['water_quality'] = df['water_quality'].replace(quality_replacement)

    # Integer-encode the object columns.
    # NOTE(review): the encoder is re-fitted on this frame, so a category only
    # gets the same integer as in training when both frames contain exactly the
    # same set of values in that column. Reusing the encoders fitted on the
    # training data would remove that assumption - TODO confirm and fix.
    df_objects = df.select_dtypes(include=['object'])
    label_encoder = preprocessing.LabelEncoder()
    for col in df_objects.columns:
        df[col] = label_encoder.fit_transform(df[col])

    # Convert the frame into a NumPy array for the model.
    X = df.values

    # Uses the notebook-level model as left by the cross-validation loop.
    return randomForestModel.predict(X)

In [36]:
# Preprocess the test features and get the model's predicted class codes.
final_pred = preprocesing(test_for_pred)

In [37]:
# Translate the predicted integer codes back into the status_group names.
code_to_status = {0: 'functional', 2: 'non functional', 1: 'functional needs repair'}
result = pd.Series(final_pred).replace(code_to_status)

In [38]:
# Submission frame: the id of each test row next to its predicted status.
predicted_status = result.values
evaluation = pd.DataFrame({'id': test_for_pred['id'], "status_group": predicted_status})

In [39]:
# Preview the first 5 rows of the submission frame.
evaluation.head(5)

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional


In [40]:
# Write the submission to Submission_NEW.csv without the DataFrame index column.
evaluation.to_csv("Submission_NEW.csv",index=False)

In [41]:
# Look at the first 5 encoded training labels again for reference.
label.head(5)

Unnamed: 0,id,status_group
0,69572,0
1,8776,0
2,34310,0
3,67743,2
4,19728,0
