# Import Libraries

In [1]:
import datetime
from shutil import rmtree
from tempfile import mkdtemp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import get_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [None]:
from sklearn.metrics import make_scorer,f1_score,log_loss

# Importing Data

In [None]:
# Load the training features, training labels and the unlabeled prediction set.
# The first CSV column is used as the index and `date_recorded` is parsed to datetime.
# `infer_datetime_format` and `keep_date_col` were dropped: both are deprecated in
# recent pandas, and `keep_date_col` only matters when several columns are combined
# into one date, which is not the case here.
X = pd.read_csv('X_Train.csv', index_col=[0], parse_dates=['date_recorded'])
y = pd.read_csv('Y_Train.csv', index_col=[0])
# Keep only the target column so `y` is a Series rather than a one-column frame.
y = y['status_group']
X_pred = pd.read_csv('X_test.csv', index_col=[0], parse_dates=['date_recorded'])

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory use.
X.info()

## Train Test Split

In [None]:
# Hold out a third of the labelled rows for evaluation; the fixed seed keeps
# the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Display every column label of the training frame as a NumPy array.
X.columns.values

## Finding Numeric Columns

In [None]:
# Labels of every numeric-dtype column in X, as a NumPy array.
num_columns = X.select_dtypes(include='number').columns.values
# Bare trailing expression so the notebook renders the array.
num_columns

In [None]:
# Hand-picked numeric feature names (one per line for easier diffing).
num_columnwodate = [
    'amount_tsh',
    'gps_height',
    'longitude',
    'latitude',
    'num_private',
    'region_code',
    'district_code',
    'population',
]

# Random Forest Classifier Model

In [None]:
# Baseline random-forest pipeline over the numeric columns only.
# `ColumnSelector` is not defined or imported anywhere in this notebook, so the
# column selection is done with a ColumnTransformer that passes `num_columns`
# through unchanged and drops every other column.
rfEstimatorModel = Pipeline([
    ('cs', ColumnTransformer([('num', 'passthrough', num_columns)])),
    # Zeros are treated as missing values and replaced by the column median.
    ('si', SimpleImputer(missing_values=0, strategy='median')),
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier(n_estimators=80, max_depth=90)),
])

In [None]:
# Fit the baseline pipeline on the training split; the trailing `;` hides the estimator repr.
rfEstimatorModel.fit(X_train,y_train);

## Imputer Strategy scores

### Imputer Strategy : Most Frequent

In [None]:
# Refit a fresh copy of the pipeline with the imputer strategy this section
# reports, instead of re-scoring whichever strategy rfEstimatorModel was last
# fitted with. clone() leaves rfEstimatorModel itself untouched.
# NOTE: the forest has no random_state, so small score differences between
# strategies may be noise.
clone(rfEstimatorModel).set_params(si__strategy='most_frequent').fit(X_train, y_train).score(X_test, y_test)

### Imputer Strategy : Mean

In [None]:
# Refit a fresh copy of the pipeline with mean imputation so the score matches
# this section's heading; clone() leaves rfEstimatorModel itself untouched.
clone(rfEstimatorModel).set_params(si__strategy='mean').fit(X_train, y_train).score(X_test, y_test)

### Imputer Strategy : median

In [None]:
# Mean accuracy on the held-out split for the pipeline as fitted above
# (its imputer is configured with strategy='median').
rfEstimatorModel.score(X_test,y_test)

# Going forward with Imputer strategy : Median

## Improving last model

In [None]:
# Numeric preprocessing: zeros count as missing and are filled with the median.
NumFeature_Pipeline = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=0, strategy='median')),
])

# Same model as before, but the column selection and imputation now live inside
# a ColumnTransformer restricted to the numeric columns.
rfEstimatorModel2 = Pipeline(steps=[
    (
        'ct',
        ColumnTransformer(transformers=[
            ('num_ct', NumFeature_Pipeline, num_columns),
        ]),
    ),
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier(n_estimators=80, max_depth=90)),
])

In [None]:
# Fit the ColumnTransformer-based pipeline on the training split; `;` hides the repr.
rfEstimatorModel2.fit(X_train,y_train);

In [None]:
# Mean accuracy of rfEstimatorModel2 on the held-out split.
rfEstimatorModel2.score(X_test,y_test)

## Changing num_column to remove construction_year

In [None]:
# Numeric feature list for the experiment described in this section's heading
# (note: singular `num_column`, distinct from `num_columns`).
num_column = [
    'amount_tsh',
    'gps_height',
    'longitude',
    'latitude',
    'num_private',
    'region_code',
    'district_code',
    'population',
]

In [None]:
# Fit and score the same pipeline on the reduced `num_column` feature list.
# Re-scoring rfEstimatorModel2 would only repeat the previous result, because
# that model was built and fitted with `num_columns`. A throwaway pipeline is
# used so rfEstimatorModel2 and `num_columns` stay as they were.
Pipeline([
    ('ct', ColumnTransformer([('num_ct', NumFeature_Pipeline, num_column)])),
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier(n_estimators=80, max_depth=90)),
]).fit(X_train, y_train).score(X_test, y_test)

#### Score decreased, so continuing with construction_year included

# Checking other Classification models with GridSearchCV

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
# Candidate classifiers keyed by a display label; the insertion order is the
# order in which they are evaluated.
estimators = dict([
    ('Random Forest Classifier', RandomForestClassifier()),
    ('Support Vector Classifier', SVC()),
    ('Ridge Classifier', RidgeClassifier()),
    ('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=3)),
])

In [None]:
# Numeric preprocessing shared by every candidate: zeros -> column median.
NumFeature_Pipeline = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=0, strategy='median')),
])

# Fit each candidate on the training split with identical preprocessing and
# report its accuracy on the held-out split.
for key, estimator in estimators.items():
    print('Estimating using Classifier : ', key)

    EstimatorModel3 = Pipeline(steps=[
        (
            'ct',
            ColumnTransformer(transformers=[
                ('num_ct', NumFeature_Pipeline, num_columns),
            ]),
        ),
        ('ss', StandardScaler()),
        ('estimator', estimator),
    ])

    print("Fitting Model")
    EstimatorModel3.fit(X_train, y_train)

    print("Calculating Score")
    score = EstimatorModel3.score(X_test, y_test)

    print("Score = ", score)

# Pump It Up Final Prediction model

## 1. Importing Data

In [2]:
# Reload the raw data for the final model: training features, training labels
# and the unlabeled prediction set. The first CSV column becomes the index and
# `date_recorded` is parsed to datetime.
# `infer_datetime_format` and `keep_date_col` were dropped: both are deprecated in
# recent pandas, and `keep_date_col` only matters when several columns are combined
# into one date, which is not the case here.
X = pd.read_csv('X_Train.csv', index_col=[0], parse_dates=['date_recorded'])
y = pd.read_csv('Y_Train.csv', index_col=[0])
# Keep only the target column so `y` is a Series rather than a one-column frame.
y = y['status_group']
X_pred = pd.read_csv('X_test.csv', index_col=[0], parse_dates=['date_recorded'])

## 2. Split data into Training and test set

In [3]:
# Keep 10% of the labelled rows as a hold-out set; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

## 3. Extracting categorical columns

In [4]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Categorical features: every column that is neither numeric nor datetime, plus
# `construction_year`, which is deliberately handled as a category.
cat_columns = [
    *X_test.select_dtypes(exclude=['number', 'datetime64[ns]']).columns,
    'construction_year',
]
# Numeric features that go through imputation and scaling.
num_columnwoLatLngNyear = [
    'amount_tsh',
    'gps_height',
    'num_private',
    'region_code',
    'district_code',
    'population',
]
# Coordinate columns, kept in their own list.
LatLng_Feature = [
    'longitude',
    'latitude',
]

## 4. Feature Transformation Pipeline

In [5]:
# Numeric branch: zeros are treated as missing and filled with the column
# median, then each feature is rescaled with MinMaxScaler.
NumFeature_Pipeline = Pipeline(steps=[
    ('si', SimpleImputer(strategy='median', missing_values=0)),
    ('ss', MinMaxScaler()),
])

In [6]:
# Categorical branch:
#   si1 - fill NaN with the most frequent value of the column
#   si2 - treat 0 as missing too and fill it the same way
#   ohe - one-hot encode; categories unseen during fit are ignored at predict time
#   pca - despite the step name this is TruncatedSVD, reducing the encoded
#         matrix to 80 components
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0.
# The explicit `sparse=True` was dropped: sparse output is already the encoder's
# default, and the keyword was renamed in newer scikit-learn releases.
CatFeature_Pipeline = Pipeline([
    ('si1', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('si2', SimpleImputer(missing_values=0, strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('pca', TruncatedSVD(n_components=80)),
])

## 5. Submission Function

In [7]:
def writePred(estimatorModel, X_new=None, path=None):
    """Predict with a fitted model and write a two-column submission CSV.

    Parameters
    ----------
    estimatorModel : object
        Fitted estimator or pipeline exposing ``predict``.
    X_new : pandas.DataFrame, optional
        Rows to predict on. Defaults to the notebook-level ``X_pred``.
    path : str, optional
        Output file. Defaults to ``<ddmmYYYYHHMMSS>_Submission.csv`` in the
        working directory, stamped with the current local time.

    Returns
    -------
    str
        The path the CSV was written to.
    """
    # The global is looked up at call time, so the one-argument call keeps working.
    features = X_pred if X_new is None else X_new
    if path is None:
        path = datetime.datetime.now().strftime("%d%m%Y%H%M%S") + '_Submission.csv'

    # Pair each row's index label with its predicted class.
    y_predictions = pd.DataFrame({
        'id': features.index,
        'status_group': estimatorModel.predict(features),
    })
    y_predictions.to_csv(path, sep=',', index=False)
    return path

## 6. Multi Estimator Predictor Pipeline

In [None]:
# Temporary directory passed as the Pipeline `memory` cache in the search below;
# it is removed later with rmtree(cachedir).
cachedir = mkdtemp()
# Accumulates one summary dict per tuned estimator.
results = []

In [None]:
# Support-weighted F1 across the target classes.
f1_score_metric = make_scorer(f1_score, average='weighted')
# log_loss takes no `average` argument and must be fed class probabilities, so
# the previous make_scorer(log_loss, average='weighted') would fail when called.
# The built-in scorer requests predict_proba and negates the loss so that
# "greater is better" holds, as model selection expects.
log_loss_score_metric = get_scorer('neg_log_loss')

In [None]:
# Display the scorer object to confirm how it was configured.
f1_score_metric

In [None]:
# Scorers of interest. Only the weighted-F1 scorer is passed to the grid search
# below; the list is kept so both metrics stay visible in one place.
scoringMetrics = [f1_score_metric,
                  log_loss_score_metric]

# Classifiers to tune, keyed by a short label that must also be a key of `params`.
# Uncomment an entry to include it in the search.
estimators = {
    'RFC' : RandomForestClassifier(),
    #'NB' : GaussianNB(),
    #'KNN' : KNeighborsClassifier(),
    #'Gradient Booster' : GradientBoostingClassifier(learning_rate=0.2),
}

# Hyper-parameter grid per estimator label.
params = {
    'RFC' : {'criterion':['entropy'], 'n_estimators':[70,80,90],
             'min_samples_leaf':[3,5,7], 'min_samples_split':[5,7,9],
             'max_depth':[70,80,90], 'n_jobs':[-1], 'random_state':[42]},
    'NB' : {},
    #'KNN' : {'n_neighbors':[3,5,6,7], 'leaf_size':[1,2,3,5], 'n_jobs':[-1],
    #         'weights':['uniform', 'distance'], 'algorithm':['auto', 'ball_tree', 'kd_tree','brute']},
}

for key,estimator in estimators.items():
    print('Estimating using Classifier : ', key)

    # Preprocessing runs once on the full input and is cached in `cachedir`;
    # the grid search then cross-validates the classifier on the transformed
    # features.
    # NOTE(review): the imputers, scaler and SVD are therefore fitted on rows
    # that later serve as validation folds - confirm this is acceptable.
    EstimatorModel4 = Pipeline([
        ('ct', ColumnTransformer([
            ('num_ct', NumFeature_Pipeline, num_columnwoLatLngNyear),
            ('cat_ct', CatFeature_Pipeline, cat_columns),
            ('latlng_ct', 'passthrough', LatLng_Feature),
        ])),
        # A single scorer is used, so refit is simply True; the previous
        # refit='f1' named no existing scorer key and only worked because any
        # truthy value enables refitting in single-metric mode.
        ('gs', GridSearchCV(estimator, param_grid=params[key], cv=3, n_jobs=-1, verbose=1,
                            scoring=f1_score_metric, refit=True)),
    ], memory=cachedir)

    print("Fitting Model over Full Data")
    EstimatorModel4.fit(X, y)

    results.append({'estimator':key, 'results':EstimatorModel4.named_steps['gs'].cv_results_,
                    'Best_Params':EstimatorModel4.named_steps['gs'].best_params_})

    # Print a short summary instead of the whole `results` list, whose
    # cv_results_ arrays would flood the cell output on every iteration.
    print("Best CV score = ", EstimatorModel4.named_steps['gs'].best_score_)
    print("Best params = ", EstimatorModel4.named_steps['gs'].best_params_)

In [None]:
# Inspect the summary dict recorded for the first tuned estimator.
results[0]

In [None]:
# Delete the temporary pipeline cache directory created with mkdtemp().
rmtree(cachedir)

## Single Estimator Pipeline

In [8]:
# Final random-forest pipeline with fixed hyper-parameters: numeric,
# categorical and coordinate columns are each routed through their own branch,
# then fed to the forest.
EstimatorModel5 = Pipeline(steps=[
    (
        'ct',
        ColumnTransformer(transformers=[
            ('num_ct', NumFeature_Pipeline, num_columnwoLatLngNyear),
            ('cat_ct', CatFeature_Pipeline, cat_columns),
            ('latlng_ct', 'passthrough', LatLng_Feature),
        ]),
    ),
    (
        'rfc',
        RandomForestClassifier(
            n_estimators=90,
            criterion='entropy',
            max_depth=70,
            min_samples_split=7,
            min_samples_leaf=3,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

In [None]:
# Alternative random-forest configuration (shallower leaves, fewer trees,
# single-threaded) over the same three preprocessing branches.
EstimatorModel7 = Pipeline(steps=[
    (
        'ct',
        ColumnTransformer(transformers=[
            ('num_ct', NumFeature_Pipeline, num_columnwoLatLngNyear),
            ('cat_ct', CatFeature_Pipeline, cat_columns),
            ('latlng_ct', 'passthrough', LatLng_Feature),
        ]),
    ),
    (
        'rfc',
        RandomForestClassifier(
            n_estimators=80,
            criterion='entropy',
            max_depth=80,
            min_samples_split=7,
            min_samples_leaf=7,
            n_jobs=1,
            random_state=42,
        ),
    ),
])

In [9]:
# Fit the final pipeline on the complete labelled set (X, y), not just the training split.
EstimatorModel5.fit(X,y)

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_ct',
                                                  Pipeline(memory=None,
                                                           steps=[('si',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=0,
                                                                                 strategy='median',
                                                                           

In [10]:
# Write the submission CSV from the model currently held in EstimatorModel5.
writePred(EstimatorModel5)

In [None]:
# Refit on the training split only and write a second submission;
# fit() returns the pipeline itself, so it can be handed straight to writePred.
writePred(EstimatorModel5.fit(X_train, y_train))

In [None]:
# k-nearest-neighbours variant over the same three preprocessing branches:
# 7 neighbours, distance-weighted votes, brute-force search.
EstimatorModel6 = Pipeline(steps=[
    (
        'ct',
        ColumnTransformer(transformers=[
            ('num_ct', NumFeature_Pipeline, num_columnwoLatLngNyear),
            ('cat_ct', CatFeature_Pipeline, cat_columns),
            ('latlng_ct', 'passthrough', LatLng_Feature),
        ]),
    ),
    (
        'knn',
        KNeighborsClassifier(
            n_neighbors=7,
            weights='distance',
            algorithm='brute',
            leaf_size=1,
            n_jobs=-1,
        ),
    ),
])

In [None]:
# Fit the KNN pipeline on the complete labelled set and write its submission;
# fit() returns the pipeline itself.
writePred(EstimatorModel6.fit(X, y))

In [None]:
# Refit the KNN pipeline on the training split only and write another submission;
# fit() returns the pipeline itself.
writePred(EstimatorModel6.fit(X_train, y_train))