In [None]:
# Kaggle's Python 3 docker image (https://github.com/kaggle/docker-python)
# ships with the usual analytics libraries; load the ones this notebook needs.
import os
from warnings import simplefilter

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Maximizing the display: lift pandas' limits on columns, rows and line width.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Ignore all future warnings.
simplefilter(action='ignore', category=FutureWarning)

# Input data files live under /kaggle/input; walk the tree and print the full
# path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the workbook without treating any row as a header, so pandas assigns
# integer column labels (0, 1, 2, ...).
water_treatment = pd.read_excel(
    '/kaggle/input/water_treatment.xlsx',
    header=None,
)
water_treatment.head()

# PREPROCESSING

In [None]:
# Dropping unnecessary columns: remove the column labelled 0.
water_treatment = water_treatment.drop(columns=[0])
# DataFrame.shape is (rows, columns), so label the output in that order.
print ('Number of rows and columns',water_treatment.shape)
print ('Type of each column',water_treatment.dtypes)

# Missing Values

### There are many ways to handle missing values. 
### 1. Delete the whole row containing any missing or null values (marked here as '?'). The drawback is that you lose data.

### 2. Replace all the missing or null values with a number such as 0 or the max, min, std, or mean of that feature, or any other numeric value appropriate to the situation.

### In our case, we will use the mean (average) value of the feature. It would be inappropriate to use '0' as the value because most of the cells contain some data. We cannot be certain that the mean is the true value for that position, but it gives a reasonable approximation.

### As most of the columns are loaded as object dtype, we first replace '?' with 0. After that, we convert all the columns to float. Finally, we replace '0' with the mean of each feature (column).


In [None]:
# Replace the '?' placeholder with 0 so every column can be parsed as a number.
# Note: from here on a 0 marks a missing cell, so a genuine zero reading can no
# longer be told apart from a missing one.
water_treatment = water_treatment.replace('?', 0)
# Convert every column to float64 in one vectorized call (a row-wise
# apply(..., axis=1) gives the same result but loops over rows in Python).
water_treatment = water_treatment.astype(np.float64)
# Now we can have all the relevant statistics of each column
water_treatment.describe()

In [None]:
# Replace the 0 placeholders of each column with that column's average value.
# The zeros are masked as NaN first so they are excluded from the mean;
# averaging with the placeholders included would bias the imputed value
# towards zero.
water_treatment_observed = water_treatment.replace(0.0, np.nan)
water_treatment = (
    water_treatment_observed
    .fillna(water_treatment_observed.mean())
    # A column made up entirely of zeros has no observed mean; keep it at 0.
    .fillna(0.0)
)
water_treatment.head()

## Normalization

### Normalization usually means scaling a variable to have values between 0 and 1

In [None]:
from sklearn import preprocessing

# Min-max scale every column, then wrap the scaled array in a fresh DataFrame
# (which gets default integer row and column labels).
min_max_scaler = preprocessing.MinMaxScaler()
normalizing_data = water_treatment.values  # underlying NumPy array
normalized_data = min_max_scaler.fit_transform(normalizing_data)
water_treatment_normalized = pd.DataFrame(normalized_data)

In [None]:
# Preview the first rows of the min-max normalized data.
water_treatment_normalized.head()

# K-MEANS CLUSTERING

In [None]:
from sklearn.cluster import KMeans

# Cluster the normalized features into three groups. random_state pins the
# centroid initialisation so the labels (and everything derived from them
# further down) are the same on every run.
clusterNum = 3
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12, random_state = 0)
k_means.fit(water_treatment_normalized)
labels = k_means.labels_
print(labels)

In [None]:
# Attach each row's K-Means label to the (un-normalized) data as "Cluster".
water_treatment = water_treatment.assign(Cluster=labels)
water_treatment.head()

In [None]:
# Keep only the cluster assignment, as a one-column DataFrame.
output = water_treatment.loc[:, ["Cluster"]]
output.head()

In [None]:
# Write the cluster labels (with the row index) as a tab-separated file.
output.to_csv('kmean.txt', sep='\t')

In [None]:
# Feature matrix: the first 38 columns by position, which leaves out the
# "Cluster" column appended at the end. `.ix` was removed in pandas 1.0, so
# the positional selection is done with `.iloc`.
X = water_treatment.iloc[:, 0:38].values
# Target vector: the K-Means cluster label of each row.
y = water_treatment['Cluster'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier on the training split.
k = 3
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train, y_train)
neigh

In [None]:
from sklearn import metrics

# Predict the held-out rows, then report accuracy on both splits.
yhat = neigh.predict(X_test)
train_accuracy = metrics.accuracy_score(y_train, neigh.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, yhat)
print("Train set Accuracy: ", train_accuracy)
print("Test set Accuracy: ", test_accuracy)

In [None]:
# Sweep the neighbour count from 1 to Ks-1 and record, for each value, the
# test accuracy and the standard error of the per-sample hit/miss indicator.
Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)
ConfustionMx = []
for n in range(1, Ks):
    # Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)

    slot = n - 1  # arrays are 0-based, neighbour counts start at 1
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    std_acc[slot] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

mean_acc

In [None]:
import matplotlib.pyplot as plt

# Accuracy for each neighbour count, with a shaded band of one standard
# error on either side.
plt.plot(range(1,Ks),mean_acc,'g')
plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)
# The band drawn above is +/- 1 std, so the legend must say so.
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

In [None]:
# argmax is 0-based while the sweep started at k=1, hence the +1.
best_k = mean_acc.argmax() + 1
print( "The best accuracy was with", mean_acc.max(), "with k=", best_k)

# PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# First 38 columns by position are the features; "Cluster" is the label.
# `.ix` was removed in pandas 1.0, so the positional slice uses `.iloc`.
x = water_treatment.iloc[:, 0:38].values
y = water_treatment['Cluster'].values
# Standardizing the features
x = StandardScaler().fit_transform(x)
# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])


In [None]:
# Preview the first rows of the two-component PCA projection.
principalDf.head()

In [None]:
# Min-max scale both principal components and wrap the result in a new
# DataFrame (default integer labels).
min_max_scaler = preprocessing.MinMaxScaler()
normalizing_data = principalDf.values  # underlying NumPy array
normalized_data = min_max_scaler.fit_transform(normalizing_data)
principalDf_normalized = pd.DataFrame(normalized_data)

In [None]:
# Preview the first rows of the min-max normalized principal components.
principalDf_normalized.head()

In [None]:
# Cluster the normalized principal components into three groups.
# random_state pins the centroid initialisation so the labels are the same
# on every run.
clusterNum = 3
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12, random_state = 0)
k_means.fit(principalDf_normalized)
labels = k_means.labels_
print(labels)

In [None]:
# Attach each row's K-Means label to the PCA projection as "Cluster".
principalDf = principalDf.assign(Cluster=labels)
principalDf.head()

In [None]:
# Feature matrix: the first two columns by position (the two principal
# components). `.ix` was removed in pandas 1.0, so `.iloc` is used.
X = principalDf.iloc[:, 0:2].values
# Target vector: the K-Means cluster label of each row.
y = principalDf['Cluster'].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

# Train Model and Predict
k = 4
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train, y_train)
yhat = neigh.predict(X_test)
yhat[0:5]

In [None]:
from sklearn import metrics

# Report accuracy on both splits for the classifier fitted above.
train_accuracy = metrics.accuracy_score(y_train, neigh.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, yhat)
print("Train set Accuracy: ", train_accuracy)
print("Test set Accuracy: ", test_accuracy)

In [None]:
# Sweep the neighbour count from 1 to Ks-1 and record, for each value, the
# test accuracy and the standard error of the per-sample hit/miss indicator.
Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)
ConfustionMx = []
for n in range(1, Ks):
    # Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)

    slot = n - 1  # arrays are 0-based, neighbour counts start at 1
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    std_acc[slot] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

mean_acc

In [None]:
import matplotlib.pyplot as plt

# Accuracy for each neighbour count, with a shaded band of one standard
# error on either side.
plt.plot(range(1,Ks),mean_acc,'g')
plt.fill_between(range(1,Ks),mean_acc - 1 * std_acc,mean_acc + 1 * std_acc, alpha=0.10)
# The band drawn above is +/- 1 std, so the legend must say so.
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

In [None]:
# argmax is 0-based while the sweep started at k=1, hence the +1.
best_k = mean_acc.argmax() + 1
print( "The best accuracy was with", mean_acc.max(), "with k=", best_k)

## For k=1, PCA

In [None]:
# Re-run K-Means on the normalized principal components with one cluster.
# NOTE(review): with n_clusters=1 every row is assigned label 0 - confirm that
# a single cluster (rather than the KNN neighbour count) is what is intended.
clusterNum = 1
k_means = KMeans(
    init="k-means++",
    n_clusters=clusterNum,
    n_init=12,
)
k_means.fit(principalDf_normalized)
labels = k_means.labels_
print(labels)

In [None]:
# Store the latest labels in the "Cluster" column of the PCA frame.
principalDf = principalDf.assign(Cluster=labels)
principalDf.head()

In [None]:
# Keep only the cluster assignment, as a one-column DataFrame.
output = principalDf.loc[:, ['Cluster']]
output.head()

In [None]:
# Write the cluster labels (with the row index) as a tab-separated file.
output.to_csv('kmean_PCA.txt', sep='\t')

# Autoencoder

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.models import Model
# Current Keras only exposes the capitalised optimizer classes; the lowercase
# `adam` / `sgd` names are no longer importable. Alias the classes so both
# lowercase names stay available.
from keras.optimizers import Adam as adam, SGD as sgd
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: the first 38 columns by position, which leaves out the
# "Cluster" column appended at the end. `.ix` was removed in pandas 1.0, so
# the positional selection is done with `.iloc`.
X = water_treatment.iloc[:, 0:38].values
# Cluster label of each row; passed through the split alongside X.
Y = water_treatment['Cluster'].values

In [None]:
# Fit a min-max scaler on the feature matrix, then rescale it with that scaler.
min_max_scaler = preprocessing.MinMaxScaler().fit(X)
X = min_max_scaler.transform(X)


In [None]:
# Fix the seed so the split below is reproducible; the same value seeds
# NumPy's global RNG and the train/test shuffle.
seed = 20
np.random.seed(seed)

# 80% of the rows go to training, the remaining 20% to testing.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=seed,
)

In [None]:
# reduce to 2 features
encoding_dim = 2
# Take the layer width from the training data instead of hard-coding it, so
# the model keeps matching the input if the number of feature columns changes.
n_features = x_train.shape[1]

input_df = Input(shape=(n_features,))
encoded = Dense(encoding_dim, activation='relu')(input_df)
decoded = Dense(n_features, activation='sigmoid')(encoded)

# full autoencoder: input -> bottleneck -> reconstruction
autoencoder = Model(input_df, decoded)

# encoder only: input -> bottleneck (intermediate result)
encoder = Model(input_df, encoded)

# NOTE(review): 'accuracy' is tracked because a later cell plots
# history['accuracy']; for a reconstruction task its meaning is doubtful -
# confirm whether it should be kept.
autoencoder.compile(optimizer='adadelta', loss='MAE',metrics=['accuracy']) # adadelta

# Train the network to reproduce its own input; the held-out split is used
# for validation only.
history=autoencoder.fit(x_train, x_train,
                epochs=100,
                batch_size=512,
                shuffle=True,
                validation_data=(x_test, x_test))

In [None]:
import matplotlib.pyplot as plt

# list all data in history
print(history.history.keys())

# One figure per tracked quantity: accuracy first, then loss. Each shows the
# training curve followed by the validation curve.
for quantity in ('accuracy', 'loss'):
    plt.plot(history.history[quantity])
    plt.plot(history.history['val_' + quantity])
    plt.title('model ' + quantity)
    plt.ylabel(quantity)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
#np.savetxt(r's.txt', output.values, fmt='%d',delimiter="\t")

In [None]:
#np.savetxt('xgboost.txt', output.values, fmt='%d', delimiter="\t", header=None)  

In [None]:
#np.savetxt('a.txt', output.values, fmt='%d')