# MUSHROOMS DATASET PROJECT

In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the mushrooms dataset from the CSV file in the working directory.
data=pd.read_csv('mushrooms.csv')

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Number of rows and columns in the loaded frame.
data.shape

In [None]:
# All the variables are in string format, so the categorical values are
# converted to integers with a label encoder.
from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()

In [None]:
# Encode every column (features and target) as integer codes.
# A separate encoder is fitted per column and kept in `encoders`, so the
# original category labels can be recovered later with
# encoders[col].inverse_transform(...). Re-fitting one shared encoder would
# discard every mapping except the last column's.
encoders = {}
for col in data.columns:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

In [None]:
# Check the data after label encoding: the string categories have been
# replaced by the encoder's integer codes.
data.head()

In [None]:
# Split the target (y) from the features (x).
# The features are selected by dropping the 'class' column by name rather than
# by hard-coded column positions, so the split stays correct if the column
# order or the number of columns changes.
y = data['class']
x = data.drop(columns='class')

In [None]:
# Check the shape of the feature matrix.
x.shape

In [None]:
# Check the shape of the target vector.
y.shape

In [None]:
# Check the data: show the first rows of the feature matrix.
# head must be called; a bare `x.head` only displays the bound method.
x.head()

In [None]:
# Show the first values of the target.
# head must be called; a bare `y.head` only displays the bound method.
y.head()

In [None]:
# I want to use PCA on this data. First standardise the features with
# StandardScaler so that all columns are on a comparable scale before the
# components are computed.
# The import sits on its own line: when it was part of the comment above,
# StandardScaler was never imported and the next line raised NameError.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [None]:
# Print the standardised feature values.
print(x)

In [None]:
# Principal component analysis with no n_components given; this fit is used
# below to inspect how much variance the components explain.
from sklearn.decomposition import PCA
pca = PCA()
x_pca = pca.fit_transform(x)

In [None]:
# Plot the cumulative explained variance of the principal components.
# The x-axis is numbered from 1 so that a point's x position is the number of
# components kept; plotting against the default 0-based index makes every
# component count read one too low.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.figure(figsize=(16,11))
plt.plot(component_counts, cumulative_variance, 'ro-')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance ratio')
plt.grid()

In [None]:
# From the graph, keep 17 principal components.
new_pca = PCA(n_components=17)

In [None]:
# Fit the 17-component PCA on the standardised features and project them.
x_new = new_pca.fit_transform(x)

In [None]:
# Using KMeans to plot the clusters. We know that we have 2 classes of the
# target variable, so 2 clusters are requested.
# random_state is fixed so that the cluster ids (and therefore the colours in
# the plot below) are the same on every run; the seed matches the one used for
# the train/test split.
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=2, random_state=6)

In [None]:
# Fit KMeans on the PCA-reduced data. The per-row cluster index is returned
# here and is read back later through k_means.labels_.
k_means.fit_predict(x_new )

In [None]:
# Plot the clusters on the first two principal components, coloured by the
# KMeans cluster of each point.
# A single vectorised scatter call replaces a Python loop that issued one
# plt.scatter call for every row.
colors = ['r','g']
point_colors = [colors[label] for label in k_means.labels_]
plt.scatter(x_new[:, 0], x_new[:, 1], c=point_colors, s=10)
plt.show()

In [None]:
# 2 distinct clusters are created and the data points are far apart.
# Confirm the shape of the PCA-reduced data.
x_new.shape

In [None]:
#separate the train and test data
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the scaler and PCA above were fitted on the full dataset before
# this split, so the test rows influenced the transformation. Confirm whether
# that is acceptable for the reported scores.
x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size = 0.25, random_state = 6)

In [None]:
# Report the dimensions of each piece of the split: training features and
# target first, then test features and target.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

In [None]:
# First model: logistic regression trained on the training split, then used
# to predict the class of every test row.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(x_train, y_train)
lr_predict = lr.predict(x_test)

In [None]:
# Per-class probabilities for the test rows (one column per class).
lr_predict_prob = lr.predict_proba(x_test)

In [None]:
# Show the predicted classes, then the second column of the probability matrix.
print(lr_predict)
print(lr_predict_prob[:,1])

In [None]:
#import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Score the logistic regression predictions against the true test labels.
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_accuracy = accuracy_score(y_test, lr_predict)

In [None]:
# Logistic regression: confusion matrix, then accuracy.
print(lr_conf_matrix)
print(lr_accuracy)

In [None]:
# Let's use a decision tree to classify.
# Start with a maximum tree depth of 10 (max_depth limits the depth of the
# single tree; it is not a number of trees).
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=10)

In [None]:
# Train the tree on the training split, then get class predictions and
# per-class probabilities for the test split.
dt.fit(x_train,y_train)
dt_predict = dt.predict(x_test)
dt_predict_prob = dt.predict_proba(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Score the decision tree predictions against the true test labels.
dt_conf_matrix = confusion_matrix(y_test, dt_predict)
dt_accuracy_score = accuracy_score(y_test, dt_predict)

In [None]:
# Decision tree: confusion matrix, then accuracy.
print(dt_conf_matrix)
print(dt_accuracy_score)

In [None]:
# Using a random forest.
# NOTE(review): max_depth=10 limits how deep each tree may grow; it does not
# set the number of trees (that is n_estimators, left at its default here).
# Confirm whether 10 trees was the actual intent.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=10) #maximum depth of 10 per tree, not the number of trees
rf.fit(x_train, y_train)
rf_predict = rf.predict(x_test)
rf_predict_prob = rf.predict_proba(x_test)

In [None]:
# Score the random forest predictions against the true test labels.
rf_conf_matrix = confusion_matrix(y_test,rf_predict)
rf_accuracy_score = accuracy_score(y_test, rf_predict)

In [None]:
# Random forest: confusion matrix, then accuracy.
print(rf_conf_matrix)
print(rf_accuracy_score)
# The random forest has a higher accuracy score than the decision tree
# (figures recorded from an earlier run, in percent):
# Decision tree = 99.3
# Random forest = 99.9

In [None]:
# How would an unsupervised algorithm like MeanShift or DBSCAN work? Let's
# find out by fitting MeanShift with its default settings on the PCA-reduced
# data.
from sklearn.cluster import MeanShift
ms = MeanShift()
ms.fit(x_new)

In [None]:
# Print the per-row cluster labels and the cluster centers (I will be calling
# them centroids).
ms_labels = ms.labels_
ms_centroids = ms.cluster_centers_
print(ms_labels)
print(ms_centroids)

In [None]:
# np.unique returns each distinct label once, so its length is the number of
# clusters MeanShift found.
n_clusters = len(np.unique(ms_labels))
print(n_clusters)

In [None]:
#So unsupervised learning with MeanShift gives us 3 clusters! Interesting

In [None]:
# Let's plot the MeanShift clusters and see how different they are from our
# original KMeans clusters.
# The palette is indexed modulo its length, so the plot still works if
# MeanShift finds more clusters than there are colours; indexing the list
# directly raised IndexError for a fifth cluster. A single vectorised scatter
# call replaces one plt.scatter call per row.
plt.figure(figsize=(10,9))
colors = ['r','g','y','b']
point_colors = [colors[label % len(colors)] for label in ms_labels]
plt.scatter(x_new[:, 0], x_new[:, 1], c=point_colors, s=5)
# Cluster centers are drawn as black x's so they cannot be confused with the
# blue cluster colour.
plt.scatter(ms_centroids[:,0], ms_centroids[:,1], marker='x', c='k', s=80)
plt.show()
#Considerably different!