In [9]:
!pip3 install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
# Numerical / tabular stack
import numpy as np
import pandas as pd

# Plotting stack
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn pieces used by the standardization and PCA sections
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Global plot appearance: seaborn theme first, then the default figure size
# (the theme call touches rcParams, so the size override must come after it).
sns.set_theme(style="darkgrid")
plt.rcParams['figure.figsize'] = (7,5)

In [11]:
# Record the library versions this notebook was executed with.
print("Pandas version: ", pd.__version__)
print("Seaborn version: ", sns.__version__)

Pandas version:  1.5.3
Seaborn version:  0.12.2


Dataset

In [12]:
# Location of the SaYoPillow CSV (hosted on GitHub, fetched over HTTPS).
DATA_URL = 'https://raw.githubusercontent.com/i-sumitkumar/Concordia/main/SaYoPillow.csv'

# Load the file into a DataFrame and preview up to the first 25 rows.
df = pd.read_csv(DATA_URL)
df.head(25)

Unnamed: 0,SR,RR,T,LM,BO,REM,SR.1,HR,SL
0,93.8,25.68,91.84,16.6,89.84,99.6,1.84,74.2,3
1,91.64,25.104,91.552,15.88,89.552,98.88,1.552,72.76,3
2,60.0,20.0,96.0,10.0,95.0,85.0,7.0,60.0,1
3,85.76,23.536,90.768,13.92,88.768,96.92,0.768,68.84,3
4,48.12,17.248,97.872,6.496,96.248,72.48,8.248,53.12,0
5,56.88,19.376,95.376,9.376,94.064,83.44,6.376,58.44,1
6,47.0,16.8,97.2,5.6,95.8,68.0,7.8,52.0,0
7,50.0,18.0,99.0,8.0,97.0,80.0,9.0,55.0,0
8,45.28,16.112,96.168,4.224,95.112,61.12,7.112,50.28,0
9,55.52,19.104,95.104,9.104,93.656,82.76,6.104,57.76,1


In [13]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630 entries, 0 to 629
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SR      630 non-null    float64
 1   RR      630 non-null    float64
 2   T       630 non-null    float64
 3   LM      630 non-null    float64
 4   BO      630 non-null    float64
 5   REM     630 non-null    float64
 6   SR.1    630 non-null    float64
 7   HR      630 non-null    float64
 8   SL      630 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 44.4 KB


In [14]:
# Basic integrity checks: fully duplicated rows and rows holding at least one NaN.
n_duplicated_rows = df.duplicated().sum()
n_rows_with_nan = df.isna().any(axis=1).sum()

print("Number of duplicated rows is: ", n_duplicated_rows)
print("Number of rows with NaNs is: ", n_rows_with_nan)

Number of duplicated rows is:  0
Number of rows with NaNs is:  0


Exploratory Data Analysis

In [None]:
# Pairwise scatter plots of all columns, coloured by the 'SL' column.
sns.pairplot(df, hue='SL')
plt.show()

In [None]:
# Target vector: the 'SL' column of the raw frame.
y = df['SL']

# Pie chart of how many observations fall in each 'SL' value.
class_counts = y.value_counts()
class_counts.plot(kind='pie')
plt.ylabel('')  # drop the automatic axis label
plt.show()

Data Matrix

In [None]:
# Data matrix: every column of df except the target 'SL'.
X = df.drop(columns=['SL'])
X.head(10)

In [None]:
# Summary statistics of the data matrix, one row per variable.
X.describe().transpose()

In [None]:
# Standardize the data matrix with StandardScaler and rebuild a DataFrame that
# carries the original column names (fit_transform returns a bare array).
Xcols = X.columns
Xs = StandardScaler().fit_transform(X)
X = pd.DataFrame(Xs, columns=Xcols)
X.head(10)

In [None]:
# Summary statistics again, now on the standardized data matrix.
X.describe().transpose()

Observations and variables

In [None]:
# Row labels (observations) and column labels (variables) of the data matrix,
# kept as plain Python lists for use as plot annotations later on.
observations = X.index.tolist()
variables = X.columns.tolist()

Box and Whisker Plots

In [None]:
# One vertical box per column of df, drawn on a fresh figure.
plt.figure()
ax = sns.boxplot(data=df, orient="v", palette="Set2")
# Tilt the column names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

In [None]:
# Same box plots, with the individual data points overlaid via stripplot()
# (swarmplot() is the alternative for showing points on top of the boxes).
plt.figure()
sns.boxplot(data=df, orient="v", palette="Set2")
ax = sns.stripplot(data=df, color=".25")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

In [None]:
# Summary statistics of the raw frame (target column included).
df.describe()

Correlation Matrix

In [None]:
# Pairwise correlations between the columns of the standardized data matrix.
corr_matrix = X.corr()

# Annotated heat map with the column names shown along the top edge.
ax = sns.heatmap(corr_matrix, cmap='RdYlGn_r', linewidths=0.5, annot=True, cbar=False, square=True)
ax.tick_params(labelbottom=False, labeltop=True)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
plt.yticks(rotation=0);

# **Principal Component Analysis (PCA)**

In [None]:
# Fit PCA on the standardized data matrix and project the observations;
# Z has one row per observation and one column per principal component.
pca = PCA()
Z = pca.fit_transform(X)

# Row positions of the observations in each stress level (SL = 0, 1, 2, 3, 4).
# Taking element [0] of np.where's tuple gives flat index arrays, so that
# Z[idx, k] is an ordinary 1-D selection.
idx0 = np.where(y == 0)[0]
idx1 = np.where(y == 1)[0]
idx2 = np.where(y == 2)[0]
idx3 = np.where(y == 3)[0]
idx4 = np.where(y == 4)[0]

# Scores on the first two components, one colour per stress level.
plt.figure()
level_styles = (
    (idx0, 'r'),
    (idx1, 'g'),
    (idx2, 'b'),
    (idx3, 'hotpink'),
    (idx4, 'purple'),
)
for level, (idx, colour) in enumerate(level_styles):
    plt.scatter(Z[idx, 0], Z[idx, 1], c=colour, label=f'Stress level {level} ')

plt.legend()
plt.xlabel('Z1')
plt.ylabel('Z2');
     

**Eigenvectors**

In [None]:
# Loading matrix: transposing components_ gives one row per variable and one
# column per principal component.
A = pca.components_.T

# Position of every variable in the plane of the first two eigenvectors.
plt.scatter(A[:, 0], A[:, 1], c='r')
plt.xlabel('A1')
plt.ylabel('A2')
# The loop names are deliberately not `x` / `y`: `y` holds the stress-level
# labels and must not be overwritten by a plotting loop.
for label, a1, a2 in zip(variables, A[:, 0], A[:, 1]):
    plt.annotate(label, xy=(a1, a2), xytext=(-2, 2), textcoords='offset points', ha='right', va='bottom')
print(A)

In [None]:
# Loadings on the first two components; colour encodes the loading on the third
# component and marker area the magnitude of the loading on the fourth.
# Marker areas must be non-negative, so the absolute value is used: a negative
# loading would otherwise produce an invalid size and the point would vanish.
marker_sizes = np.abs(A[:, 3]) * 500
plt.scatter(A[:, 0], A[:, 1], marker='o', c=A[:, 2], s=marker_sizes, cmap=plt.get_cmap('Spectral'))
plt.xlabel('A1')
plt.ylabel('A2')
# Loop names avoid `x` / `y` so the label vector `y` is not overwritten.
for label, a1, a2 in zip(variables, A[:, 0], A[:, 1]):
    plt.annotate(label, xy=(a1, a2), xytext=(-20, 20),
                 textcoords='offset points', ha='right', va='bottom',
                 bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

Scree Plot

In [None]:
# Eigenvalues: the variance explained by each principal component.
Lambda = pca.explained_variance_

# Scree plot: each eigenvalue as a share of their total, against the
# 1-based component number.
x = np.arange(1, len(Lambda) + 1)
variance_share = Lambda / sum(Lambda)

plt.figure()
plt.plot(x, variance_share, 'ro-', lw=3)
plt.xticks(x, [str(i) for i in x], rotation=0)
plt.xlabel('Number of components')
plt.ylabel('Explained variance')
plt.show()

print(Lambda)

In [None]:
# Explained-variance ratio per component (bars) and its running total (line).
ell = pca.explained_variance_ratio_

# Component numbers start at 1 so the axis really reads "number of components"
# (0-based positions would place the first component at 0).
ind = np.arange(1, len(ell) + 1)

plt.figure()
plt.bar(ind, ell, align='center', alpha=0.5, label='Individual')
plt.plot(ind, np.cumsum(ell), label='Cumulative')
plt.xticks(ind)
plt.xlabel('Number of components')
plt.ylabel('Explained variance ratio')
plt.legend();

**BIPLOT**

In [None]:
# First two loading vectors (variables) and first two score columns (observations).
A1, A2 = A[:, 0], A[:, 1]
Z1, Z2 = Z[:, 0], Z[:, 1]

plt.figure()
plt.xlabel('Z1')
plt.ylabel('Z2')

# Each variable is drawn as an arrow from the origin; its loadings are scaled
# by the largest score on each axis, and the name is placed 20% beyond the tip.
z1_max, z2_max = max(Z1), max(Z2)
for name, a1, a2 in zip(variables, A1, A2):
    tip_x, tip_y = a1 * z1_max, a2 * z2_max
    plt.arrow(0, 0, tip_x, tip_y, color='k', width=0.0005, head_width=0.0025)
    plt.text(tip_x * 1.2, tip_y * 1.2, name, color='k')

# Observations in the same plane, one colour per stress level.
score_groups = (
    (idx0, 'r', 'SL0 '),
    (idx1, 'g', 'SL1 '),
    (idx2, 'b', 'SL2 '),
    (idx3, 'hotpink', 'SL3 '),
    (idx4, 'purple', 'SL4 '),
)
for idx, colour, tag in score_groups:
    plt.scatter(Z[idx,0], Z[idx,1], c=colour, label=tag)

plt.legend(loc='upper left')

# **Using PCA Library**

In [None]:
!pip install pca
     

In [None]:
# Import the library's `pca` class under an alias: the bare name `pca` is
# already bound to the fitted scikit-learn PCA object used by the earlier
# sections, and importing over it would break those cells on re-run.
from pca import pca as pca_lib

# Initialize and keep all PCs
model = pca_lib()
# Fit transform; `out` is the result container indexed by key in later cells
out = model.fit_transform(X)

Principal Components

In [None]:
# Display the 'PC' entry of the fit_transform result.
out['PC']

In [None]:
# Scatter plot produced by the pca-library model, with labels on and legend off.
model.scatter(label=True, legend=False)

**Eigenvectors**

In [None]:
# Transposed 'loadings' entry of the fit_transform result.
# NOTE(review): this rebinds `A`, which the earlier sections use for the
# scikit-learn loading matrix (pca.components_.T) — re-running those cells
# after this one will pick up this object instead.
A = out['loadings'].T

In [None]:
# Variables in the plane of the first two components, according to the
# pca-library loadings table.
sns.scatterplot(data=A, x="PC1", y="PC2")
plt.xlabel('')
plt.ylabel('')
# Rows are addressed by position with .iloc: integer keys through [] on a
# label-indexed Series rely on a positional fallback that newer pandas
# versions deprecate.
for i in range(A.shape[0]):
    plt.text(x=A["PC1"].iloc[i] + 0.02, y=A["PC2"].iloc[i] + 0.02, s=variables[i],
             fontdict=dict(color='red', size=10),
             bbox=dict(facecolor='yellow', alpha=0.5))

**Scree Plot**

In [None]:
# Scree plot from the pca-library result: the 'variance_ratio' entry against
# the 1-based component number.
VR = out['variance_ratio']
x = np.arange(1, len(VR) + 1)

plt.plot(x, VR, 'ro-', lw=3)
plt.xticks(x, [str(i) for i in x], rotation=0)
plt.xlabel('Number of components')
plt.ylabel('Explained variance')
plt.show()

Explained Variance Plot

In [None]:
# Explained-variance plot drawn by the pca-library model; the trailing `;`
# suppresses the cell's return-value repr.
model.plot();

Biplot

In [None]:
# Biplot drawn by the pca-library model, without point labels or legend.
model.biplot(label=False, legend=False)

# Classification
Using PyCaret

In [None]:
# Hold out 10% of the rows as unseen data; the remaining 90% (a seeded random
# sample) is what the modelling experiments work on.
data = df.sample(frac=0.9, random_state=786)

# The hold-out set must be derived before `data` loses its original index.
data_unseen = df.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

**Setting up the Environment in PyCaret**

In [None]:
# Import exactly the PyCaret functions this notebook calls rather than `*`, so
# the origin of every name is visible and nothing already defined in the
# notebook is silently overwritten. Extend the list if more PyCaret functions
# are needed.
from pycaret.classification import (
    compare_models,
    create_model,
    evaluate_model,
    interpret_model,
    setup,
    tune_model,
)

# Initialise the experiment on `data` with 'SL' as the target, a 70% training
# share and a fixed session_id.
clf = setup(data=data, target='SL', train_size=0.7, session_id=123)

**Comparing All Models**

In [None]:
# Run PyCaret's model comparison and keep what it returns as `best_model`.
best_model = compare_models()

In [None]:
# Display the estimator returned by compare_models().
best_model

**Create Decision Tree Classifier**

In [None]:
# 'dt' is PyCaret's ID for the Decision Tree classifier, which is what this
# section and the variable name refer to ('lr' would fit a logistic
# regression instead).
dt = create_model('dt')

In [None]:
# The trained model object returned by create_model is held in `dt`;
# evaluating the bare name displays it.
dt

**Tune Decision Tree Model**

In [None]:
# Tune the model held in `dt` with tune_model's default settings.
tuned_dt = tune_model(dt)

In [None]:
# The tuned model object returned by tune_model is held in `tuned_dt`;
# evaluating the bare name displays it.
tuned_dt

**Evaluate Decision Tree Model**

In [None]:
!pip install statsmodels --upgrade

In [None]:
# Inspect the tuned model in `tuned_dt` with PyCaret's evaluate_model.
evaluate_model(tuned_dt)

**Create K Neighbors Model**

In [None]:
# Train the model registered under the ID 'knn' (K Neighbors, per the section heading).
knn = create_model('knn')

**Tune K Neighbors Model**

In [None]:
# Search n_neighbors over 1..49. The grid starts at 1 because a K Neighbors
# model needs at least one neighbour; a grid starting at 0 lets the tuner draw
# an invalid configuration.
tuned_knn = tune_model(knn, custom_grid={'n_neighbors': np.arange(1, 50)})

**Evaluate K Neighbors Model**

In [None]:
# Inspect the tuned K Neighbors model with PyCaret's evaluate_model.
evaluate_model(tuned_knn)

**Create Logistic Regression Model**

In [None]:
# Train the model registered under the ID 'lr' (Logistic Regression, per the section heading).
lr = create_model('lr')

**Tune Logistic Regression Model**

In [None]:
# Tune the model held in `lr` with tune_model's default settings.
tuned_lr = tune_model(lr)

**Evaluate Logistic Regression Model**

In [None]:
# Inspect the tuned model in `tuned_lr` with PyCaret's evaluate_model.
evaluate_model(tuned_lr)

**Create Random Forest Model**

In [None]:
# Train the model registered under the ID 'rf' (Random Forest, per the section heading).
rf = create_model('rf')

**Tune Random Forest Model**

In [None]:
# Tune the model held in `rf` with tune_model's default settings.
tuned_rf = tune_model(rf)

**Evaluate Random Forest Model**

In [None]:
# Inspect the tuned model in `tuned_rf` with PyCaret's evaluate_model.
evaluate_model(tuned_rf)

# **Tune the Best Model**

In [None]:
# Tune hyperparameters with scikit-learn (default)
tuned_best_model = tune_model(best_model)

In [None]:
# Display the tuned version of the best model.
tuned_best_model

**Evaluate the Best Model**

In [None]:
# Inspect the tuned best model with PyCaret's evaluate_model.
evaluate_model(tuned_best_model)

# **Classification + PCA**

In [None]:
# Second experiment on the same data and seed as the first, this time with
# normalization switched on and PCA reducing the features to 3 components.
clf_pca = setup(
    data=data,
    target='SL',
    train_size=0.7,
    session_id=123,
    normalize=True,
    pca=True,
    pca_components=3,
)

In [None]:
#show the best model and their statistics
best_model_pca = compare_models()

In [None]:
# Display the estimator returned by the second compare_models() run.
best_model_pca

**Tune the Best Model**

In [None]:
# Tune hyperparameters with scikit-learn (default)
tuned_best_model_pca = tune_model(best_model_pca)

In [None]:
# Display the tuned version of `best_model_pca`.
tuned_best_model_pca

**Evaluate the Best Model**

In [None]:
# Inspect the tuned model in `tuned_best_model_pca` with PyCaret's evaluate_model.
evaluate_model(tuned_best_model_pca)

# **Create Gradient Boosting Classifier**

In [None]:
# Train the model registered under the ID 'gbc' (Gradient Boosting Classifier,
# per the section heading).
gbc_pca = create_model('gbc')

In [None]:
# Tune the model held in `gbc_pca` with tune_model's default settings.
tuned_gbc_pca = tune_model(gbc_pca)

In [None]:
# Display the tuned gradient boosting model.
tuned_gbc_pca

In [None]:
# Inspect the tuned gradient boosting model with PyCaret's evaluate_model.
evaluate_model(tuned_gbc_pca)

# **Linear Discriminant Analysis**

In [None]:
# Train the model registered under the ID 'lda' (Linear Discriminant Analysis,
# per the section heading).
lda_pca = create_model('lda')

In [None]:
# Tune the model held in `lda_pca` with tune_model's default settings.
tuned_lda_pca = tune_model(lda_pca)

In [None]:
# Display the tuned LDA model.
tuned_lda_pca

In [None]:
# Inspect the tuned LDA model with PyCaret's evaluate_model.
evaluate_model(tuned_lda_pca)

# **Explainable AI with Shapley values**

In [None]:
!pip3 install shap

In [None]:
# shap has to be imported before its version can be read; doing it in this
# cell keeps the notebook runnable top-to-bottom on a fresh kernel.
import shap
print(shap.__version__)

In [None]:
import shap


In [None]:
# Train the model registered under the ID 'et'.
# NOTE(review): the variable is named `rf_pca`, but the model ID is 'et', not
# 'rf' — confirm which estimator was intended.
rf_pca = create_model('et')

In [None]:
# Tune the model held in `rf_pca` with tune_model's default settings.
tuned_rf_pca = tune_model(rf_pca)

**SHAP Summary Plot**

In [None]:
!pip3 uninstall shap

In [None]:
# Summary plot for the tuned model via PyCaret's interpret_model.
interpret_model(tuned_rf_pca, plot='summary')

**Visualize a single prediction**

In [None]:
# 'reason' plot for a single observation (the one passed as observation=24).
interpret_model(tuned_rf_pca, plot='reason', observation=24)

**Visualize many predictions**

In [None]:
# 'reason' plot without a specific observation, covering many predictions at once.
interpret_model(tuned_rf_pca, plot='reason')