# PCA MODEL

In [None]:
#####Links Used:
#https://www.datacamp.com/community/tutorials/principal-component-analysis-in-python
#https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
#https://www.datasklr.com/principal-component-analysis-and-factor-analysis/principal-component-analysis
#https://cmdlinetips.com/2018/03/pca-example-in-python-with-scikit-learn/
#https://stackoverflow.com/questions/39216897/plot-pca-loadings-and-loading-in-biplot-in-sklearn-like-rs-autoplot

# Reference for the train/test split step: https://stackabuse.com/implementing-pca-in-python-with-scikit-learn/

# LOADING DATA

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the publisher analysis measurements from CSV and preview the first rows.
DATA_CSV = "publisher_analysis_data.csv"
initData = pd.read_csv(DATA_CSV)
initData.head(5)

Unnamed: 0.1,Unnamed: 0,maxLoad,stressMaxLoad,strainMaxLoad,energyAbsorp,youngsMod,ph,sec_Mn,sec_Mw,sec_polyDisp,sec_calcMass,sec_massRec,year,publisher
0,0,34.67,28.779,0.051118,0.07234,2274.6,5.065,44.1,127.6,2.89,22.45,71.9,1840-01-01,sumptibus Societatis Camdenensis
1,1,12.71,9.7289,0.03005,0.023396,1416.2,4.83,35.9,85.4,2.377,10.53,23.4,1840-01-01,"Printed for the Camden Society, by J.B. Nichol..."
2,2,17.301,13.585,0.034793,0.034683,2083.7,5.207,20.1,93.4,4.653,20.54,65.8,1842-01-01,"Printed for the Camden Society, by J.B. Nichol..."
3,3,25.879,21.32,0.044734,0.057662,2015.6,5.165,29.5,91.2,3.094,6.84,21.9,1843-01-01,"Printed for the Camden Society, by J.B. Nichol..."
4,4,14.901,16.132,0.047622,0.036076,1798.6,6.405,76.8,161.6,2.105,7.92,25.4,1844-01-01,J. Winchester


In [4]:
# (rows, columns) of the frame as loaded from the CSV.
initData.shape

(957, 14)

In [5]:
# Remove the columns that are not used as model inputs, then turn the ISO
# date strings in `year` (e.g. "1840-01-01") into integer years.  Without the
# conversion `year` stays a string column and StandardScaler fails with
# "could not convert string to float" when the pipeline is fitted later.
# NOTE(review): assumes `year` is meant to stay as a feature — drop it here
# instead if it is not.
initData = (
    initData
    .drop(columns=['youngsMod', 'Unnamed: 0', 'sec_calcMass', 'sec_massRec', 'publisher'])
    .assign(year=lambda frame: pd.to_datetime(frame['year']).dt.year)
)

In [6]:
# Discard every row that has at least one missing value, then report the
# resulting (rows, columns).
initData = initData.dropna(axis=0, how='any')
initData.shape

(957, 9)

# BUILDING PCA MODEL

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# PCA estimator that keeps the first three principal components.
pca = PCA(n_components=3)

In [None]:
# Keep the column labels so they can be re-attached after the pipeline
# returns a plain array.
initData_columnNames= initData.columns
initData_columnNames

In [None]:
# Preprocessing chain: a StandardScaler step named 'scaling' followed by a
# Normalizer step named 'normalize'.
preprocessing_steps = [
    ('scaling', StandardScaler()),
    ('normalize', Normalizer()),
]
pipeline = Pipeline(steps=preprocessing_steps)

In [None]:
# Fit the scaling/normalising pipeline on the cleaned frame and transform it.
scaleNormPipe = pipeline.fit_transform(initData)

In [None]:
# Wrap the transformed array in a DataFrame that reuses the original column
# labels.  Despite the name, these are preprocessed values, not principal
# components.
pca_df = pd.DataFrame(scaleNormPipe, columns=initData_columnNames)
pca_df.head()

## TRAIN, TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split
# "target" holds the name of the column to predict.  sec_Mw is a continuous
# measurement, so it is a regression target rather than a set of class labels.
target = "sec_Mw"

# `columns=` replaces the positional axis argument of `drop(target, 1)`,
# which pandas deprecated in 1.3 and rejects with a TypeError from 2.0 on.
# NOTE(review): pca_df was produced by fitting the pipeline on every column,
# the target included, so X already carries information derived from y —
# consider splitting the untransformed frame instead.
X = pca_df.drop(columns=target)
y = pca_df[target]

seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [None]:
# Inspect the target values.
y

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score

# Train, test splits
# The target (sec_Mw) is continuous, so the split is not stratified:
# `stratify=y` needs class labels and raises when nearly every value of y is
# unique.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)

# Decision tree with depth = 2
# A regressor is used because DecisionTreeClassifier rejects continuous
# targets ("Unknown label type"), and comparing argmax(predict_proba) with
# the raw target values would not measure accuracy anyway.
clf = DecisionTreeRegressor(max_depth=2, random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
# R^2 on the held-out split: 1.0 is a perfect fit, 0.0 matches predicting the mean.
print('R^2: {:.5f}'.format(r2_score(y_test, preds)))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='white')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from sklearn import decomposition
from sklearn import datasets
from mpl_toolkits.mplot3d import Axes3D

# Loading the dataset
# The demo data lives under iris_* names so that the notebook's own X / y
# (the sec_Mw features and target) are not overwritten by this cell.
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target

# Let's create a beautiful 3d-plot
fig = plt.figure(1, figsize=(6, 5))
plt.clf()
# add_axes(projection='3d') creates the 3D axes and attaches them to the
# figure; constructing Axes3D(fig, ...) directly no longer registers the axes
# on recent Matplotlib releases, which leaves the figure empty.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

# Write each species name above the centroid of its points.
for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    members = iris_X[iris_y == label]
    ax.text3D(members[:, 0].mean(),
              members[:, 1].mean() + 1.5,
              members[:, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# Change the order of labels, so that they match
# (the builtin float replaces np.float, which NumPy removed in 1.24)
y_clr = np.choose(iris_y, [1, 2, 0]).astype(float)
ax.scatter(iris_X[:, 0], iris_X[:, 1], iris_X[:, 2], c=y_clr,
           cmap=plt.cm.nipy_spectral)

# ax.xaxis / yaxis / zaxis replace the w_xaxis-style aliases, which were
# deprecated and later removed from Matplotlib.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([]);

In [None]:
# Inspect the current value of y.
y

In [None]:
# Fit the scaling/normalising pipeline on the training features only, then
# apply the fitted transform to the test features.
# NOTE(review): X_train is derived from pca_df, which already went through
# this pipeline, so the features are scaled and normalised twice — confirm
# that this is intended.
pca_train = pipeline.fit_transform(X_train)
pca_test = pipeline.transform(X_test)

In [None]:
# Fit the PCA on the preprocessed training data and project both splits.
# The names are rebound from preprocessed features to PCA scores, so running
# this cell a second time on its own would feed the scores back into PCA.
pca_train = pca.fit_transform(pca_train)
pca_test = pca.transform(pca_test)

In [None]:
# Share of the total variance attributed to each retained component.
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Label the projected train/test arrays with one column per principal component.
pc_labels = ['PC1', 'PC2', 'PC3']
pca_train_df = pd.DataFrame(pca_train, columns=pc_labels)
pca_test_df = pd.DataFrame(pca_test, columns=list(pc_labels))


In [None]:
# pca_train_df carries a fresh 0..n-1 index, while y_train still has the
# shuffled row labels produced by train_test_split.  concat(axis=1) aligns on
# index labels, so the target's index is reset first; otherwise rows are
# paired with the wrong target values and NaN-padded rows appear.
target = pd.Series(y_train, name='sec_Mw').reset_index(drop=True)
result_df_train = pd.concat([pca_train_df, target], axis=1)

In [None]:
# Preview the first rows of the training components joined with the target.
result_df_train.head()

In [None]:
# Pair the test-set components with the test-set target (y_test), not the
# full `y`.  The target's index is reset so that concat(axis=1) lines the
# rows up by position with pca_test_df's 0..n-1 index.
result_df_test = pd.concat(
    [pca_test_df, pd.Series(y_test, name='sec_Mw').reset_index(drop=True)],
    axis=1,
)

## SCATTER PLOT

In [None]:
# Scatter of the first two principal components of the training split,
# coloured by the sec_Mw value stored alongside them.  sec_Mw is continuous,
# so a colour scale is used; the earlier per-class loop compared it with the
# iris class ids 0/1/2 and built its mask from pca_df, whose index and length
# do not match result_df_train.
fig = plt.figure(figsize = (12,10))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('First Principal Component ', fontsize = 15)
ax.set_ylabel('Second Principal Component ', fontsize = 15)
ax.set_title('Principal Component Analysis (2PCs) for Publisher Dataset', fontsize = 20)

points = ax.scatter(result_df_train['PC1'],
                    result_df_train['PC2'],
                    c = result_df_train['sec_Mw'],
                    cmap = 'viridis',
                    s = 50)
fig.colorbar(points, ax=ax, label='sec_Mw')
ax.grid()

In [None]:
# Display the training components together with the target column.
result_df_train

In [None]:
# Scatter the entries of the first fitted component vector against the
# entries of the second one.
first_axis, second_axis = pca.components_[:2]
plt.scatter(first_axis, second_axis, color='black', alpha=.1)
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')

In [None]:
# Scatter the entries of the second fitted component vector against the
# entries of the third one.
second_axis, third_axis = pca.components_[1:3]
plt.scatter(second_axis, third_axis, color='black', alpha=.1)
plt.xlabel('PCA 2')
plt.ylabel('PCA 3')

In [None]:
def labelled_scatterplot(data=None,x=None,y=None,labs=None):
    """Draw a seaborn scatter plot and write a text label next to every point.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    x, y : str
        Names of the columns used for the horizontal and vertical axes.
    labs : str, optional
        Name of the column whose values become the label text.  When omitted,
        each point is labelled with its row index label.

    Returns
    -------
    The axes object returned by ``sns.scatterplot``.
    """
    p1 = sns.scatterplot(data=data,x=x,y=y)
    label_values = data.index if labs is None else data[labs]
    # Iterate over the values themselves rather than data[x][i]: the latter
    # looks rows up by index label and fails when the index is not 0..n-1.
    for x_val, y_val, label in zip(data[x], data[y], label_values):
        # Axes.text requires the label string as its third argument; it was
        # missing before, so every call raised a TypeError.
        p1.text(x_val+0.01, y_val, str(label),
                horizontalalignment='left',
                size='medium', color='black')
    return p1

In [None]:
# pca_train_df's columns are named 'PC1' / 'PC2' (no space); the spaced
# 'PC 1' / 'PC 2' names belong to scree_df and are not found here.
labelled_scatterplot(data=pca_train_df, x='PC1',y='PC2')

# Scree code

In [None]:
# Re-fit the preprocessing pipeline on the full cleaned frame (this replaces
# whatever the shared `pipeline` object was fitted on before) and create a
# separate 5-component PCA for the scree analysis.
screePipe = pipeline.fit_transform(initData)
scree_pca = PCA(n_components=5)

In [None]:
# Fit the 5-component PCA on the preprocessed data and project the data onto it.
screeFitTransf=scree_pca.fit_transform(screePipe)

In [None]:
# Display the projected array.
screeFitTransf

In [None]:
# Label the five projected columns 'PC 1' ... 'PC 5'.
scree_columns = ['PC {}'.format(number) for number in range(1, 6)]
scree_df = pd.DataFrame(screeFitTransf, columns=scree_columns)

In [None]:
# Preview the first rows of the labelled projection.
scree_df.head()

## SCREE PLOT

In [None]:
import matplotlib.pyplot as plt

# Component numbers 1..k for the horizontal axis.
scree_PC_Values = np.arange(1, scree_pca.n_components_ + 1)
# The explained-variance ratios are squared before plotting: a deliberate
# transformation to make the "ideal" number of components stand out more.
squared_ratios = scree_pca.explained_variance_ratio_ ** 2
plt.plot(scree_PC_Values, squared_ratios, 'ro-', linewidth=2)
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained\n(transformed, squared)')
plt.title('Scree Plot')
plt.show()