# PCA MODEL

In [None]:
#####Links Used:
#https://www.datacamp.com/community/tutorials/principal-component-analysis-in-python
#https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
#https://www.datasklr.com/principal-component-analysis-and-factor-analysis/principal-component-analysis
#https://cmdlinetips.com/2018/03/pca-example-in-python-with-scikit-learn/
#https://stackoverflow.com/questions/39216897/plot-pca-loadings-and-loading-in-biplot-in-sklearn-like-rs-autoplot

## Reference for the train/test split section: https://stackabuse.com/implementing-pca-in-python-with-scikit-learn/

# LOADING DATA

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
# Read the publisher analysis table from the working directory and preview it.
DATA_PATH = "publisher_analysis_data.csv"
initData = pd.read_csv(DATA_PATH)
initData.head(n=5)

Unnamed: 0.1,Unnamed: 0,maxLoad,stressMaxLoad,strainMaxLoad,energyAbsorp,youngsMod,ph,sec_Mn,sec_Mw,sec_polyDisp,sec_calcMass,sec_massRec,year,publisher
0,0,34.67,28.779,0.051118,0.07234,2274.6,5.065,44.1,127.6,2.89,22.45,71.9,1840-01-01,sumptibus Societatis Camdenensis
1,1,12.71,9.7289,0.03005,0.023396,1416.2,4.83,35.9,85.4,2.377,10.53,23.4,1840-01-01,"Printed for the Camden Society, by J.B. Nichol..."
2,2,17.301,13.585,0.034793,0.034683,2083.7,5.207,20.1,93.4,4.653,20.54,65.8,1842-01-01,"Printed for the Camden Society, by J.B. Nichol..."
3,3,25.879,21.32,0.044734,0.057662,2015.6,5.165,29.5,91.2,3.094,6.84,21.9,1843-01-01,"Printed for the Camden Society, by J.B. Nichol..."
4,4,14.901,16.132,0.047622,0.036076,1798.6,6.405,76.8,161.6,2.105,7.92,25.4,1844-01-01,J. Winchester


In [22]:
# Reduce 'year' to an integer calendar year. The raw CSV stores it as a
# 'YYYY-MM-DD' string; casting to str first keeps this cell safe to re-run
# after the column has already been converted to int (a plain `x.split`
# would raise AttributeError on the second run).
initData['year'] = (
    initData['year']
    .astype(str)
    .str.split('-')
    .str[0]
    .astype(int)
)

In [21]:
# Preview the first rows to check the current columns and the 'year' values.
initData.head()

Unnamed: 0,maxLoad,stressMaxLoad,strainMaxLoad,energyAbsorp,ph,sec_Mn,sec_Mw,sec_polyDisp,year
0,34.67,28.779,0.051118,0.07234,5.065,44.1,127.6,2.89,1840
1,12.71,9.7289,0.03005,0.023396,4.83,35.9,85.4,2.377,1840
2,17.301,13.585,0.034793,0.034683,5.207,20.1,93.4,4.653,1842
3,25.879,21.32,0.044734,0.057662,5.165,29.5,91.2,3.094,1843
4,14.901,16.132,0.047622,0.036076,6.405,76.8,161.6,2.105,1844


In [9]:
# (rows, columns) of the frame at this point in the notebook.
initData.shape

(957, 14)

In [10]:
# Columns removed before modelling: 'Unnamed: 0' (looks like a saved row
# index — confirm), the free-text publisher name, and three measurements
# that are not used further in this notebook.
unused_columns = ['youngsMod', 'Unnamed: 0', 'sec_calcMass', 'sec_massRec', 'publisher']
initData = initData.drop(columns=unused_columns)

In [11]:
# Discard every row that still contains a missing value, then report the size.
initData = initData.dropna(axis=0, how='any')
initData.shape

(957, 9)

# BUILDING PCA MODEL

In [12]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# PCA estimator that keeps three principal components; it is fitted further
# down on the preprocessed training features.
pca = PCA(n_components=3)

In [13]:
# Remember the column labels so they can be reattached to the array that the
# preprocessing pipeline returns.
initData_columnNames = initData.columns
initData_columnNames

Index(['maxLoad', 'stressMaxLoad', 'strainMaxLoad', 'energyAbsorp', 'ph',
       'sec_Mn', 'sec_Mw', 'sec_polyDisp', 'year'],
      dtype='object')

In [14]:
# Preprocessing applied before PCA: standardize every column, then rescale
# each row with sklearn's Normalizer (default settings).
preprocessing_steps = [
    ('scaling', StandardScaler()),
    ('normalize', Normalizer()),
]
pipeline = Pipeline(steps=preprocessing_steps)

In [15]:
# Fit the scaling/normalizing pipeline on the whole table and transform it.
# Every column must already be numeric: the ValueError recorded for this cell
# was raised because 'year' still held 'YYYY-MM-DD' strings when it ran, so
# the year-conversion cell has to be executed first.
scaleNormPipe = pipeline.fit_transform(initData)

ValueError: could not convert string to float: '1840-01-01'

In [16]:
# Wrap the preprocessed array in a DataFrame that carries the original column
# labels (note: despite the name, this holds scaled data, not PCA scores).
pca_df = pd.DataFrame(scaleNormPipe, columns=initData_columnNames)
pca_df.head(n=5)

NameError: name 'scaleNormPipe' is not defined

## TRAIN, TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split
# Name of the column held out as the target (y); sec_Mw is a continuous
# measurement, every other column is used as a feature.
target = "sec_Mw"

# NOTE(review): pca_df was already scaled/normalized using the full dataset,
# so the rows that end up in the test split influenced these values, and the
# pipeline is applied a second time after the split. Consider splitting the
# raw frame instead — confirm intent.
# `columns=` replaces the positional axis argument (`drop(target, 1)`), which
# pandas 2.0 no longer accepts.
X = pca_df.drop(columns=target)
y = pca_df[target]

seed = 7  # fixed so the split is reproducible
test_size = 0.33  # share of rows held out for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [None]:
# Fit the scaling + normalizing pipeline on the training features only, then
# apply that already-fitted pipeline to the test features.
pca_train = pipeline.fit_transform(X_train)
pca_test = pipeline.transform(X_test)

In [None]:
# Fit PCA on the preprocessed training features and project both splits onto
# the fitted components.
# The inputs are re-derived from X_train / X_test through the already-fitted
# pipeline instead of being read back from pca_train / pca_test, so re-running
# this cell does not feed PCA its own output.
pca_train = pca.fit_transform(pipeline.transform(X_train))
pca_test = pca.transform(pipeline.transform(X_test))

In [None]:
# Fraction of the total variance captured by each fitted principal component.
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Collect the principal-component scores in train and test DataFrames.
# Labels are generated from the number of columns actually produced, so they
# stay valid if the PCA's n_components is changed.
pc_labels = ['PC{}'.format(i) for i in range(1, pca_train.shape[1] + 1)]
pca_train_df = pd.DataFrame(data=pca_train, columns=pc_labels)
pca_test_df = pd.DataFrame(data=pca_test, columns=pc_labels)


In [None]:
# Append the target, sec_Mw, to the training principal-component DataFrame.
# y_train still carries the shuffled row labels from train_test_split while
# pca_train_df is numbered 0..n-1. pd.concat(axis=1) aligns on the index, so
# the target values are re-indexed by position first; otherwise rows would be
# mismatched and padded with NaN.
target = pd.Series(np.asarray(y_train), index=pca_train_df.index, name='sec_Mw')
result_df_train = pd.concat([pca_train_df, target], axis=1)

In [None]:
# Preview the training component scores together with their target values.
result_df_train.head()

In [None]:
# Append the held-out target to the test principal-component DataFrame.
# The test scores belong with y_test (not the full y), and the values are
# re-indexed by position because pd.concat(axis=1) aligns on the index while
# y_test keeps the shuffled row labels from train_test_split.
test_target = pd.Series(np.asarray(y_test), index=pca_test_df.index, name='sec_Mw')
result_df_test = pd.concat([pca_test_df, test_target], axis=1)

In [None]:
# Preview only the first rows instead of rendering the whole test frame.
result_df_test.head()

## SCATTER PLOT

# Scree code

In [None]:
# Preprocess the full table for the scree analysis with an unfitted copy of
# the pipeline, so the shared `pipeline` object keeps the state it was fitted
# with on the training split.
screePipe = sklearn.clone(pipeline).fit_transform(initData)
# Separate five-component PCA used only for the scree plot.
scree_pca = PCA(n_components=5)

In [None]:
# Fit the scree PCA on the preprocessed table and keep the component scores.
screeFitTransf = scree_pca.fit_transform(screePipe)

In [None]:
# Show only the first five rows of the score array rather than the whole array.
screeFitTransf[:5]

In [None]:
# Put the scree PCA scores in a DataFrame. Column labels ('PC 1', 'PC 2', ...)
# are generated from the array width so they stay valid if the number of
# components kept by scree_pca changes.
scree_labels = ['PC {}'.format(i) for i in range(1, screeFitTransf.shape[1] + 1)]
scree_df = pd.DataFrame(data=screeFitTransf, columns=scree_labels)

In [None]:
# Preview the first rows of the scree component scores.
scree_df.head()

## SCREE PLOT

In [None]:
import matplotlib.pyplot as plt

# Scree plot of the five-component PCA.
# The y-values are the *squared* explained-variance ratios: the curve was
# deliberately transformed to make the "ideal" number of components more
# apparent, so the axis does not show the raw proportions.
scree_PC_Values = np.arange(1, scree_pca.n_components_ + 1)
squared_variance_ratio = scree_pca.explained_variance_ratio_ ** 2

fig, ax = plt.subplots()
ax.plot(scree_PC_Values, squared_variance_ratio, 'ro-', linewidth=2)
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained\n(transformed, squared)')
plt.show()