# PCA MODEL

In [1]:
#####Links Used:
#https://www.datacamp.com/community/tutorials/principal-component-analysis-in-python
#https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
#https://www.datasklr.com/principal-component-analysis-and-factor-analysis/principal-component-analysis
#https://cmdlinetips.com/2018/03/pca-example-in-python-with-scikit-learn/
#https://stackoverflow.com/questions/39216897/plot-pca-loadings-and-loading-in-biplot-in-sklearn-like-rs-autoplot

##WHEN WE START TRAIN/TEST SPLIT REF.: https://stackabuse.com/implementing-pca-in-python-with-scikit-learn/

# LOADING DATA

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
initData = pd.read_csv("publisher_analysis_data.csv")
initData['year'] = initData['year'].apply(lambda x: int(x.split('-')[0]))
initData['half']=initData['year']//50*50
year_series=initData['year']
initData = initData.drop(columns = ['youngsMod','Unnamed: 0', 'sec_calcMass', 'sec_massRec', 'publisher','half', 'year', 'ph'])
initData = initData.dropna()
print(initData.shape)
initData.head()

AttributeError: 'float' object has no attribute 'split'

# BUILDING PCA MODEL

In [None]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA(n_components=3)
pipeline = Pipeline([
    ('scaling', StandardScaler()), ('normalize', Normalizer())  
])

In [None]:
initData_columnNames= initData.columns
initData_columnNames

In [None]:
scaleNormPipe = pipeline.fit_transform(initData)
pca_df = pd.DataFrame(data = scaleNormPipe, columns = initData_columnNames)
pca_df.head()

## PCA

In [None]:
from sklearn.model_selection import train_test_split
# "target" contains the column name of the classification labels
target = "sec_Mw"

X = pca_df.drop(target, 1)
y = pca_df[target]

# seed = 7
# test_size = 0.33
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [None]:
##Transforming Data to Standard Scaler
pca_train = pipeline.fit_transform(X)
# pca_test = pipeline.transform(X_test)

##Taking Scaled Data and Applying PCA
pca_train = pca.fit_transform(pca_train)
# pca_test = pca.transform(pca_test)

##Putting Principal Component Data into Train & Test Dataframes
pca_train_df = pd.DataFrame(data = pca_train, columns = ['PC1', 'PC2', 'PC3'])
# pca_test_df = pd.DataFrame(data = pca_test, columns = ['PC1', 'PC2','PC3'])

In [None]:
##Explained Variance
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
PCAs_Mw = pd.concat([pca_train_df,y], axis = 1)
PCAs_Mw

In [None]:
y_pca=pd.concat([PCAs_Mw,year_series], axis = 1)

In [None]:
# ##Appending Our Target Parameter, sec_Mw Onto the Training Principal Component Dataframe
# target = pd.Series(y_train, name='sec_Mw')
# result_df_train = pd.concat([pca_train_df, target], axis=1)
# result_df_train.head()
# result_df_test = pd.concat([pca_test_df, y_test], axis=1)
# result_df_test

## SCATTER PLOT

In [None]:
# sns.scatterplot(data=PCAs_Mw, x='PC1', y='PC2', hue=year_series, palette='deep')
sns.scatterplot(data=y_pca, x='PC1', y='PC2', hue=year_series, palette='deep')

#year_series

In [None]:
fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, x='PC1', y='PC3', hue='sec_Mw', palette="ch:s=.25,rot=-.25", alpha=0.7)
plt.show()

fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, x='PC2', y='PC3', hue='sec_Mw', palette="ch:s=.25,rot=-.25", alpha=0.7)
plt.show()

fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, x='PC1', y='PC2', hue='sec_Mw', palette="ch:s=.25,rot=-.25", alpha=0.7)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, x='PC1', y='PC2', hue=year_series, palette="ch:s=.25,rot=-.25", alpha=0.7)
plt.legend(title="Fifty Year Buckets")
plt.show()

fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, x='PC2', y='PC3', hue=year_series, palette="ch:s=.25,rot=-.25", alpha=0.7)
plt.legend(title="Fifty Year Buckets")
plt.show()

fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, x='PC1', y='PC3', hue=year_series, palette="ch:s=.25,rot=-.25", alpha=0.7)
plt.legend(title="Fifty Year Buckets")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, y='PC1', x='sec_Mw', alpha=0.7)
plt.show()

fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, y='PC2', x='sec_Mw', alpha=0.7)
plt.show()

fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data=PCAs_Mw, y='PC3', x='sec_Mw', alpha=0.7)
plt.show()

In [None]:
#fig, ax = plt.subplots(figsize=(7,7))
sns.lmplot(data=PCAs_Mw, y='PC1', x='sec_Mw')
#plt.show()

#fig, ax = plt.subplots(figsize=(7,7))
sns.lmplot(data=PCAs_Mw, y='PC2', x='sec_Mw')
#plt.show()

#fig, ax = plt.subplots(figsize=(7,7))
sns.lmplot(data=PCAs_Mw, y='PC3', x='sec_Mw')
#plt.show()

# Scree code

In [None]:
screePipe = pipeline.fit_transform(initData)
scree_pca = PCA(n_components=5)

In [None]:
screeFitTransf=scree_pca.fit_transform(screePipe)

In [None]:
screeFitTransf

In [None]:
scree_df = pd.DataFrame(data = screeFitTransf, columns = ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5'])

In [None]:
scree_df.head()

## SCREE PLOT

In [None]:
import matplotlib.pyplot as plt

scree_PC_Values = np.arange(scree_pca.n_components_) + 1
plt.plot(scree_PC_Values, np.square(scree_pca.explained_variance_ratio_), 'ro-', linewidth=2)
###We transformed the scree plot by squaring the variance explained to make the "ideal" PC more apparent
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained\n(transformed, squared)')
plt.show()