# 因子分析
Factor Analysis

In [None]:
# factor_analyzer のインストール
!pip install factor_analyzer

In [67]:
# 基本ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearnの 標準化モジュールをインポート
from sklearn.preprocessing import StandardScaler

# factor_analyzer をインポート
from factor_analyzer import FactorAnalyzer


### データの準備

In [117]:
# Read the score sheet (student ID plus five subject scores) from GitHub
seiseki_url = 'https://raw.githubusercontent.com/koichi-inoue/DataScience/main/seiseki.csv'
Init_df = pd.read_csv(seiseki_url)
Init_df.head()

Unnamed: 0,学籍番号,国語,数学,理科,社会,英語
0,S01,68,64,62,65,58
1,S02,95,60,54,77,80
2,S03,66,75,77,95,95
3,S04,83,70,60,90,75
4,S05,64,84,88,65,48


In [118]:
# Drop the student-ID column so that only the subject scores remain
df = Init_df.drop('学籍番号', axis='columns')
df.head()

Unnamed: 0,国語,数学,理科,社会,英語
0,68,64,62,65,58
1,95,60,54,77,80
2,66,75,77,95,95
3,83,70,60,90,75
4,64,84,88,65,48


In [119]:
# Standardize the data.
# ST holds the per-column summary statistics, one row per column of df.
# Note: pandas' describe() reports the sample standard deviation (ddof=1).
ST = df.describe().T


def norm(x):
    """Return ``x`` centred by ``ST['mean']`` and scaled by ``ST['std']``.

    Reads the module-level table ``ST`` built above; when ``x`` is a
    DataFrame the statistics are matched to its columns by label.
    """
    centred = x - ST['mean']
    return centred / ST['std']


df = norm(df)
df.head()

Unnamed: 0,国語,数学,理科,社会,英語
0,-0.433463,-0.324875,-0.882104,-0.631145,-1.097286
1,1.524712,-0.588453,-1.608046,0.181002,0.392
2,-0.578513,0.399964,0.479037,1.399222,1.407423
3,0.654412,0.070492,-1.063589,1.060827,0.053526
4,-0.723564,0.993014,1.477207,-0.631145,-1.774235


In [120]:
# Sanity check before the analysis: pairwise Pearson correlations between the columns
df.corr(method='pearson')

Unnamed: 0,国語,数学,理科,社会,英語
国語,1.0,0.053358,0.051508,0.170313,0.442236
数学,0.053358,1.0,0.567643,-0.446287,0.268559
理科,0.051508,0.567643,1.0,-0.241428,0.009582
社会,0.170313,-0.446287,-0.241428,1.0,-0.063588
英語,0.442236,0.268559,0.009582,-0.063588,1.0


数学と理科（r ≈ 0.57）、国語と英語（r ≈ 0.44）に正の相関が見られます。  
また、数学と社会には負の相関（r ≈ −0.45）が見られます。

### 因子分析

In [121]:
# Run the analysis with the number of factors set to 3 and varimax rotation
n_factors = 3
fa = FactorAnalyzer(rotation='varimax', n_factors=n_factors)
fa.fit(df)

# Factor loadings of the fitted model
fa.loadings_

array([[ 0.66993149,  0.13776763, -0.27072927],
       [ 0.21520678,  0.5897546 ,  0.63556369],
       [ 0.00465146,  0.74492318,  0.20032695],
       [ 0.06246291, -0.17317856, -0.56264529],
       [ 0.75582535, -0.04885843,  0.21196202]])

In [122]:
# Turn the loadings into a DataFrame: the columns of df become the row index,
# and each factor gets a labelled column
loadings_df = pd.DataFrame(
    fa.loadings_,
    index=df.columns,
    columns=["因子負荷量 1","因子負荷量２","因子負荷量 ３"],
)
# Append the communalities and uniquenesses reported by the fitted model
loadings_df["共通性"] = fa.get_communalities()
loadings_df["独自性"] = fa.get_uniquenesses()
loadings_df

Unnamed: 0,因子負荷量 1,因子負荷量２,因子負荷量 ３,共通性,独自性
国語,0.669931,0.137768,-0.270729,0.541082,0.458918
数学,0.215207,0.589755,0.635564,0.798066,0.201934
理科,0.004651,0.744923,0.200327,0.595063,0.404937
社会,0.062463,-0.173179,-0.562645,0.350462,0.649538
英語,0.755825,-0.048858,0.211962,0.618587,0.381413


考察：５教科の点数に３つの因子があると仮定すると（５変数に対して３因子はかなり無理のある設定ですが）、因子負荷量の絶対値がおおむね 0.6 を超える教科は次のとおりです（括弧内は 0.6 にわずかに届かない教科、「−」は負の負荷量）。
* factor1（> 0.6）  
  国語、英語・・・言語活用能力？
* factor2（> 0.6）  
  理科、（数学）・・・理数系能力？
* factor3（> 0.6）  
  数学、（−社会）・・・これは何と考えれば？・・


### 因子得点の算出

In [123]:
# Compute the factor scores, using the student-ID column as the row index
factorScores_df = pd.DataFrame(
    fa.transform(df),
    index=Init_df['学籍番号'],
    columns=["因子１","因子２","因子３"],
)
print(factorScores_df.shape)
factorScores_df


(43, 3)


Unnamed: 0_level_0,因子１,因子２,因子３
学籍番号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S01,-0.799385,-0.488302,0.089347
S02,0.919109,-0.915338,-0.530647
S03,0.629263,0.100406,0.203678
S04,0.455375,-0.379641,-0.267575
S05,-1.315787,1.412221,0.399671
S06,0.542006,-0.132061,0.066539
S07,0.213815,-0.225722,-0.059087
S08,0.108872,-0.051963,-0.226122
S09,-1.209515,-0.472403,-0.601602
S10,0.184794,0.124984,-0.957651
