In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Data selection
https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+#



# Problem define

預測系統是否穩定

# Observation

In [3]:
data = pd.read_csv("Data_for_UCI_named.csv")



print(data.info())
print(data.describe())
print(data.head())

# 分割trainset和testset
test = data[8000:]
data = data[:8000]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
tau1     10000 non-null float64
tau2     10000 non-null float64
tau3     10000 non-null float64
tau4     10000 non-null float64
p1       10000 non-null float64
p2       10000 non-null float64
p3       10000 non-null float64
p4       10000 non-null float64
g1       10000 non-null float64
g2       10000 non-null float64
g3       10000 non-null float64
g4       10000 non-null float64
stab     10000 non-null float64
stabf    10000 non-null object
dtypes: float64(13), object(1)
memory usage: 1.1+ MB
None
               tau1          tau2          tau3          tau4            p1  \
count  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000   
mean       5.250000      5.250001      5.250004      5.249997      3.750000   
std        2.742548      2.742549      2.742549      2.742556      0.752160   
min        0.500793      0.500141      0.500788      0.500473      1.58

stab,stabf是和穩定直接相關的值，不考慮做特徵
剩下的又以tau1, tau2, tau3, tau4, g1, g2, g3, g4 欄位和 stab,stabf 相關度較高，故考慮作為特徵

In [4]:
data['stabf'] = data['stabf'].map({'stable': 1, 'unstable': -1})
test['stabf'] = test['stabf'].map({'stable': 1, 'unstable': -1})
print(data.corr())

           tau1      tau2      tau3      tau4        p1        p2        p3  \
tau1   1.000000  0.017850 -0.005636 -0.026600  0.027020 -0.012637 -0.012770   
tau2   0.017850  1.000000  0.016199 -0.004845 -0.007029  0.006309  0.010765   
tau3  -0.005636  0.016199  1.000000  0.001828  0.023754 -0.011417 -0.013230   
tau4  -0.026600 -0.004845  0.001828  1.000000 -0.001910  0.013132  0.003682   
p1     0.027020 -0.007029  0.023754 -0.001910  1.000000 -0.566958 -0.586881   
p2    -0.012637  0.006309 -0.011417  0.013132 -0.566958  1.000000 -0.003291   
p3    -0.012770  0.010765 -0.013230  0.003682 -0.586881 -0.003291  1.000000   
p4    -0.021418 -0.004935 -0.016505 -0.013511 -0.578513 -0.012077  0.015901   
g1     0.016730 -0.004182 -0.006658 -0.004155  0.007875  0.016238 -0.009597   
g2     0.018458  0.029437 -0.001149  0.004132  0.020186 -0.019218 -0.015035   
g3     0.005032  0.013988  0.014070 -0.002416 -0.000775  0.009069 -0.005191   
g4     0.007672 -0.012708 -0.015641 -0.008245 -0.003

# trainnig

分出訓練及測試集

In [5]:
train_y = data['stabf']
test_y = test['stabf']

train_x = data[['tau1', 'tau2', 'tau3', 'tau4', 'g1', 'g2', 'g3', 'g4']]
test_x = test[['tau1', 'tau2', 'tau3', 'tau4', 'g1', 'g2', 'g3', 'g4']]

先來個基本的RandomForestClassifier試看看

In [6]:
model = RandomForestClassifier(n_estimators=64, random_state=1012, n_jobs=-1)
model.fit(train_x, train_y)
print(accuracy_score(train_y, model.predict(train_x)))
print(accuracy_score(test_y, model.predict(test_x)))

1.0
0.9075


## 等等!! 怎麼馬上就overfitting了!? 我其他資料每個都試了1-2小時也才6-7成阿 !!?
好吧...還是繼續來改善...

跟LogisticRegression分類器組合看看吧(加入不同觀點看是否有幫助)

In [7]:
dc_clf = RandomForestClassifier(n_estimators=64,random_state=1012, n_jobs=-1)
svm_clf = SVC(random_state=1012, probability=True)
lf = LogisticRegression(solver="liblinear", random_state=1012)
model = VotingClassifier(
        estimators=[('rf', dc_clf), ('lf', lf)],
        voting='soft',
        weights=[1, 1]
)
model.fit(train_x, train_y)
print(accuracy_score(train_y, model.predict(train_x)))
print(accuracy_score(test_y, model.predict(test_x)))

0.972875
0.8635


好像有好一點...
再跟svc組合看看吧

In [8]:
dc_clf = RandomForestClassifier(n_estimators=64, random_state=1012, n_jobs=-1)
svm_clf = SVC(random_state=1012, probability=True)
lf = LogisticRegression(solver="liblinear", random_state=1012)
model = VotingClassifier(
        estimators=[('rf', dc_clf), ('lf', lf), ('s', svm_clf)],
        voting='soft',
        weights=[1, 1, 1]
)
model.fit(train_x, train_y)
print(accuracy_score(train_y, model.predict(train_x)))
print(accuracy_score(test_y, model.predict(test_x)))

0.972125
0.905


還是不太夠...
幫RandomForestClassifier加上max_depth限制好了

In [9]:
dc_clf = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1012, n_jobs=-1)
svm_clf = SVC(random_state=1012, probability=True)
lf = LogisticRegression(solver="liblinear", random_state=1012)
model = VotingClassifier(
        estimators=[('rf', dc_clf), ('lf', lf), ('s', svm_clf)],
        voting='soft',
        weights=[1, 1, 1]
)
model.fit(train_x, train_y)
print(accuracy_score(train_y, model.predict(train_x)))
print(accuracy_score(test_y, model.predict(test_x)))

0.9265
0.89


恩這樣還差不多...

### 不過這樣好像有點太簡單了...下面再繼續

# Problem define-2

預測系統穩定度的值(即stab欄位)

# Observation-2

用和上面一樣的欄位當特徵好了

In [10]:
del test['stabf']

print(data.corr())


           tau1      tau2      tau3      tau4        p1        p2        p3  \
tau1   1.000000  0.017850 -0.005636 -0.026600  0.027020 -0.012637 -0.012770   
tau2   0.017850  1.000000  0.016199 -0.004845 -0.007029  0.006309  0.010765   
tau3  -0.005636  0.016199  1.000000  0.001828  0.023754 -0.011417 -0.013230   
tau4  -0.026600 -0.004845  0.001828  1.000000 -0.001910  0.013132  0.003682   
p1     0.027020 -0.007029  0.023754 -0.001910  1.000000 -0.566958 -0.586881   
p2    -0.012637  0.006309 -0.011417  0.013132 -0.566958  1.000000 -0.003291   
p3    -0.012770  0.010765 -0.013230  0.003682 -0.586881 -0.003291  1.000000   
p4    -0.021418 -0.004935 -0.016505 -0.013511 -0.578513 -0.012077  0.015901   
g1     0.016730 -0.004182 -0.006658 -0.004155  0.007875  0.016238 -0.009597   
g2     0.018458  0.029437 -0.001149  0.004132  0.020186 -0.019218 -0.015035   
g3     0.005032  0.013988  0.014070 -0.002416 -0.000775  0.009069 -0.005191   
g4     0.007672 -0.012708 -0.015641 -0.008245 -0.003

# trainning-2

參考下圖的regression方法
![Image of sk](https://scikit-learn.org/stable/_static/ml_map.png)

In [11]:
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

train_y = data['stab']
test_y = test['stab']

train_x = data[['tau1', 'tau2', 'tau3', 'tau4', 'g1', 'g2', 'g3', 'g4']]
test_x = test[['tau1', 'tau2', 'tau3', 'tau4', 'g1', 'g2', 'g3', 'g4']]

先用SGDRegressor看看 r2_score(越接近1越好)

In [12]:
model = SGDRegressor(random_state=1012)
model.fit(train_x, train_y)
y_predict = model.predict(test_x)


print(r2_score(test_y, y_predict))
print(mean_squared_error(test_y, y_predict))
print(mean_absolute_error(test_y, y_predict))

0.5360686939805197
0.0006299199834734732
0.020951237022387686


普普通通
再試試LinearRegression吧

In [13]:
model = LinearRegression()
model.fit(train_x, train_y)

y_predict = model.predict(train_x)
print("train:")
print(r2_score(train_y, y_predict))
print(mean_squared_error(train_y, y_predict))
print(mean_absolute_error(train_y, y_predict))

print("test:")
y_predict = model.predict(test_x)
print(r2_score(test_y, y_predict))
print(mean_squared_error(test_y, y_predict))
print(mean_absolute_error(test_y, y_predict))

train:
0.6517022754582176
0.0004751308593851816
0.01730721650090147
test:
0.6266840589404927
0.0005068836018854798
0.017812583317906548


不過上面是1次式的，以下對LinearRegression選用不同order的多項式來訓練

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

degrees = [1, 2, 3, 4, 5, 6]
for i in range(len(degrees)):
    polynomial_features = PolynomialFeatures(degree=degrees[i], include_bias=False)
    model = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features), ("linear_regression", model)])
    pipeline.fit(train_x, train_y)
    model = pipeline
    print("poly order "+str(degrees[i]))
    print("train:")
    y_predict = model.predict(train_x)
    print(r2_score(train_y, y_predict))
    print(mean_squared_error(train_y, y_predict))
    print(mean_absolute_error(train_y, y_predict))

    print("test:")
    y_predict = model.predict(test_x)
    print(r2_score(test_y, y_predict))
    print(mean_squared_error(test_y, y_predict))
    print(mean_absolute_error(test_y, y_predict))
    print()

poly order 1
train:
0.6517022754582176
0.0004751308593851816
0.017307216500901453
test:
0.6266840589404925
0.00050688360188548
0.01781258331790653

poly order 2
train:
0.8908923041303781
0.00014883942573059097
0.009202546218523178
test:
0.8898830542414189
0.00014951537814411297
0.00916597835225065

poly order 3
train:
0.9496601051179812
6.86712425357348e-05
0.00581562617254721
test:
0.9476675771686008
7.105629324289997e-05
0.005914660858584581

poly order 4
train:
0.9685205644421757
4.2942718874280975e-05
0.004485760053887283
test:
0.9616079251587764
5.212826734413227e-05
0.004920564017162625

poly order 5
train:
0.9802246823575743
2.6976528998062786e-05
0.0034953898104923536
test:
0.9638043783094408
4.9145951396890236e-05
0.004519945783536085

poly order 6
train:
0.9895841203686085
1.4208837703508937e-05
0.0026251857711232555
test:
0.9478093972646785
7.086373173343013e-05
0.005176570502009488



## 可以看到test的表現在4或5次式時最好，到了6次時又掉下去，故應該選4或5次式來做LinearRegression



再換svr試試

In [15]:
from sklearn.svm import SVR

model = SVR(kernel='rbf')
model.fit(train_x, train_y)
y_predict = model.predict(test_x)


print(r2_score(test_y, y_predict))
print(mean_squared_error(test_y, y_predict))
print(mean_absolute_error(test_y, y_predict))

-0.0014437053816760415
0.0013597474327743488
0.03129049164906002


反而變爛了...

最後試一下lasso

In [16]:
from sklearn.linear_model import Lasso

model = Lasso(random_state=1012)
model.fit(train_x, train_y)
y_predict = model.predict(test_x)


print(r2_score(test_y, y_predict))
print(mean_squared_error(test_y, y_predict))
print(mean_absolute_error(test_y, y_predict))

-9.64651303281272e-08
0.0013577873190828307
0.03123453096863861


好吧更爛...

## 最後結論用LinearRegression的4或5次式時最好，而且結果還蠻準確的