In [468]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split



## Linear Model For Each Column Against Remaining Useful Life

In [469]:
df = pd.read_csv("Featurized_Data.csv", index_col= 0)


In [470]:

df['Remaining Useful Life'] = np.log(df['Remaining Useful Life'])
train, test = train_test_split(df, test_size=0.3, random_state=40946397)
df

Unnamed: 0_level_0,Remaining Useful Life,initial discharge capacity,final discharge capacity,discharge cap. slope,dis. cap. intercept,min. resistance,Delta resistance,Delta_Variance
Cell ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.878896,0.998070,0.998511,-0.000070,1.002587,0.015569,0.000053,0.000017
1,0.777530,0.996846,0.977803,-0.000414,0.994996,0.015582,-0.000102,0.000112
2,0.845173,0.994805,0.982924,-0.000206,0.992808,0.015720,-0.000008,0.000038
3,0.789092,1.010965,1.002828,-0.000416,1.012733,0.017880,-0.000254,0.000178
4,0.801053,1.003802,0.979888,-0.000574,1.004001,0.017069,-0.000221,0.000227
...,...,...,...,...,...,...,...,...
103,0.715556,1.015911,0.979040,-0.001282,0.993675,0.018572,-0.000542,0.000570
104,1.011505,1.006956,1.010626,-0.000049,1.013589,0.015938,0.000026,0.000002
105,0.852026,1.024897,1.001408,-0.000452,1.017684,0.017530,-0.000183,0.000156
106,0.813271,1.006640,0.983928,-0.000431,1.006601,0.014954,-0.000116,0.000127


# Correlation Chart

In [471]:
corr = train.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Remaining Useful Life,initial discharge capacity,final discharge capacity,discharge cap. slope,dis. cap. intercept,min. resistance,Delta resistance,Delta_Variance
Remaining Useful Life,1.0,0.011858,0.318327,0.665266,0.242851,-0.593625,0.617867,0.04814
initial discharge capacity,0.011858,1.0,0.0875,-0.284581,0.246343,-0.0601,-0.0537,-0.706449
final discharge capacity,0.318327,0.0875,1.0,0.312803,0.220656,-0.188493,0.110917,0.026901
discharge cap. slope,0.665266,-0.284581,0.312803,1.0,-0.230972,-0.494928,0.565382,0.228506
dis. cap. intercept,0.242851,0.246343,0.220656,-0.230972,1.0,-0.016884,0.041051,0.218526
min. resistance,-0.593625,-0.0601,-0.188493,-0.494928,-0.016884,1.0,-0.49332,0.151992
Delta resistance,0.617867,-0.0537,0.110917,0.565382,0.041051,-0.49332,1.0,0.036172
Delta_Variance,0.04814,-0.706449,0.026901,0.228506,0.218526,0.151992,0.036172,1.0


# Checking VIF (Variance Inflation Factor)

In [472]:
X = train.loc[:, 'initial discharge capacity':'Delta_Variance']
X['intercept'] = 1

vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif = vif[vif['Variable']!='intercept']
vif

Unnamed: 0,Variable,VIF
0,initial discharge capacity,3.082075
1,final discharge capacity,1.315193
2,discharge cap. slope,2.706539
3,dis. cap. intercept,2.127706
4,min. resistance,1.694931
5,Delta resistance,1.747207
6,Delta_Variance,3.548467


We see signs of multicolinearity, thus we need to remove some variables. Specifically, dis cap intercept. It looks like it is correlated to final charge a lot. 

In [473]:
X = train[["initial discharge capacity","discharge cap. slope", "final discharge capacity"," min. resistance","Delta resistance","Delta_Variance"]]
X['intercept'] = 1

vif2 = pd.DataFrame()
vif2["Variable"] = X.columns
vif2["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif2 = vif2[vif2['Variable']!='intercept']
vif2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['intercept'] = 1


Unnamed: 0,Variable,VIF
0,initial discharge capacity,2.190744
1,discharge cap. slope,2.084034
2,final discharge capacity,1.186093
3,min. resistance,1.595768
4,Delta resistance,1.645881
5,Delta_Variance,2.12596


VIF is lower now, removing the cap intercept lowers it.

# Linear Regression on these variables

In [474]:
reg_capacity = linear_model.LinearRegression()
train_X = train[["initial discharge capacity","discharge cap. slope", "final discharge capacity"," min. resistance","Delta resistance","Delta_Variance"]]
train_Y = train["Remaining Useful Life"]
test_X = test[["initial discharge capacity","discharge cap. slope", "final discharge capacity"," min. resistance","Delta resistance","Delta_Variance"]]
test_Y = test["Remaining Useful Life"]
fit = reg_capacity.fit(train_X,train_Y)

# Predict Values


In [475]:
pred = fit.predict(test_X)
print('coefficeints: ', fit.coef_)
print('intercepts: ', fit.intercept_)

mean_squared_error(pred, test_Y)


coefficeints:  [  0.59891502  72.81782194   0.11871115 -17.67851721  63.62749459
   5.27610562]
intercepts:  0.4485582413467493


0.0042375753111745865

In [476]:
r2_score(pred, test_Y)


0.4124300031096777

Bad predictor since R2 is low.

# Trying to transform the data

In [489]:
from sklearn.preprocessing import power_transform, PowerTransformer

pt = PowerTransformer()
transform_fit = pt.fit(train_X.to_numpy().reshape(-1,1))


TypeError: 'LinearRegression' object is not callable