In [None]:
!pip install dmba

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from dmba import stepwise_selection
from dmba import AIC_score

In [None]:
%matplotlib inline

In [None]:
# Columns kept for the analysis.
columns = ['ROA', 'ROE', 'IND', 'INFR', 'PEOPLE', 'PARTNERS']
# The CSV is semicolon-separated with decimal commas; rows are indexed by ID.
df = pd.read_csv(
    './cybersec_final_data.csv',
    sep=';',
    decimal=',',
    index_col='ID',
).loc[:, columns]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1127 entries, 1 to 487
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ROA       1120 non-null   float64
 1   ROE       1117 non-null   float64
 2   IND       1127 non-null   int64  
 3   INFR      1127 non-null   int64  
 4   PEOPLE    1127 non-null   int64  
 5   PARTNERS  1127 non-null   int64  
dtypes: float64(2), int64(4)
memory usage: 61.6 KB


In [None]:
# Number of rows for each industry code in IND.
df['IND'].value_counts()

1    284
2    267
7    246
4    123
6    117
3     57
5     33
Name: IND, dtype: int64

In [None]:
# Pairwise correlations of every column except the categorical IND code.
df.drop(columns=['IND']).corr()

Unnamed: 0,ROA,ROE,INFR,PEOPLE,PARTNERS
ROA,1.0,0.008218,0.116992,0.078316,0.001306
ROE,0.008218,1.0,0.520908,0.356874,0.255719
INFR,0.116992,0.520908,1.0,0.706253,0.29515
PEOPLE,0.078316,0.356874,0.706253,1.0,0.228645
PARTNERS,0.001306,0.255719,0.29515,0.228645,1.0


In [None]:
# Remove ROA and PEOPLE from the working frame.
# NOTE(review): presumably PEOPLE is dropped because of its correlation with
# INFR — confirm. Re-running this cell alone raises KeyError once they are gone.
df = df.drop(columns=['ROA', 'PEOPLE'])

In [None]:
# Replace missing ROE values with 0. fillna is used instead of
# replace(np.NaN, 0) because the np.NaN alias no longer exists in NumPy 2.0.
# NOTE(review): ROE is the regression target further down; imputing it with 0
# treats "missing" as "zero return" — consider dropping those rows instead.
df['ROE'] = df['ROE'].fillna(0)

In [None]:
# One-hot encode the industry code. dtype=int keeps the dummy columns numeric on
# every pandas version: pandas >= 2.0 defaults to bool, and a frame mixing bool
# with numeric columns converts to an object array that statsmodels OLS rejects.
df = pd.get_dummies(df, columns=['IND'], dtype=int)
# Give the IND_<code> dummy columns readable industry names.
df = df.rename(columns={
    'IND_1': 'IT&Telecom',
    'IND_2': 'Finance',
    'IND_3': 'Construction',
    'IND_4': 'Manufacture',
    'IND_5': 'Energy',
    'IND_6': 'Medicine',
    'IND_7': 'Others',
})
# Same frame without the ROE column.
df_cluster = df.drop('ROE', axis=1)

In [None]:
# Keep only rows that are not in the "Others" industry, then drop that
# (now constant) dummy column.
# NOTE(review): if every remaining row belongs to exactly one of the other
# industry dummies, those dummies sum to 1 and are collinear with an
# intercept — confirm this is intended.
not_others = df['Others'] != 1
df_tream = df.loc[not_others].drop(columns=['Others'])

In [None]:
# (rows, columns) of the filtered frame.
df_tream.shape

(881, 9)

In [None]:
# Target: ROE. Predictors: every other column of the filtered frame.
Y = df_tream['ROE']
X = df_tream.drop(columns=['ROE'])

In [None]:
# Names of the columns held in X.
X.columns

Index(['INFR', 'PARTNERS', 'IT&Telecom', 'Finance', 'Construction',
       'Manufacture', 'Energy', 'Medicine'],
      dtype='object')

In [None]:
def train_model(variables):
    """Fit a linear regression of the notebook-level ``Y`` on ``X[variables]``.

    Parameters
    ----------
    variables : sequence of str
        Column names of ``X`` to use as predictors.

    Returns
    -------
    LinearRegression or None
        The fitted estimator, or ``None`` when ``variables`` is empty
        (there is nothing to fit in that case).
    """
    if len(variables) > 0:
        # fit() returns the estimator itself, so the fitted model is returned.
        return LinearRegression().fit(X[variables], Y)
    return None

def score_model(model, variables):
    """Return the AIC score of ``model`` on the notebook-level ``X`` / ``Y``.

    Parameters
    ----------
    model : fitted estimator or None
        Model produced by ``train_model`` for ``variables``.
    variables : sequence of str
        Predictor column names the model was trained on.

    Returns
    -------
    The value of ``AIC_score``. With no predictors the score is computed for
    a constant prediction equal to the mean of ``Y`` with ``df=1``.
    """
    if len(variables) > 0:
        predictions = model.predict(X[variables])
        return AIC_score(Y, predictions, model)
    # Empty predictor set: score the mean-only baseline.
    baseline = [Y.mean()] * len(Y)
    return AIC_score(Y, baseline, model, df=1)

def get_stepwise_selection():
    """Run dmba's stepwise selection over all columns of the notebook-level ``X``.

    Candidate models are fitted with ``train_model`` and compared with
    ``score_model``; progress is printed because ``verbose=True``.

    Returns
    -------
    The second element returned by ``stepwise_selection`` (the selected
    predictor names); the first element is discarded.
    """
    _unused_model, chosen_variables = stepwise_selection(
        X.columns,
        train_model,
        score_model,
        verbose=True,
    )
    return chosen_variables

def print_model(Y, X):
    """Select predictors stepwise by AIC and return the statsmodels OLS summary.

    The stepwise search is run on the ``X`` and ``Y`` passed in. Previously it
    went through helpers that read the notebook-level ``X`` / ``Y``, so calling
    this function with any other data selected predictors on the wrong dataset.

    Parameters
    ----------
    Y : pandas.Series
        Response variable.
    X : pandas.DataFrame
        Candidate predictors; every column is offered to the search.

    Returns
    -------
    The ``summary()`` of an OLS fit of ``Y`` on the selected predictors plus a
    constant column named ``const``. Selection progress is printed
    (``verbose=True``).
    """
    def _train(variables):
        # Nothing to fit for an empty predictor set.
        if len(variables) == 0:
            return None
        return LinearRegression().fit(X[variables], Y)

    def _score(model, variables):
        # The empty set is scored as a mean-only baseline with df=1.
        if len(variables) == 0:
            return AIC_score(Y, [Y.mean()] * len(Y), model, df=1)
        return AIC_score(Y, model.predict(X[variables]), model)

    _, predictors = stepwise_selection(X.columns, _train, _score, verbose=True)
    # statsmodels does not add an intercept on its own; append it explicitly.
    design = X[predictors].assign(const=1)
    results = sm.OLS(Y, design).fit()
    return results.summary()

# Run the selection and display the OLS summary as the cell output.
print_model(Y=Y, X=X)

Variables: INFR, PARTNERS, IT&Telecom, Finance, Construction, Manufacture, Energy, Medicine
Start: score=5531.25, constant
Step: score=5203.69, add INFR
Step: score=5196.54, add PARTNERS
Step: score=5190.67, add Medicine
Step: score=5190.67, unchanged None


0,1,2,3
Dep. Variable:,ROE,R-squared:,0.325
Model:,OLS,Adj. R-squared:,0.323
Method:,Least Squares,F-statistic:,140.9
Date:,"Tue, 10 Jan 2023",Prob (F-statistic):,1.6299999999999998e-74
Time:,16:38:15,Log-Likelihood:,-2590.3
No. Observations:,881,AIC:,5189.0
Df Residuals:,877,BIC:,5208.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
INFR,11.5000,0.625,18.409,0.000,10.274,12.726
PARTNERS,1.3164,0.435,3.027,0.003,0.463,2.170
Medicine,-1.2830,0.457,-2.805,0.005,-2.181,-0.385
const,-36.9475,1.821,-20.286,0.000,-40.522,-33.373

0,1,2,3
Omnibus:,1458.305,Durbin-Watson:,2.039
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2497517.347
Skew:,-9.736,Prob(JB):,0.0
Kurtosis:,263.111,Cond. No.,46.0


In [None]:
# Disabled code (not executed), kept for reference:
# df_quantiles = df[(df['ROE'] < df['ROE'].quantile(0.95)) & (df['ROE'] > df['ROE'].quantile(0.05))]
# df_tream = df[(df['IT&Telecom'] != 1) & (df['Others'] !=1 )].drop('IT&Telecom', axis=1)
# columns = ['ROE', 'INFR', 'PARTNERS', 'Finance', 'Construction', 'Manufacture', 'Energy', 'Medicine']