<a href="https://colab.research.google.com/github/hananeY-inventor/test-regression/blob/main/regression_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [33]:
# Read the CSV at DATA_PATH into the working DataFrame `df`.
DATA_PATH = '/content/sample_data/output (2).csv'
df = pd.read_csv(DATA_PATH)

In [17]:
# y is the 'moody_scores.csv' column; X is every remaining column except
# 'Country' and 'Year', which are removed together with the target.
target_col = 'moody_scores.csv'
y = df[target_col]
X = df.drop(columns=[target_col, 'Country', 'Year'])

In [19]:
# Learn the per-column scaling parameters from X ...
# NOTE(review): the scaler is fitted on every row of X.
scaler = StandardScaler().fit(X)

# ... and apply them to obtain the standardized feature matrix.
X_scaled = scaler.transform(X)

In [20]:
# Split the raw (unscaled) features into training and testing sets first, so
# the held-out rows play no part in estimating the scaling statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# both partitions. Fitting on the full dataset would leak the test set's
# mean/std into training and into any cross-validation run on X_train.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [25]:
# Forward sequential feature selection around a linear regression, scored by
# R^2 with 5-fold cross-validation. k_features='best' asks mlxtend to keep
# the subset size with the best cross-validated score.
selector_options = dict(k_features='best', forward=True, scoring='r2', cv=5)

# Build the selector and run the search on the training split.
sfs = SFS(LinearRegression(), **selector_options).fit(X_train, y_train)

# Map the chosen column positions back to their names in X.
selected_idx = list(sfs.k_feature_idx_)
selected_features = X.columns[selected_idx]
print('Selected features:', selected_features)

Selected features: Index(['X3_Inflation.csv', 'X4_Corruption Index.csv',
       'X5_Government Debt to GDP Ratio.csv',
       'X6_ data_GDP per Capita (PPP) .csv',
       'X9_Foreign Debt to GDP Ratio.csv'],
      dtype='object')


# Train the Model on the Selected Features:

In [26]:
# Keep only the training columns whose positions the selector chose.
selected_idx = list(sfs.k_feature_idx_)
X_train_selected = X_train[:, selected_idx]

# Fit a linear regression on the reduced training matrix.
model = LinearRegression()
model.fit(X_train_selected, y_train)

In [27]:
# Restrict the test matrix to the same selected column positions.
X_test_selected = X_test[:, list(sfs.k_feature_idx_)]

# Predict on the held-out rows.
y_pred = model.predict(X_test_selected)

# Score the predictions against the true test targets and report both metrics.
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print('R-squared:', test_r2)
print('MSE:', test_mse)

R-squared: 0.46614997061981456
MSE: 290.2761221203314


# **Check for Multicollinearity Using VIF**

In [31]:
# Variance inflation factors for the selected training features.
#
# Prepend an intercept column to the selected features. has_constant='add'
# forces the column to be added unconditionally, so index 0 of X_selected is
# always the constant and the feature columns always start at index 1.
X_selected = sm.add_constant(X_train_selected, has_constant='add')

# Names of the selected features, in the same order as the columns of
# X_train_selected.
selected_names = X.columns[list(sfs.k_feature_idx_)]

# One VIF per feature column; start at 1 to exclude the constant (index 0).
feature_positions = range(1, X_selected.shape[1])
vif = pd.DataFrame({
    "Variable": selected_names,
    "VIF": [variance_inflation_factor(X_selected, pos) for pos in feature_positions],
})
print(vif)

                              Variable       VIF
0                     X3_Inflation.csv  1.103002
1              X4_Corruption Index.csv  1.568135
2  X5_Government Debt to GDP Ratio.csv  1.125455
3   X6_ data_GDP per Capita (PPP) .csv  1.453826
4     X9_Foreign Debt to GDP Ratio.csv  1.100405


In [35]:
# Remove 'Country' and 'Year' before computing the correlation matrix.
# `df` is rebound (no inplace=True) and errors='ignore' is set so that this
# cell is idempotent: running it a second time, after the columns are already
# gone, is a no-op instead of a KeyError. Subsequent uses of `df` see only
# the remaining columns.
df = df.drop(columns=['Country', 'Year'], errors='ignore')

# Pairwise correlation of the remaining columns (displayed as the cell output).
df.corr()

Unnamed: 0,X1_Political.csv,X2_GDP.csv,X3_Inflation.csv,X4_Corruption Index.csv,X5_Government Debt to GDP Ratio.csv,X6_ data_GDP per Capita (PPP) .csv,X7-Trade Balance to GDP Ratio.csv,X8-Unemployment Rate.csv,X9_Foreign Debt to GDP Ratio.csv,moody_scores.csv
X1_Political.csv,1.0,-0.078444,-0.24024,0.811724,0.064508,0.518196,0.292577,-0.175863,0.071076,0.551893
X2_GDP.csv,-0.078444,1.0,0.031122,-0.126151,-0.174339,-0.009512,0.078961,-0.079566,-0.157554,-0.036498
X3_Inflation.csv,-0.24024,0.031122,1.0,-0.313455,-0.091407,-0.213077,-0.10993,0.13167,0.023112,-0.286117
X4_Corruption Index.csv,0.811724,-0.126151,-0.313455,1.0,0.170304,0.563521,0.281271,-0.151319,0.07033,0.69019
X5_Government Debt to GDP Ratio.csv,0.064508,-0.174339,-0.091407,0.170304,1.0,0.069092,-0.072162,0.156003,0.305578,0.004938
X6_ data_GDP per Capita (PPP) .csv,0.518196,-0.009512,-0.213077,0.563521,0.069092,1.0,0.62035,-0.423159,0.039228,0.486356
X7-Trade Balance to GDP Ratio.csv,0.292577,0.078961,-0.10993,0.281271,-0.072162,0.62035,1.0,-0.349336,-0.148079,0.282881
X8-Unemployment Rate.csv,-0.175863,-0.079566,0.13167,-0.151319,0.156003,-0.423159,-0.349336,1.0,-0.057169,-0.158183
X9_Foreign Debt to GDP Ratio.csv,0.071076,-0.157554,0.023112,0.07033,0.305578,0.039228,-0.148079,-0.057169,1.0,-0.018448
moody_scores.csv,0.551893,-0.036498,-0.286117,0.69019,0.004938,0.486356,0.282881,-0.158183,-0.018448,1.0


# **Calculate skewness for all columns**

In [37]:
# Skewness of each column of df (computed down the rows, one value per column).
skewness = df.skew(axis=0)
print(skewness)

X1_Political.csv                      -0.019085
X2_GDP.csv                            -0.714635
X3_Inflation.csv                       7.316092
X4_Corruption Index.csv                0.450821
X5_Government Debt to GDP Ratio.csv    1.804550
X6_ data_GDP per Capita (PPP) .csv     1.707025
X7-Trade Balance to GDP Ratio.csv     -0.528948
X8-Unemployment Rate.csv               1.440235
X9_Foreign Debt to GDP Ratio.csv       1.637251
moody_scores.csv                      -0.200712
dtype: float64
