####  Importing required liabraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

#### Reading data

In [2]:
train = pd.read_csv('dataset/train.csv')

In [3]:
test = pd.read_csv('dataset/test.csv')

In [4]:
train1 = pd.read_csv('dataset/train1.csv')

#### Fill/Treat missing values

In [5]:
train1['Number_of_Windows'].fillna(train1['Number_of_Windows'].mode()[0], inplace=True)
train1['Furnishing'].fillna(train1['Furnishing'].mode()[0], inplace=True)
train1['Frequency_of_Powercuts'].fillna(train1['Frequency_of_Powercuts'].mode()[0], inplace=True)
train1['Crime_Rate'].fillna(train1['Crime_Rate'].mode()[0], inplace=True)
train1['Dust_and_Noise'].fillna(train1['Dust_and_Noise'].mode()[0], inplace=True)

In [6]:
train1['Traffic_Density_Score'].fillna(train1['Traffic_Density_Score'].median, inplace=True)

In [7]:
train1['Property_Area'].fillna(train1['Property_Area'].median, inplace=True)

In [8]:
test1 = pd.read_csv('dataset/test1.csv')

In [9]:
test1['Number_of_Windows'].fillna(test1['Number_of_Windows'].mode()[0], inplace=True)
test1['Furnishing'].fillna(test1['Furnishing'].mode()[0], inplace=True)
test1['Frequency_of_Powercuts'].fillna(test1['Frequency_of_Powercuts'].mode()[0], inplace=True)
test1['Crime_Rate'].fillna(test1['Crime_Rate'].mode()[0], inplace=True)
test1['Dust_and_Noise'].fillna(test1['Dust_and_Noise'].mode()[0], inplace=True)

In [10]:
test1['Traffic_Density_Score'].fillna(test1['Traffic_Density_Score'].median, inplace=True)
test1['Property_Area'].fillna(test1['Property_Area'].median, inplace=True)

#### Drop irrelevant features

In [11]:
train1=train1.drop('Property_ID',axis=1)

In [12]:
test1=test1.drop('Property_ID',axis=1)

In [13]:
train1=train1.drop('Number_of_Windows',axis=1)
test1=test1.drop('Number_of_Windows',axis=1)

In [14]:
train1=train1.drop('Number_of_Doors',axis=1)
test1=test1.drop('Number_of_Doors',axis=1)

In [15]:
train1.shape

(39499, 12)

#### Shape of the datasets

In [16]:
train1.shape

(39499, 12)

In [17]:
test1.shape

(10500, 11)

#### Converting categorical features to numerial

##### 1. frequency encoding on train data

In [18]:
def frequence_encoder(train1,col):
    """
    This function encodes a categorical column based on the frequency of their occurence.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
        target_col : Dependent column to be predicted.
    return: 
          Mean encoded dict for column
    """
    freq_value = train1.groupby(col).size()/len(train1)
    freq_dict = freq_value.to_dict()
    train1["Freq_encoded_"+col] = train1[col].replace(freq_dict)
    return freq_dict

In [19]:
frequence_encoder(train1,'Property_Type')
frequence_encoder(train1,'Furnishing')
frequence_encoder(train1,'Power_Backup')
frequence_encoder(train1,'Water_Supply')
frequence_encoder(train1,'Crime_Rate')
frequence_encoder(train1,'Dust_and_Noise')

{'High': 0.08131851439276944,
 'Low': 0.05245702422846148,
 'Medium': 0.8662244613787691}

In [20]:
train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39499 entries, 0 to 39498
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Property_Type                39499 non-null  object 
 1   Property_Area                39499 non-null  int64  
 2   Furnishing                   39499 non-null  object 
 3   Frequency_of_Powercuts       39499 non-null  float64
 4   Power_Backup                 39499 non-null  object 
 5   Water_Supply                 39499 non-null  object 
 6   Traffic_Density_Score        39499 non-null  float64
 7   Crime_Rate                   39499 non-null  object 
 8   Dust_and_Noise               39499 non-null  object 
 9   Air_Quality_Index            39499 non-null  int64  
 10  Neighborhood_Review          39499 non-null  float64
 11  Habitability_score           39499 non-null  float64
 12  Freq_encoded_Property_Type   39499 non-null  float64
 13  Freq_encoded_Fur

##### 2. frequency encoding on test data

In [21]:
train1 = train1.drop(['Property_Type','Furnishing','Power_Backup','Water_Supply','Crime_Rate','Dust_and_Noise'],axis=1)

In [22]:
def frequence_encoder(test1,col):
    """
    This function encodes a categorical column based on the frequency of their occurence.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
        target_col : Dependent column to be predicted.
    return: 
          Mean encoded dict for column
    """
    freq_value = test1.groupby(col).size()/len(test1)
    freq_dict = freq_value.to_dict()
    test1["Freq_encoded_"+col] = test1[col].replace(freq_dict)
    return freq_dict

In [23]:
frequence_encoder(test1,'Property_Type')
frequence_encoder(test1,'Furnishing')
frequence_encoder(test1,'Power_Backup')
frequence_encoder(test1,'Water_Supply')
frequence_encoder(test1,'Crime_Rate')
frequence_encoder(test1,'Dust_and_Noise')

{'High': 0.07638095238095238, 'Low': 0.05161904761904762, 'Medium': 0.872}

In [24]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Property_Type                10500 non-null  object 
 1   Property_Area                10500 non-null  int64  
 2   Furnishing                   10500 non-null  object 
 3   Frequency_of_Powercuts       10500 non-null  float64
 4   Power_Backup                 10500 non-null  object 
 5   Water_Supply                 10500 non-null  object 
 6   Traffic_Density_Score        10500 non-null  float64
 7   Crime_Rate                   10500 non-null  object 
 8   Dust_and_Noise               10500 non-null  object 
 9   Air_Quality_Index            10500 non-null  int64  
 10  Neighborhood_Review          10500 non-null  float64
 11  Freq_encoded_Property_Type   10500 non-null  float64
 12  Freq_encoded_Furnishing      10500 non-null  float64
 13  Freq_encoded_Pow

In [25]:
test1 = test1.drop(['Property_Type','Furnishing','Power_Backup','Water_Supply','Crime_Rate','Dust_and_Noise'],axis=1)

#### Test and train data

In [26]:
X = train1.drop('Habitability_score', 1)
y = train1.Habitability_score

  X = train1.drop('Habitability_score', 1)


In [27]:
X = pd.get_dummies(X)
train1 = pd.get_dummies(train1)
test1 = pd.get_dummies(test1)

In [28]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

#### Baseline model fitting

In [29]:
from sklearn import linear_model
from sklearn import metrics

In [30]:
reg = linear_model.LinearRegression()
reg.fit(x_train,y_train)
metrics.r2_score(y_train,reg.predict(x_train)), metrics.r2_score(y_test,reg.predict(x_test))

(0.391322713594921, 0.39359184594949126)

Baselime model accuracy 


Train:0.39268823338002445

Test: 0.39045155432470235

In [31]:
x_train.head()

Unnamed: 0,Property_Area,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Freq_encoded_Property_Type,Freq_encoded_Furnishing,Freq_encoded_Power_Backup,Freq_encoded_Water_Supply,Freq_encoded_Crime_Rate,Freq_encoded_Dust_and_Noise
4379,222,0.0,5.27,88,4.18,0.369528,0.541761,0.773235,0.502696,0.293172,0.866224
21272,460,0.0,6.81,82,2.61,0.369528,0.541761,0.226765,0.048887,0.134257,0.866224
18488,2004,0.0,7.05,104,4.78,0.290564,0.541761,0.773235,0.502696,0.512899,0.866224
21953,930,0.0,7.59,88,4.8,0.012912,0.26188,0.773235,0.502696,0.512899,0.866224
35556,1615,1.0,7.23,160,3.7,0.290564,0.541761,0.773235,0.250563,0.512899,0.081319


In [32]:
x_train.columns

Index(['Property_Area', 'Frequency_of_Powercuts', 'Traffic_Density_Score',
       'Air_Quality_Index', 'Neighborhood_Review',
       'Freq_encoded_Property_Type', 'Freq_encoded_Furnishing',
       'Freq_encoded_Power_Backup', 'Freq_encoded_Water_Supply',
       'Freq_encoded_Crime_Rate', 'Freq_encoded_Dust_and_Noise'],
      dtype='object')

#### Scalig of data

In [33]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

train1[['Property_Area', 'Frequency_of_Powercuts', 'Traffic_Density_Score','Air_Quality_Index','Neighborhood_Review']] = scaler.fit_transform(train1[['Property_Area', 'Frequency_of_Powercuts', 'Traffic_Density_Score','Air_Quality_Index','Neighborhood_Review']])

train1.head()

Unnamed: 0,Property_Area,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Habitability_score,Freq_encoded_Property_Type,Freq_encoded_Furnishing,Freq_encoded_Power_Backup,Freq_encoded_Water_Supply,Freq_encoded_Crime_Rate,Freq_encoded_Dust_and_Noise
0,0.000215,0.0,0.594349,0.086207,0.772,71.98,0.369528,0.541761,0.773235,0.250563,0.293172,0.866224
1,0.022636,0.333333,0.440969,0.091954,0.71,71.2,0.369528,0.26188,0.773235,0.197853,0.512899,0.866224
2,0.022779,0.0,0.751766,0.1159,0.762,71.39,0.369528,0.196359,0.773235,0.250563,0.293172,0.866224
3,0.028608,0.666667,0.621594,0.095785,0.268,31.46,0.369528,0.26188,0.226765,0.250563,0.059672,0.866224
4,0.076455,0.0,0.550959,0.111111,0.954,93.7,0.107294,0.196359,0.773235,0.502696,0.512899,0.866224


In [34]:
X = train1.drop('Habitability_score', 1)
y = train1.Habitability_score

  X = train1.drop('Habitability_score', 1)


In [35]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
reg = linear_model.LinearRegression()
reg.fit(x_train,y_train)
metrics.r2_score(y_train,reg.predict(x_train)), metrics.r2_score(y_test,reg.predict(x_test))

(0.39181395706707844, 0.39243778313164335)

When Baselime model is scaled and trained, the accuracy of train data increases and that of test data decreases


Train:0.3993308654176302

Test: 0.3743658802649421

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import SplineTransformer

In [37]:
from sklearn import linear_model
from sklearn import metrics

#### Make pipeline

In [38]:
# Overfitted model

pipe = make_pipeline(PowerTransformer(),PolynomialFeatures(3),RobustScaler(),linear_model.LinearRegression())
pipe.fit(x_train, y_train)

In [43]:
pipe = make_pipeline(PowerTransformer(),PolynomialFeatures(3),RobustScaler(),linear_model.LinearRegression())
pipe.fit(x_train, y_train)

metrics.r2_score(y_train,pipe.predict(x_train)),metrics.r2_score(y_test,pipe.predict(x_test))

(0.7381896031681949, 0.7186525401961842)

In [44]:
pipe = make_pipeline(PowerTransformer(),PolynomialFeatures(4),RobustScaler(),linear_model.LinearRegression())
pipe.fit(x_train, y_train)

print(metrics.r2_score(y_train,pipe.predict(x_train)),metrics.r2_score(y_test,pipe.predict(x_test)))

0.7993149556241681 0.7630146045106776


In [41]:
pipe = make_pipeline(PowerTransformer(),PolynomialFeatures(4),SplineTransformer(degree=2, n_knots=5),RobustScaler(), \
                     linear_model.LinearRegression())
pipe.fit(x_train, y_train)

print(metrics.r2_score(y_train,pipe.predict(x_train)),metrics.r2_score(y_test,pipe.predict(x_test)))

0.8631187406489289 -45980049724.23167


In [42]:
pipe = make_pipeline(PowerTransformer(),PolynomialFeatures(4),RobustScaler(),linear_model.LassoLars(alpha=.1, normalize=False))
pipe.fit(x_train, y_train)

print(metrics.r2_score(y_train,pipe.predict(x_train)),metrics.r2_score(y_test,pipe.predict(x_test)))

0.7462044436313067 0.7286517392626175


In [43]:
# Regularised model
pipe = make_pipeline(PowerTransformer(),PolynomialFeatures(4),SplineTransformer(degree=2, n_knots=5),RobustScaler(), \
                     linear_model.LassoLars(alpha=.1, normalize=False))
pipe.fit(x_train, y_train)

metrics.r2_score(y_train,pipe.predict(x_train)),metrics.r2_score(y_test,pipe.predict(x_test))

(0.7105546105856839, 0.6947816268457367)

In [45]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [46]:
k_range = list(range(1, 8))
params = dict(n_neighbors = k_range)
knn_regressor = GridSearchCV(KNeighborsRegressor(), params, cv =10, scoring = 'neg_mean_squared_error')
knn_regressor.fit(x_train, y_train)
print(metrics.r2_score(y_train,knn_regressor.predict(x_train)),metrics.r2_score(y_test,knn_regressor.predict(x_test)))

0.776613654142635 0.6474028472442682


In [47]:
knn_regressor.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=10, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7]},
             scoring='neg_mean_squared_error')>

In [48]:
knn_regressor.best_params_

{'n_neighbors': 5}

In [50]:
depth  =list(range(3,8))
param_grid =dict(max_depth =depth)
tree =GridSearchCV(DecisionTreeRegressor(),param_grid,cv =10)
tree.fit(x_train,y_train)
print(metrics.r2_score(y_train,tree.predict(x_train)),metrics.r2_score(y_test,tree.predict(x_test)))

0.7728565382761448 0.7417889041320842


In [51]:
tree.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [3, 4, 5, 6, 7]})>

In [52]:
tree.best_params_

{'max_depth': 7}

In [53]:
tuned_params = {'n_estimators': [100,  300,  500], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
random_regressor = RandomizedSearchCV(RandomForestRegressor(), tuned_params, n_iter = 20, scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = -1)
random_regressor.fit(x_train, y_train)
metrics.r2_score(y_train,random_regressor.predict(x_train)),metrics.r2_score(y_test,random_regressor.predict(x_test))

(0.9278921301745515, 0.8092570668640175)

In [52]:
random_regressor.get_params

<bound method BaseEstimator.get_params of RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 300, 500]},
                   scoring='neg_mean_absolute_error')>

In [53]:
random_regressor.best_params_

{'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2}

In [54]:
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [55]:
xg_reg = xgb.XGBRegressor()
xg_reg.fit(x_train,y_train)
metrics.r2_score(y_train,xg_reg.predict(x_train)),metrics.r2_score(y_test,xg_reg.predict(x_test))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


(0.8675711115189058, 0.7984985497884372)

In [56]:
prediction = (random_regressor.predict(test1))

In [57]:
prediction

array([75.22190327, 81.54965034, 83.19422757, ..., 89.92023282,
       82.36874595, 81.54965034])

In [58]:
prediction = pd.DataFrame(prediction)

In [59]:
prediction.to_csv('D:/Resume_projects/TopMentor_classes_material/7 and 8 May Morning session Documents (1)/7 and 8 May Morning session Documents/Airline/Airline/prediction.csv',index=False)

In [62]:
from sklearn.feature_selection import RFE
rfe = RFE(xg_reg, n_features_to_select=15, step=1)   # running RFE with 15 variables as output
rfe = rfe.fit(x_train, y_train)

In [63]:
rfe.support_

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [64]:
rfe.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [66]:

list(zip(x_train.columns, rfe.support_, rfe.ranking_))

[('Property_Area', True, 1),
 ('Frequency_of_Powercuts', True, 1),
 ('Traffic_Density_Score', True, 1),
 ('Air_Quality_Index', True, 1),
 ('Neighborhood_Review', True, 1),
 ('Freq_encoded_Property_Type', True, 1),
 ('Freq_encoded_Furnishing', True, 1),
 ('Freq_encoded_Power_Backup', True, 1),
 ('Freq_encoded_Water_Supply', True, 1),
 ('Freq_encoded_Crime_Rate', True, 1),
 ('Freq_encoded_Dust_and_Noise', True, 1)]

In [68]:
col = x_train.columns[rfe.support_]

In [69]:
col

Index(['Property_Area', 'Frequency_of_Powercuts', 'Traffic_Density_Score',
       'Air_Quality_Index', 'Neighborhood_Review',
       'Freq_encoded_Property_Type', 'Freq_encoded_Furnishing',
       'Freq_encoded_Power_Backup', 'Freq_encoded_Water_Supply',
       'Freq_encoded_Crime_Rate', 'Freq_encoded_Dust_and_Noise'],
      dtype='object')

In [71]:
x_train_new = x_train[col]

In [72]:
x_train_new

Unnamed: 0,Property_Area,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Freq_encoded_Property_Type,Freq_encoded_Furnishing,Freq_encoded_Power_Backup,Freq_encoded_Water_Supply,Freq_encoded_Crime_Rate,Freq_encoded_Dust_and_Noise
32218,0.033829,1.000000,0.593340,0.084291,0.460,0.146991,0.541761,0.773235,0.502696,0.293172,0.866224
38507,0.001538,0.666667,0.654894,0.133142,0.572,0.369528,0.261880,0.773235,0.197853,0.512899,0.866224
26277,0.063975,0.333333,0.537841,0.106322,0.516,0.290564,0.541761,0.773235,0.048887,0.293172,0.866224
39187,0.037942,0.333333,0.507568,0.093870,0.570,0.290564,0.196359,0.773235,0.502696,0.134257,0.866224
25229,0.012337,0.000000,0.879919,0.165709,0.860,0.072711,0.196359,0.773235,0.250563,0.512899,0.081319
...,...,...,...,...,...,...,...,...,...,...,...
13214,0.070948,0.000000,0.623613,0.106322,0.956,0.290564,0.541761,0.773235,0.502696,0.512899,0.866224
3759,0.064476,0.000000,0.544904,0.090996,0.834,0.107294,0.261880,0.773235,0.502696,0.293172,0.866224
18966,0.025175,0.000000,0.812311,0.118774,0.560,0.369528,0.541761,0.773235,0.197853,0.134257,0.081319
35505,0.003004,0.000000,0.528759,0.109195,0.768,0.369528,0.541761,0.773235,0.250563,0.293172,0.866224


In [73]:
import statsmodels.api as sm

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [75]:
x_train_sm = sm.add_constant(x_train_new)
logm2 = sm.GLM(y_train, x_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Habitability_score,No. Observations:,27649.0
Model:,GLM,Df Residuals:,27637.0
Model Family:,Binomial,Df Model:,11.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-inf
Date:,"Thu, 25 Aug 2022",Deviance:,162030000.0
Time:,21:00:41,Pearson chi2:,6.79e+23
No. Iterations:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.506e+17,5.98e+06,4.19e+10,0.000,2.51e+17,2.51e+17
Property_Area,5.991e+16,5.82e+06,1.03e+10,0.000,5.99e+16,5.99e+16
Frequency_of_Powercuts,8.071e+16,4.15e+06,1.95e+10,0.000,8.07e+16,8.07e+16
Traffic_Density_Score,-3.071e+16,3.63e+06,-8.47e+09,0.000,-3.07e+16,-3.07e+16
Air_Quality_Index,-2.186e+16,7.35e+06,-2.97e+09,0.000,-2.19e+16,-2.19e+16
Neighborhood_Review,4.566e+17,1.03e+07,4.44e+10,0.000,4.57e+17,4.57e+17
Freq_encoded_Property_Type,-3.895e+16,3.66e+06,-1.06e+10,0.000,-3.89e+16,-3.89e+16
Freq_encoded_Furnishing,5.231e+16,2.61e+06,2.01e+10,0.000,5.23e+16,5.23e+16
Freq_encoded_Power_Backup,-1.117e+17,1.76e+06,-6.33e+10,0.000,-1.12e+17,-1.12e+17


In [77]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [78]:

vif = pd.DataFrame()
vif['Features'] = x_train_new.columns
vif['VIF'] = [variance_inflation_factor(x_train_new.values, i) for i in range(x_train_new.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

Unnamed: 0,Features,VIF
4,Neighborhood_Review,116.93
9,Freq_encoded_Crime_Rate,25.52
2,Traffic_Density_Score,24.27
8,Freq_encoded_Water_Supply,12.74
10,Freq_encoded_Dust_and_Noise,8.81
7,Freq_encoded_Power_Backup,8.7
6,Freq_encoded_Furnishing,7.44
5,Freq_encoded_Property_Type,6.37
3,Air_Quality_Index,5.53
1,Frequency_of_Powercuts,3.49


In [79]:
final_col = x_train_new.columns

In [81]:
x_test_new = x_test[final_col]

In [84]:
model_new = linear_model.LinearRegression()
model_new.fit(x_train_new,y_train)

In [85]:
model_new.predict(x_train_new)

array([63.83316992, 63.4037228 , 62.42572389, ..., 58.7324014 ,
       73.11267239, 78.818658  ])

In [86]:
xg_reg = xgb.XGBRegressor()
xg_reg.fit(x_train_new,y_train)
metrics.r2_score(y_train,xg_reg.predict(x_train_new)),metrics.r2_score(y_test,xg_reg.predict(x_test_new))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


(0.8675711115189058, 0.7984985497884372)

In [87]:
x_train_new

Unnamed: 0,Property_Area,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Freq_encoded_Property_Type,Freq_encoded_Furnishing,Freq_encoded_Power_Backup,Freq_encoded_Water_Supply,Freq_encoded_Crime_Rate,Freq_encoded_Dust_and_Noise
32218,0.033829,1.000000,0.593340,0.084291,0.460,0.146991,0.541761,0.773235,0.502696,0.293172,0.866224
38507,0.001538,0.666667,0.654894,0.133142,0.572,0.369528,0.261880,0.773235,0.197853,0.512899,0.866224
26277,0.063975,0.333333,0.537841,0.106322,0.516,0.290564,0.541761,0.773235,0.048887,0.293172,0.866224
39187,0.037942,0.333333,0.507568,0.093870,0.570,0.290564,0.196359,0.773235,0.502696,0.134257,0.866224
25229,0.012337,0.000000,0.879919,0.165709,0.860,0.072711,0.196359,0.773235,0.250563,0.512899,0.081319
...,...,...,...,...,...,...,...,...,...,...,...
13214,0.070948,0.000000,0.623613,0.106322,0.956,0.290564,0.541761,0.773235,0.502696,0.512899,0.866224
3759,0.064476,0.000000,0.544904,0.090996,0.834,0.107294,0.261880,0.773235,0.502696,0.293172,0.866224
18966,0.025175,0.000000,0.812311,0.118774,0.560,0.369528,0.541761,0.773235,0.197853,0.134257,0.081319
35505,0.003004,0.000000,0.528759,0.109195,0.768,0.369528,0.541761,0.773235,0.250563,0.293172,0.866224


In [88]:
x_train

Unnamed: 0,Property_Area,Frequency_of_Powercuts,Traffic_Density_Score,Air_Quality_Index,Neighborhood_Review,Freq_encoded_Property_Type,Freq_encoded_Furnishing,Freq_encoded_Power_Backup,Freq_encoded_Water_Supply,Freq_encoded_Crime_Rate,Freq_encoded_Dust_and_Noise
32218,0.033829,1.000000,0.593340,0.084291,0.460,0.146991,0.541761,0.773235,0.502696,0.293172,0.866224
38507,0.001538,0.666667,0.654894,0.133142,0.572,0.369528,0.261880,0.773235,0.197853,0.512899,0.866224
26277,0.063975,0.333333,0.537841,0.106322,0.516,0.290564,0.541761,0.773235,0.048887,0.293172,0.866224
39187,0.037942,0.333333,0.507568,0.093870,0.570,0.290564,0.196359,0.773235,0.502696,0.134257,0.866224
25229,0.012337,0.000000,0.879919,0.165709,0.860,0.072711,0.196359,0.773235,0.250563,0.512899,0.081319
...,...,...,...,...,...,...,...,...,...,...,...
13214,0.070948,0.000000,0.623613,0.106322,0.956,0.290564,0.541761,0.773235,0.502696,0.512899,0.866224
3759,0.064476,0.000000,0.544904,0.090996,0.834,0.107294,0.261880,0.773235,0.502696,0.293172,0.866224
18966,0.025175,0.000000,0.812311,0.118774,0.560,0.369528,0.541761,0.773235,0.197853,0.134257,0.081319
35505,0.003004,0.000000,0.528759,0.109195,0.768,0.369528,0.541761,0.773235,0.250563,0.293172,0.866224


In [90]:
x_train_new = x_train_new.drop('Neighborhood_Review', axis = 1, inplace = True)

In [94]:
logm1 = sm.GLM(y_train,(sm.add_constant(x_train_new)), family = sm.families.Binomial())
logm1.fit().summary()

TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'

In [None]:
rfe = RFE(xg_reg, n_features_to_select=15, step=1)   # running RFE with 15 variables as output
rfe = rfe.fit(x_train, y_train)

In [93]:
x_train_sm = sm.add_constant(x_train_new)
logm2 = sm.GLM(y_train, x_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'

In [92]:
vif = pd.DataFrame()
vif['Features'] = x_train_new.columns
vif['VIF'] = [variance_inflation_factor(x_train_new.values, i) for i in range(x_train_new.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

AttributeError: 'NoneType' object has no attribute 'columns'