# Feature Selection for Linear Regression

## GII

In [1]:
# Feature selection for linear regression to predict GII (Gender inequality Index)

import pandas as pd
import numpy as np
from sklearn import linear_model
from itertools import chain, combinations
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from scipy.stats import zscore
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso 
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 



#returns dataframes containing independent variables, dependent variables, continent data
def preprocessing(data, remove=[], dep_name='', normalize = False ):
    data= data.replace('..', np.nan)
    
    # shuffle data

    data = data.sample(frac=1)

    # Separate continent data (unused)
    continent_data = data.iloc[188:197]
    data = data.iloc[0:188]
    
    # remove unused columns
    data = data.drop(columns=remove)
    
    # Dataframe containing independent variables
    
    ind = data.drop(columns=[dep_name])

    # Separate independent variable (target)
    allbutdep = list(data.columns)
    allbutdep.remove(dep_name)
    dep = data.drop(columns=allbutdep)

    # fill missing field with corresponding column's median
    for i in ind.columns:
        ind[i].fillna(ind[i].median(), inplace = True)
    
    dep.fillna(dep.median(), inplace = True)
    
    if(normalize):
    # normalize data
        scaler = MinMaxScaler() 
        scaled_values = scaler.fit_transform(ind) 
        ind.loc[:,:] = scaled_values
    
    
    return ind, dep, continent_data



In [5]:
# linear regresion model
lm = linear_model.LinearRegression()

#lasso
lasso_ridge = {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20]}
lasso = linear_model.Lasso(max_iter=1000000, tol=.1)
lasso_regressor = GridSearchCV(lasso,lasso_ridge, scoring='r2',cv=5)

# Ridge
ridge = linear_model.Ridge()
ridge_regressor = GridSearchCV(ridge,lasso_ridge, scoring='r2',cv=5)


Identifying features that produce high coefficients of determination (R^2) 

In [3]:
# function that generates powerset of a list
def powerset(iterable):
    s = list(iterable)
    return list(chain.from_iterable(combinations(s, r) for r in range(len(s)+1)))

# tentative: use correlation coefficient (matthews_corrcoef)

#return list with all possible combination of features and R^2 score
def feature_select(independent, dependent, regression= 'linear', num_results = 20):
    # Generate powerset of independet variables 
    ind_vars = list(independent.columns)
    PS_ind_vars= powerset(ind_vars)
    PS_ind_vars = PS_ind_vars[len(dependent.columns)+1:]
    
    # number of sets in superset
    PS_cardinality = len(PS_ind_vars)
    print('Generating superset of cardinality=', PS_cardinality,'for',regression,'regression')
    
    #print('top '+ str(num_results)+':\n')
    
    # tuple to list
    for i in range(len(PS_ind_vars)):
            PS_ind_vars[i] = list(PS_ind_vars[i])
            
    var_scores = []
    #count= 0
    # Store feature subset and score obtained using cross validation
    for i in PS_ind_vars:
        current_ds = independent[i]
        if(regression=='linear'):
            current_score = cross_val_score(lm, current_ds, dependent, scoring='r2', cv=5)
            var_scores.append([i,current_score.mean(), lm])
        elif(regression=='lasso'):
            #print(count,end='-')
            #count+=1
            lasso_regressor.fit(current_ds,dependent)
            var_scores.append([i,lasso_regressor.best_score_, lasso_regressor])
        elif(regression=='ridge'):
            #print(count,end='-')
            #count+=1
            ridge_regressor.fit(current_ds,dependent)
            var_scores.append([i,ridge_regressor.best_score_, ridge_regressor])
            
        
        
    #sort based on score
    var_scores.sort(key = lambda x: x[1], reverse = True)

    # Number of top results to display
    top_vars = num_results
    var_scores =var_scores[:top_vars+1]
    
    print('Done')
    return var_scores

# Display combination of features and corresponding score
def feature_display(var_scores, show_index = True):
    index = 0
    for i in var_scores:
        if(show_index):
            print('index:',index)
        print('variables (',len(i[0]),'):')
        for j in i[0]:
            print('\t',j)
        print('R^2 score: ', i[1],'\n')
        index+=1

# Display best combination of features     
def highest_score_num_features(num, scores, display = False):
    for i in range(len(scores)):
        if (len(scores[i][0]) == num):
            if(display):
                print('best combination of features: \nindex: ', i)
                feature_display([scores[i]],False)
            return i 



In [6]:
# import data
GII_data = pd.read_csv('gender_inequality.csv')

# independet feature name
GII_ind_variable = 'Gender Inequality Index (GII)'

# Features that will not be used
GII_remove =['GII Rank', 'Country']

# preprocessing
GII_in, GII,GII_continent = preprocessing(GII_data,GII_remove,GII_ind_variable)

# Generate scores
GII_scores = feature_select(GII_in, GII)

#feature_display(GII_scores)

GII_lasso_scores =feature_select(GII_in, GII, 'lasso')

#feature_display(GII_lasso_scores)

GII_ridge_scores =feature_select(GII_in, GII, 'ridge')



Generating superset of cardinality= 126 for linear regression
Done
Generating superset of cardinality= 126 for lasso regression


  positive)


Done
Generating superset of cardinality= 126 for ridge regression
Done


In [7]:
# Display best combination of features

print('\t==Using linear regression==\n')
best_GII_i = highest_score_num_features(3, GII_scores, True)
print('\t==Using lasso regression==\n')
best_GII_i_lasso= highest_score_num_features(3, GII_lasso_scores, True)
print('\t==Using ridge regression==\n')
best_GII_i_ridge= highest_score_num_features(3, GII_ridge_scores, True)

	==Using linear regression==

best combination of features: 
index:  9
variables ( 3 ):
	 Adolescent Birth Rate
	 Percent Representation in Parliament
	 Population with Secondary Education (Female)
R^2 score:  0.7397194327344839 

	==Using lasso regression==

best combination of features: 
index:  11
variables ( 3 ):
	 Adolescent Birth Rate
	 Percent Representation in Parliament
	 Population with Secondary Education (Female)
R^2 score:  0.740112489215803 

	==Using ridge regression==

best combination of features: 
index:  9
variables ( 3 ):
	 Adolescent Birth Rate
	 Percent Representation in Parliament
	 Population with Secondary Education (Female)
R^2 score:  0.7401165864225664 



Predictions

In [26]:
# 80-20 split

GII_test = GII_in[GII_scores[best_GII_i][0]]
X_train, X_test, y_train, y_test = train_test_split(GII_test, GII, test_size=0.20)

GII_split = lm.fit(X_train, y_train)
prediction = GII_split.predict(X_test)
print('80-20 R^2 score:', r2_score(y_test, prediction))

# Values corresponding to highest and lowest ranked country 
ideal_x = GII_data[GII_scores[best_GII_i][0]].iloc[0].values
ideal_x = [float(i) for i in ideal_x]
ideal_x = np.reshape(ideal_x,(1,-1)) #[7.8,39.6, 97.4]

worst_x = list(GII_data[GII_scores[best_GII_i][0]].iloc[187])
worst_x = [float(i) for i in worst_x]
worst_x = np.reshape(worst_x,(1,-1))

# Final training before prediction
GII_lm = lm.fit(GII_test, GII)
high_pred = GII_lm.predict(ideal_x)
low_pred = GII_lm.predict(worst_x)

print('Norway-> predicted: ',high_pred, 'actual:',GII_data.iloc[0].values[2])
print('Niger-> predicted:',low_pred, 'actual:',GII_data.iloc[187].values[2])




80-20 R^2 score: 0.6650160502031258
Norway-> predicted:  [[0.0969324]] actual: 0.067
Niger-> predicted: [[0.83691163]] actual: 0.713


## GDI

In [None]:
# Feature selection for linear regression to predict GDI (Gender development Index)


# import data
GDI_data = pd.read_csv('gender_development.csv')

# independet feature name
GDI_ind_variable = 'Gender Development Index (GDI)'

# Features that will not be used
GDI_remove =['GDI Rank', 'Country']

# preprocessing 
GDI_in, GDI,GDI_continent = preprocessing(GDI_data,GDI_remove,GDI_ind_variable)
# generate combination of features
GDI_scores = feature_select(GDI_in, GDI,'linear',50)

#feature_display(GDI_scores)

In [None]:
# display best combination 
best_GDI_i = highest_score_num_features(5, GDI_scores, True)

## Human Development Index

In [None]:
# import data
HDI_data = pd.read_csv('human_development.csv')

HDI_data['Gross National Income (GNI) per Capita']= HDI_data['Gross National Income (GNI) per Capita'].apply(lambda x: float(x.replace(',','')))

# independet feature name
HDI_ind_variable = 'Human Development Index (HDI)'

# Features that will not be used
HDI_remove =['HDI Rank', 'Country']

HDI_in, HDI,HDI_continent = preprocessing(HDI_data,HDI_remove,HDI_ind_variable)

HDI_scores = feature_select(HDI_in, HDI, 'linear',20)

#feature_display(HDI_scores)


In [None]:
best_HDI_i = highest_score_num_features(2, HDI_scores, True)