In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
import itertools
import time
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('expanded', delimiter='\t', header=None, names=['characteristics'])
data = data['characteristics'].str.split(',', expand=True)
data = data.drop(range(7)).reset_index(drop=True)


In [3]:
# implement descriptive column names
column_names = ['edibility','cap_shape', 'cap_surface', 'cap_color', 'bruises', 
                'odor', 'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color', 
                'stalk_shape', 'stalk_root', 'stalk_surface_above_ring', 'stalk_surface_below_ring', 
                'stalk_color_above_ring', 'stalk_color_below_ring', 'veil_type', 'veil_color', 
                'ring_number', 'ring_type', 'spore_print_color', 'population','habitat']
data.columns = column_names
data = data.drop(data[data.isnull().any(axis=1)].index)

In [4]:
# get dummy variables for two-level response
def var_transform (data):
    data['edibility']=data['edibility'].apply(lambda x: 1 if x=='EDIBLE' else 0)
    #data['odor']=data['odor'].apply(lambda x: 0 if x=='ALMOND' or x=='ANISE' else 1)
    # put further variable transformation here 
    return data
var_transform(data).head(2)

Unnamed: 0,edibility,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
1,1,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS


In [5]:
#Creating training and test datasets
#np.random.seed(2)
#train = data.sample(round(data.shape[0]*0.8))
#test = data.drop(train.index)

In [6]:
# create dummy variables for each categorical variable
cat_vars = list(data.select_dtypes(include=['object']).columns)
dummy_vars = pd.get_dummies(data[cat_vars], prefix=cat_vars)
# concatenate the dummy variables with the original dataset
data = pd.concat([data.drop(cat_vars, axis=1), dummy_vars], axis=1)

print(data.shape)

(8416, 118)


In [7]:
X = data[[
 'cap_shape_BELL',
 'cap_shape_CONICAL',
 'cap_shape_CONVEX',
 'cap_shape_FLAT',
 'cap_shape_KNOBBED',
 'cap_shape_SUNKEN',
 'cap_surface_FIBROUS',
 'cap_surface_GROOVES',
 'cap_surface_SCALY',
 'cap_surface_SMOOTH',
 'cap_color_BROWN',
 'cap_color_BUFF',
 'cap_color_CINNAMON',
 'cap_color_GRAY',
 'cap_color_GREEN',
 'cap_color_PINK',
 'cap_color_PURPLE',
 'cap_color_RED',
 'cap_color_WHITE',
 'cap_color_YELLOW',
 'bruises_BRUISES',
 'bruises_NO',
 'odor_ALMOND',
 'odor_ANISE',
 'odor_CREOSOTE',
 'odor_FISHY',
 'odor_FOUL',
 'odor_MUSTY',
 'odor_NONE',
 'odor_PUNGENT',
 'odor_SPICY',
 'gill_attachment_ATTACHED',
 'gill_attachment_FREE',
 'gill_spacing_CLOSE',
 'gill_spacing_CROWDED',
 'gill_size_BROAD',
 'gill_size_NARROW',
 'gill_color_BLACK',
 'gill_color_BROWN',
 'gill_color_BUFF',
 'gill_color_CHOCOLATE',
 'gill_color_GRAY',
 'gill_color_GREEN',
 'gill_color_ORANGE',
 'gill_color_PINK',
 'gill_color_PURPLE',
 'gill_color_RED',
 'gill_color_WHITE',
 'gill_color_YELLOW',
 'stalk_shape_ENLARGING',
 'stalk_shape_TAPERING',
 'stalk_root_?',
 'stalk_root_BULBOUS',
 'stalk_root_CLUB',
 'stalk_root_EQUAL',
 'stalk_root_ROOTED',
 'stalk_surface_above_ring_FIBROUS',
 'stalk_surface_above_ring_SCALY',
 'stalk_surface_above_ring_SILKY',
 'stalk_surface_above_ring_SMOOTH',
 'stalk_surface_below_ring_FIBROUS',
 'stalk_surface_below_ring_SCALY',
 'stalk_surface_below_ring_SILKY',
 'stalk_surface_below_ring_SMOOTH',
 'stalk_color_above_ring_BROWN',
 'stalk_color_above_ring_BUFF',
 'stalk_color_above_ring_CINNAMON',
 'stalk_color_above_ring_GRAY',
 'stalk_color_above_ring_ORANGE',
 'stalk_color_above_ring_PINK',
 'stalk_color_above_ring_RED',
 'stalk_color_above_ring_WHITE',
 'stalk_color_above_ring_YELLOW',
 'stalk_color_below_ring_BROWN',
 'stalk_color_below_ring_BUFF',
 'stalk_color_below_ring_CINNAMON',
 'stalk_color_below_ring_GRAY',
 'stalk_color_below_ring_ORANGE',
 'stalk_color_below_ring_PINK',
 'stalk_color_below_ring_RED',
 'stalk_color_below_ring_WHITE',
 'stalk_color_below_ring_YELLOW',
 'veil_type_PARTIAL',
 'veil_color_BROWN',
 'veil_color_ORANGE',
 'veil_color_WHITE',
 'veil_color_YELLOW',
 'ring_number_NONE',
 'ring_number_ONE',
 'ring_number_TWO',
 'ring_type_EVANESCENT',
 'ring_type_FLARING',
 'ring_type_LARGE',
 'ring_type_NONE',
 'ring_type_PENDANT',
 'spore_print_color_BLACK',
 'spore_print_color_BROWN',
 'spore_print_color_BUFF',
 'spore_print_color_CHOCOLATE',
 'spore_print_color_GREEN',
 'spore_print_color_ORANGE',
 'spore_print_color_PURPLE',
 'spore_print_color_WHITE',
 'spore_print_color_YELLOW',
 'population_ABUNDANT',
 'population_CLUSTERED',
 'population_NUMEROUS',
 'population_SCATTERED',
 'population_SEVERAL',
 'population_SOLITARY',
 'habitat_GRASSES',
 'habitat_LEAVES',
 'habitat_MEADOWS',
 'habitat_PATHS',
 'habitat_URBAN',
 'habitat_WASTE',
 'habitat_WOODS']]

# Feature Selection

In [8]:
import itertools
import time
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

%matplotlib inline
plt.style.use('ggplot')

In [9]:
data.head()

Unnamed: 0,edibility,cap_shape_BELL,cap_shape_CONICAL,cap_shape_CONVEX,cap_shape_FLAT,cap_shape_KNOBBED,cap_shape_SUNKEN,cap_surface_FIBROUS,cap_surface_GROOVES,cap_surface_SCALY,...,population_SCATTERED,population_SEVERAL,population_SOLITARY,habitat_GRASSES,habitat_LEAVES,habitat_MEADOWS,habitat_PATHS,habitat_URBAN,habitat_WASTE,habitat_WOODS
0,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [10]:
from sklearn.model_selection import train_test_split
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['edibility'], axis=1),
    data['edibility'],
    test_size=0.15,
    random_state=23)

X_train.shape, X_test.shape

((7153, 117), (1263, 117))

In [11]:
cutoff=0.5
def fit_linear_reg(X,Y):
    # Fit linear regression model and return classification accuracy
    model_k = linear_model.LinearRegression()
    model_k.fit(X, Y)

    # Predicted vs Actual
    actual_values = Y
    pred_values = model_k.predict(X)
    bins = np.array([0, cutoff, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]

    # Calculate the accuracy
    accuracy = (cm[0,0] + cm[1,1]) / cm.sum()
    fpr = (cm[0,1]) / (cm[0,0] + cm[0,1])
    tpr = (cm[1,1]) / (cm[1,0] + cm[1,1])

    #print("Classification accuracy = {:.1%}".format(accuracy))
    #print("TPR or Recall = {:.1%}".format(tpr))
    #print("FPR = {:.1%}".format(fpr))

    return accuracy, fpr, tpr

In [12]:
#forward selection
k = 117

remaining_features = list(X_train.columns.values)
features = []
accuracy_list, tpr_list = [np.inf], [np.inf] #Due to 1 indexing of the loop...
features_list = dict()

for i in range(1,k+1):
    best_accuracy = np.inf
    
    for combo in itertools.combinations(remaining_features,1):

            accuracy = fit_linear_reg(X_train[list(combo) + features],y_train)   #Store temp result 

            if accuracy[0] < best_accuracy:
                best_accuracy = accuracy[0]
                best_tpr = accuracy[1] 
                best_feature = combo[0]

    #Updating variables for next loop
    features.append(best_feature)
    remaining_features.remove(best_feature)
    
    #Saving values for plotting
    accuracy_list.append(best_accuracy)
    tpr_list.append(best_tpr)
    features_list[i] = features.copy()

In [13]:
df = pd.concat([pd.DataFrame({'features':features_list}),pd.DataFrame({'accuracy':accuracy_list, 'tpr': tpr_list})], axis=1, join='inner')
df['numb_features'] = df.index
df

Unnamed: 0,features,accuracy,tpr,numb_features
1,[stalk_color_above_ring_GRAY],0.501800,0.000000,1
2,"[stalk_color_above_ring_GRAY, stalk_color_belo...",0.500226,0.000000,2
3,"[stalk_color_above_ring_GRAY, stalk_color_belo...",0.500226,0.000000,3
4,"[stalk_color_above_ring_GRAY, stalk_color_belo...",0.501282,0.000000,4
5,"[stalk_color_above_ring_GRAY, stalk_color_belo...",0.503179,0.000000,5
...,...,...,...,...
113,"[stalk_color_above_ring_GRAY, stalk_color_belo...",0.998585,0.003085,113
114,"[stalk_color_above_ring_GRAY, stalk_color_belo...",1.000000,0.000000,114
115,"[stalk_color_above_ring_GRAY, stalk_color_belo...",1.000000,0.000000,115
116,"[stalk_color_above_ring_GRAY, stalk_color_belo...",1.000000,0.000000,116


In [14]:
df.features.iloc[-1]

['stalk_color_above_ring_GRAY',
 'stalk_color_below_ring_YELLOW',
 'veil_type_PARTIAL',
 'cap_color_PURPLE',
 'ring_type_FLARING',
 'cap_color_PINK',
 'cap_shape_SUNKEN',
 'spore_print_color_ORANGE',
 'odor_CREOSOTE',
 'spore_print_color_YELLOW',
 'stalk_color_above_ring_RED',
 'cap_color_BUFF',
 'gill_color_YELLOW',
 'spore_print_color_PURPLE',
 'veil_color_WHITE',
 'gill_color_ORANGE',
 'veil_color_BROWN',
 'spore_print_color_GREEN',
 'stalk_color_below_ring_RED',
 'habitat_WASTE',
 'habitat_URBAN',
 'gill_color_RED',
 'cap_surface_SMOOTH',
 'cap_color_BROWN',
 'spore_print_color_BUFF',
 'cap_shape_CONICAL',
 'cap_surface_GROOVES',
 'odor_MUSTY',
 'stalk_color_above_ring_ORANGE',
 'stalk_color_above_ring_YELLOW',
 'gill_attachment_ATTACHED',
 'stalk_color_below_ring_ORANGE',
 'gill_attachment_FREE',
 'cap_color_GREEN',
 'veil_color_ORANGE',
 'veil_color_YELLOW',
 'population_CLUSTERED',
 'cap_shape_FLAT',
 'population_SOLITARY',
 'stalk_color_above_ring_CINNAMON',
 'odor_PUNGENT',
 '