In [1]:
import statsmodels.api as sm
import pandas as pd

# Spalten Namen einbeziehn 
colums = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'class']

# Laden der Daten
data = pd.read_csv('spambase.data', names=colums)

# Trennung der Zielvariable und der Features
X = data.drop(columns='class')
y = data['class']

# Liste aller Features
all_features = list(X.columns)

# Vorab-Initialisierung der besten Features
best_features = []

# Forward Selection
while len(all_features) > 0:
    p_values = []
    for feature in all_features:
        features_to_try = best_features + [feature]
        X_try = sm.add_constant(X[features_to_try])
        model = sm.OLS(y, X_try).fit()
        p_value = model.pvalues[feature]
        p_values.append((feature, p_value))
    best_feature, best_p_value = min(p_values, key=lambda x: x[1])
    if best_p_value < 0.01:
        best_features.append(best_feature)
        all_features.remove(best_feature)
    else:
        break


# Ausgabe des besten Sets von Features
print('Bestes Set von Features:', best_features)
print(len(best_features))


Bestes Set von Features: ['word_freq_your', 'word_freq_000', 'word_freq_remove', 'capital_run_length_total', 'word_freq_free', 'char_freq_!', 'word_freq_hp', 'word_freq_our', 'char_freq_$', 'word_freq_internet', 'word_freq_money', 'word_freq_over', 'word_freq_credit', 'word_freq_font', 'word_freq_meeting', 'word_freq_email', 'word_freq_george', 'word_freq_edu', 'char_freq_;', 'word_freq_re', 'word_freq_you', 'word_freq_data', 'word_freq_order', 'word_freq_hpl', 'word_freq_will', 'word_freq_business', 'word_freq_1999', 'word_freq_project', 'word_freq_all', 'word_freq_3d', 'word_freq_conference', 'word_freq_table', 'word_freq_original', 'word_freq_make', 'word_freq_address', 'word_freq_85', 'capital_run_length_average']
37


In [3]:
# Überprüfung p-value
print(p_values)

[('word_freq_people', 0.4559249235408598), ('word_freq_report', 0.6527545671645643), ('word_freq_addresses', 0.2832314577643797), ('word_freq_650', 0.7194198851612752), ('word_freq_lab', 0.8588390307827478), ('word_freq_telnet', 0.6282124039064938), ('word_freq_857', 0.10039674668568786), ('word_freq_415', 0.09568525063901315), ('word_freq_technology', 0.12913613234433596), ('word_freq_pm', 0.10146252482518359), ('word_freq_cs', 0.49336763188928434), ('char_freq_[', 0.15359910609922803), ('capital_run_length_longest', 0.05974892159677463)]


In [26]:
#Speicherung der selektierten aten in CVS
selected_data = data.drop(columns=all_features)

display(selected_data)

selected_data.info
selected_data.to_csv('selected_data.csv')


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_total,class
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.000,0.000,0.778,0.000,0.000,3.756,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.0,0.0,0.000,0.132,0.372,0.180,0.048,5.114,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.0,0.010,0.143,0.276,0.184,0.010,9.821,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.0,0.000,0.137,0.137,0.000,0.000,3.537,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.0,0.000,0.135,0.135,0.000,0.000,3.537,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.0,0.0,0.000,0.232,0.000,0.000,0.000,1.142,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.000,0.000,0.353,0.000,0.000,1.555,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.102,0.718,0.000,0.000,0.000,1.404,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.0,0.000,0.057,0.000,0.000,0.000,1.147,78,0
