In [1]:
# Read and Write Files
import pandas as pd

# Ensembles and Classifiers
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

# Feature Selection / Extraction
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import chi2, SelectKBest, SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Natural Language Toolkit
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Accuracy Metrics
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Miscellaneous
import numpy as np
import warnings
warnings.filterwarnings("ignore")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive

Mounted at /gdrive
/gdrive


In [3]:
#Read files
comment_df = r'/gdrive/My Drive/Data Mining/Assignment 4/Comments.csv'
comment_df = pd.read_csv(comment_df) #creates a dataframe

customer_information_df = r'/gdrive/My Drive/Data Mining/Assignment 4/Customers.csv'
customer_information_df = pd.read_csv(customer_information_df)  #creates a dataframe

In [4]:
print(comment_df.shape)
print(customer_information_df.shape)

print("")

customer_information_df.head()

(2070, 2)
(2070, 17)



Unnamed: 0,ID,Sex,Status,Children,Est_Income,Car_Owner,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,Paymethod,LocalBilltype,LongDistanceBilltype,TARGET
0,1,F,S,1,38000.0,N,229.64,24.393333,3,23.56,0.0,206.08,0,CC,Budget,Intnl_discount,Cancelled
1,6,M,M,2,29616.0,N,75.29,49.426667,2,29.78,0.0,45.5,0,CH,FreeLocal,Standard,Current
2,8,M,M,0,19732.8,N,47.25,50.673333,3,24.81,0.0,22.44,0,CC,FreeLocal,Standard,Current
3,11,M,S,2,96.33,N,59.01,56.473333,1,26.13,0.0,32.88,1,CC,Budget,Standard,Current
4,14,F,M,2,52004.8,N,28.14,25.14,1,5.03,0.0,23.11,0,CH,Budget,Intnl_discount,Cancelled


In [5]:
#Extract target column from Customer Info file
X_train = customer_information_df.drop('TARGET', axis=1)
y_train = customer_information_df['TARGET'] #extracting training data without the target column


## **Part 1: Use Two Stemmers and Word Tokenizer**

In [6]:
comment_df['tokenized_comments'] = comment_df['Comments'].apply(word_tokenize)
comment_df.head()

Unnamed: 0,ID,Comments,tokenized_comments
0,1309,Does not like the way the phone works. It is t...,"[Does, not, like, the, way, the, phone, works,..."
1,3556,Wanted to know the nearest store location. Wan...,"[Wanted, to, know, the, nearest, store, locati..."
2,2230,Wants to know how to do text messaging. Referr...,"[Wants, to, know, how, to, do, text, messaging..."
3,2312,Asked how to disable call waiting. referred hi...,"[Asked, how, to, disable, call, waiting, ., re..."
4,3327,Needs help learning how to use the phone. I su...,"[Needs, help, learning, how, to, use, the, pho..."


Snowball Stemmer

In [7]:
stemmer = SnowballStemmer("english")

In [8]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
    print(stemmer.stem(w))

python
python
python
python
python


In [9]:
snowball_df = pd.DataFrame()
snowball_df['stem_token_comments'] = comment_df['tokenized_comments'].apply(lambda x: [stemmer.stem(y) for y in x]).apply(lambda x: " ".join(x))
snowball_df.head()

Unnamed: 0,stem_token_comments
0,doe not like the way the phone work . it is to...
1,want to know the nearest store locat . want to...
2,want to know how to do text messag . refer him...
3,ask how to disabl call wait . refer him to web...
4,need help learn how to use the phone . i sugge...


Porter Stemmer

In [10]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [11]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [12]:
porter_df = pd.DataFrame()
porter_df['stem_token_comments'] = comment_df['tokenized_comments'].apply(lambda x: [stemmer.stem(y) for y in x]).apply(lambda x: " ".join(x))
porter_df.head()

Unnamed: 0,stem_token_comments
0,doe not like the way the phone work . it is to...
1,want to know the nearest store locat . want to...
2,want to know how to do text messag . refer him...
3,ask how to disabl call wait . refer him to web...
4,need help learn how to use the phone . i sugge...


## **Part 2: Construct the Term-Document Matrix**

In [13]:
count_vectorizer = CountVectorizer(stop_words='english',lowercase=False)
term_document_counts = count_vectorizer.fit_transform(snowball_df['stem_token_comments'])
term_document_matrix = pd.DataFrame(term_document_counts.toarray(), columns=count_vectorizer.get_feature_names()).set_index(porter_df.index)

In [14]:
print(term_document_counts.shape)

print(count_vectorizer.get_feature_names())

(2070, 354)
['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constan', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'effect', '

## **Part 3: Compute Term Frequency–Inverse Document Frequency (TF-IDF) Matrix from the Term Document Matrix**

In [15]:
tf_idf_transformer = TfidfTransformer()
tf_idf_X_train = tf_idf_transformer.fit_transform(term_document_counts)
tf_idf_df = pd.DataFrame(tf_idf_X_train.toarray(), columns=count_vectorizer.get_feature_names()).set_index(snowball_df.index)
tf_idf_df.head()

#Text is now structured form, can use to build classificaiton model or regression model 
#Might say this is too many words, 354 words --> feature selection

Unnamed: 0,3399,3g,abysm,access,accessori,adapt,add,addit,additon,address,adit,adress,advertis,afraid,alway,angel,angri,ani,anoth,anyth,anytim,area,asap,ask,bad,basic,bateri,batteri,becaus,believ,better,bigger,book,bought,brain,bring,built,busi,button,buy,...,transf,transfer,travel,tri,trust,turn,uncomfort,understand,unhappi,unlimit,unreli,unwil,upset,usag,use,useless,valu,veri,vm,wa,wait,want,wast,way,weak,web,websit,week,whi,wife,wish,wll,wold,work,wors,worst,wrong,xvyx,year,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.388928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209678,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.27568,0.0,0.0,0.0,0.0,0.0,0.708478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.275333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195324,0.0,0.0,0.0,0.0,0.567289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.480064,0.0,0.0,0.0,0.0,0.333825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.348322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Perform Feature Selection

In [16]:
selector = SelectKBest(score_func=chi2, k=10)
selector.fit_transform(tf_idf_df, y_train)
cols = selector.get_support(indices=True)

tf_idf_df_selected_features_df = tf_idf_df.iloc[:,cols]
tf_idf_df_selected_features_df.tail()

Unnamed: 0,alway,charg,continu,explain,figur,ot,receiv,screen,transeff,turn
2065,0.0,0.446161,0.0,0.460113,0.457852,0.0,0.0,0.0,0.0,0.0
2066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Accuracy Score, Report, and Cross Validation Accuracy

In [17]:
def accuracy_score(classifier, comparison):
    print("===ACCURACY SCORE===")
    print(round(classifier.score(comparison, y_train),6))
    print("")

In [18]:
def accuracy_report(predictions):
    print("===CONFUSION MATRIX===")
    print(confusion_matrix(y_train, predictions))
    print("")
    print("===CLASSIFICATION REPORT===")
    print(classification_report(y_train, predictions))

In [19]:
def cross_val_accuracy(cross_val_score):
    print("===ALL ACCURACY SCORES===")
    print(cross_val_score)
    print("")
    print("===MEAN ACCURACY SCORE===")
    print(round(cross_val_score.mean(),6))

Cast Random Forest Classifier onto Text Data

In [20]:
random_forest = RandomForestClassifier()
random_forest_text = random_forest.fit(tf_idf_df_selected_features_df, y_train)
random_forst_predictions = random_forest.predict(tf_idf_df_selected_features_df)
accuracy_score(random_forest, tf_idf_df_selected_features_df)
accuracy_report(random_forst_predictions)

===ACCURACY SCORE===
0.627536

===CONFUSION MATRIX===
[[  51  753]
 [  18 1248]]

===CLASSIFICATION REPORT===
              precision    recall  f1-score   support

   Cancelled       0.74      0.06      0.12       804
     Current       0.62      0.99      0.76      1266

    accuracy                           0.63      2070
   macro avg       0.68      0.52      0.44      2070
weighted avg       0.67      0.63      0.51      2070



Cross-Validation on Random Forest Classifier for Text Data

In [21]:
random_forest_cross_val_score = cross_val_score(
    random_forest, 
    tf_idf_df_selected_features_df, 
    y_train, 
    cv=20, 
    scoring="balanced_accuracy"
)

cross_val_accuracy(random_forest_cross_val_score) #True Accuracy

===ALL ACCURACY SCORES===
[0.52864886 0.50425861 0.51645374 0.53290747 0.5046875  0.5546875
 0.521875   0.534375   0.5046875  0.5546875  0.52956349 0.54206349
 0.49206349 0.5125     0.5125     0.50456349 0.5125     0.55456349
 0.5125     0.49662698]

===MEAN ACCURACY SCORE===
0.521336


## **Part 4: Combine TF-IDF Matrix with Customer Data and do One-Hot Encoding**

Merge Files

In [22]:
#Merge files

print(customer_information_df.shape)
X_train = customer_information_df.drop('TARGET', axis=1) #extracting training data without the target column

print(X_train.shape)

df=pd.concat([X_train, tf_idf_df_selected_features_df], axis=1)
print(df.shape)
print(df)

(2070, 17)
(2070, 16)
(2070, 26)
        ID Sex Status  Children  ...  receiv screen  transeff  turn
0        1   F      S         1  ...     0.0    0.0       0.0   0.0
1        6   M      M         2  ...     0.0    0.0       0.0   0.0
2        8   M      M         0  ...     0.0    0.0       0.0   0.0
3       11   M      S         2  ...     0.0    0.0       0.0   0.0
4       14   F      M         2  ...     0.0    0.0       0.0   0.0
...    ...  ..    ...       ...  ...     ...    ...       ...   ...
2065  3821   F      S         0  ...     0.0    0.0       0.0   0.0
2066  3822   F      S         1  ...     0.0    0.0       0.0   0.0
2067  3823   F      M         0  ...     0.0    0.0       0.0   0.0
2068  3824   F      M         2  ...     0.0    0.0       0.0   0.0
2069  3825   F      S         0  ...     0.0    0.0       0.0   0.0

[2070 rows x 26 columns]


One Hot Encoding on Categorical Features

In [23]:
columns = df.keys()
encode_columns = []

for i in range(len(columns)):
    if type(df[columns[i]].iloc[0]) == str:
        encode_columns.append(columns[i])

        encoded_df = pd.get_dummies(df, columns=encode_columns)
encoded_df.head()

Unnamed: 0,ID,Children,Est_Income,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,alway,charg,continu,explain,figur,ot,receiv,screen,transeff,turn,Sex_F,Sex_M,Status_D,Status_M,Status_S,Car_Owner_N,Car_Owner_Y,Paymethod_Auto,Paymethod_CC,Paymethod_CH,LocalBilltype_Budget,LocalBilltype_FreeLocal,LongDistanceBilltype_Intnl_discount,LongDistanceBilltype_Standard
0,1,1,38000.0,229.64,24.393333,3,23.56,0.0,206.08,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,1,0,0,1,0,1,0,1,0
1,6,2,29616.0,75.29,49.426667,2,29.78,0.0,45.5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,1,0,0,0,1,0,1,0,1
2,8,0,19732.8,47.25,50.673333,3,24.81,0.0,22.44,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,1,0,0,1,0,0,1,0,1
3,11,2,96.33,59.01,56.473333,1,26.13,0.0,32.88,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1,1,0,0,1,0,1,0,0,1
4,14,2,52004.8,28.14,25.14,1,5.03,0.0,23.11,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,1,0,0,0,1,1,0,1,0


## **Part 5: Feature Selection Methods (Filter and Wrapper) to determine best set of features**

Cast Random Forest Classifier on Full Data

In [24]:
total_random_forest = random_forest.fit(encoded_df, y_train)
total_random_forest_predictions = random_forest.predict(encoded_df)
accuracy_score(total_random_forest, encoded_df)
accuracy_report(total_random_forest_predictions)

===ACCURACY SCORE===
1.0

===CONFUSION MATRIX===
[[ 804    0]
 [   0 1266]]

===CLASSIFICATION REPORT===
              precision    recall  f1-score   support

   Cancelled       1.00      1.00      1.00       804
     Current       1.00      1.00      1.00      1266

    accuracy                           1.00      2070
   macro avg       1.00      1.00      1.00      2070
weighted avg       1.00      1.00      1.00      2070



Cross Validation on Combined Data

In [25]:
total_random_forest_cross_val_score = cross_val_score(
    total_random_forest, 
    encoded_df,
    y_train, 
    cv=20, 
    scoring="balanced_accuracy"
)

cross_val_accuracy(total_random_forest_cross_val_score)

===ALL ACCURACY SCORES===
[0.85598142 0.85965931 0.85423926 0.91153697 0.8828125  0.9140625
 0.8234375  0.8109375  0.8546875  0.8734375  0.90119048 0.90575397
 0.90575397 0.80813492 0.86031746 0.80575397 0.88988095 0.95912698
 0.88988095 0.91488095]

===MEAN ACCURACY SCORE===
0.874073


Construct Random Forest without Text Data

In [26]:
no_text_df = encoded_df.drop(columns=tf_idf_df_selected_features_df.keys(), axis=1)
no_text_df.head()

Unnamed: 0,ID,Children,Est_Income,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,Sex_F,Sex_M,Status_D,Status_M,Status_S,Car_Owner_N,Car_Owner_Y,Paymethod_Auto,Paymethod_CC,Paymethod_CH,LocalBilltype_Budget,LocalBilltype_FreeLocal,LongDistanceBilltype_Intnl_discount,LongDistanceBilltype_Standard
0,1,1,38000.0,229.64,24.393333,3,23.56,0.0,206.08,0,1,0,0,0,1,1,0,0,1,0,1,0,1,0
1,6,2,29616.0,75.29,49.426667,2,29.78,0.0,45.5,0,0,1,0,1,0,1,0,0,0,1,0,1,0,1
2,8,0,19732.8,47.25,50.673333,3,24.81,0.0,22.44,0,0,1,0,1,0,1,0,0,1,0,0,1,0,1
3,11,2,96.33,59.01,56.473333,1,26.13,0.0,32.88,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1
4,14,2,52004.8,28.14,25.14,1,5.03,0.0,23.11,0,1,0,0,1,0,1,0,0,0,1,1,0,1,0


In [27]:
no_text_random_forest = random_forest.fit(no_text_df, y_train)
no_text_random_forest_predictions = random_forest.predict(no_text_df)
accuracy_score(no_text_random_forest, no_text_df)
accuracy_report(no_text_random_forest_predictions)

===ACCURACY SCORE===
1.0

===CONFUSION MATRIX===
[[ 804    0]
 [   0 1266]]

===CLASSIFICATION REPORT===
              precision    recall  f1-score   support

   Cancelled       1.00      1.00      1.00       804
     Current       1.00      1.00      1.00      1266

    accuracy                           1.00      2070
   macro avg       1.00      1.00      1.00      2070
weighted avg       1.00      1.00      1.00      2070



Random Forest Classifier without Text Data - Customer Info

In [28]:
no_text_random_forest_cross_val_score = cross_val_score(
    no_text_random_forest, 
    no_text_df,
    y_train, 
    cv=20, 
    scoring="balanced_accuracy"
)

Cross Validation without Text Data

In [29]:
cross_val_accuracy(no_text_random_forest_cross_val_score)

===ALL ACCURACY SCORES===
[0.86391792 0.88346883 0.85423926 0.93960511 0.8703125  0.9140625
 0.81875    0.815625   0.875      0.90625    0.89325397 0.91825397
 0.91825397 0.7922619  0.90575397 0.80575397 0.88988095 0.95912698
 0.90238095 0.91488095]

===MEAN ACCURACY SCORE===
0.882052


Feature Selection on Full Data - Using Decision Tree Classifier

In [30]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(no_text_df, y_train)

model = SelectFromModel(decision_tree, prefit=True, max_features=7, threshold=-np.inf)
feature_idx = model.get_support()
feature_names = no_text_df.columns[feature_idx]

x = model.transform(no_text_df)
x_selected_features_df = pd.DataFrame(x, columns=feature_names).set_index(X_train.index)
x_selected_features_df

Unnamed: 0,ID,Children,Est_Income,Age,LongDistance,Local,Status_M
0,1.0,1.0,38000.00,24.393333,23.56,206.08,0.0
1,6.0,2.0,29616.00,49.426667,29.78,45.50,1.0
2,8.0,0.0,19732.80,50.673333,24.81,22.44,1.0
3,11.0,2.0,96.33,56.473333,26.13,32.88,0.0
4,14.0,2.0,52004.80,25.140000,5.03,23.11,1.0
...,...,...,...,...,...,...,...
2065,3821.0,0.0,78851.30,48.373333,0.37,28.66,0.0
2066,3822.0,1.0,17540.70,62.786667,22.17,13.45,0.0
2067,3823.0,0.0,83891.90,61.020000,28.92,45.47,1.0
2068,3824.0,2.0,28220.80,38.766667,26.49,12.46,1.0


## **Part 6: Train-Test Split, build new classifier with best set of features**

Train Test Split 

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x_selected_features_df,
    y_train,
    test_size=0.20, 
    random_state=20)

In [32]:
def accuracy_output(model): 
  print(" ~Accuracy~")
  print(round(model.score(X_test, y_test), 6))

In [33]:
def confusion_output(y_test,X_test_prediction):
  print("~Confusion Matrix~")
  print(confusion_matrix(y_test, X_test_prediction))

Construct New Classification Model

In [34]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_forest_prediction = random_forest.predict(X_test)

In [35]:
accuracy_output(random_forest)
print() 
confusion_output(y_test, random_forest_prediction)   

 ~Accuracy~
0.852657

~Confusion Matrix~
[[137  33]
 [ 28 216]]


Sequential Forward Search

In [36]:
sequential_forward_search = SFS(
    decision_tree, 
    k_features=7, 
    forward=True, 
    floating=False, 
    verbose=2,
    scoring='accuracy',
    cv=20)

sequential_forward_search.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s finished

[2020-10-30 16:40:57] Features: 1/7 -- score: 0.8647884219806053[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.6s finished

[2020-10-30 16:40:58] Features: 2/7 -- score: 0.8702394945636203[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished

[2020-10-30 16:40:59] Features: 3/7 -- score: 0.8678004701733764[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

SequentialFeatureSelector(clone_estimator=True, cv=20,
                          estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                           class_weight=None,
                                                           criterion='gini',
                                                           max_depth=None,
                                                           max_features=None,
                                                           max_leaf_nodes=None,
                                                           min_impurity_decrease=0.0,
                                                           min_impurity_split=None,
                                                           min_samples_leaf=1,
                                                           min_samples_split=2,
                                                           min_weight_fraction_leaf=0.0,
                                                           presor

In [37]:
#Sequential forward search result
print("~Sequential Forward Search Parameters~")
print(sequential_forward_search.k_feature_names_) #prints the top 7 features

print("~Sequential Forward Search Score~")
print(sequential_forward_search.k_score_) #score based on that set of features

~Sequential Forward Search Parameters~
('ID', 'Children', 'Est_Income', 'Age', 'LongDistance', 'Local', 'Status_M')
~Sequential Forward Search Score~
0.8267411107846018
