In [54]:

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.feature_extraction.text import TfidfVectorizer

#NLTK-------------------------------
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from nltk.corpus import stopwords

# Import libraries for feature 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")


from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [55]:
#Read files
textfile = r'/gdrive/My Drive/CIS508/Assignment-5/Comments.csv'
textData = pd.read_csv(textfile) #creates a dataframe

CustInfofile = r'/gdrive/My Drive/CIS508/Assignment-5/Customers.csv'
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

print(textData.shape)
print(CustInfoData.shape)


(2070, 2)
(2070, 17)


In [56]:
#Extract target column from Customer Info file
Y_Train = CustInfoData["TARGET"]
X_Train = CustInfoData.drop(columns=["TARGET"]) #extracting training data without the target column
                     
print(X_Train.shape)
print(textData.shape)
textData.head()
print(X_Train)
print(Y_Train)

(2070, 16)
(2070, 2)
        ID Sex Status  ...  Paymethod  LocalBilltype LongDistanceBilltype
0        1   F      S  ...         CC         Budget       Intnl_discount
1        6   M      M  ...         CH      FreeLocal             Standard
2        8   M      M  ...         CC      FreeLocal             Standard
3       11   M      S  ...         CC         Budget             Standard
4       14   F      M  ...         CH         Budget       Intnl_discount
...    ...  ..    ...  ...        ...            ...                  ...
2065  3821   F      S  ...         CC      FreeLocal             Standard
2066  3822   F      S  ...       Auto         Budget             Standard
2067  3823   F      M  ...         CH         Budget             Standard
2068  3824   F      M  ...         CC      FreeLocal             Standard
2069  3825   F      S  ...         CC      FreeLocal             Standard

[2070 rows x 16 columns]
0       Cancelled
1         Current
2         Current
3         C

**SNOWBALL STEMMER**

In [0]:
# Use English stemmer.
Stemmer1 = SnowballStemmer("english")
#Tokenize - Split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)
export_csv = textData.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/TextDataTokenized_snow.csv')

#Now do stemming - create a new dataframe to store stemmed version
NewTextData1=pd.DataFrame()
NewTextData1=textData.drop(columns=["CommentsTokenized","Comments"])
NewTextData1['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [Stemmer1.stem(y) for y in x]) # Stem every word.
export_csv = NewTextData1.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/NewTextDataTS_snow.csv')

#Join stemmed strings
NewTextData1['CommentsTokenizedStemmed'] = NewTextData1['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))
export_csv = NewTextData1.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/NewTextData-Joined_snow.csv')



In [58]:
#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts1 = count_vect.fit_transform(NewTextData1.CommentsTokenizedStemmed)
#print(TD_counts1.shape)
TD_counts1.dtype
print(count_vect.get_feature_names())
print("No. of stem words in SNOWBALL stemmer is : ",len(count_vect.get_feature_names()))
#print(TD_counts1)
DF_TD_Counts=pd.DataFrame(TD_counts1.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/TD_counts-TokenizedStemmed_snow.csv')


['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constan', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'effect', 'encount', 'e

**PORTER STEMMER**

In [0]:
# Use English stemmer.
Stemmer2 = PorterStemmer()
#Tokenize - Split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)

export_csv = textData.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/TextDataTokenized_porter.csv')

#Now do stemming - create a new dataframe to store stemmed version
NewTextData2=pd.DataFrame()
NewTextData2=textData.drop(columns=["CommentsTokenized","Comments"])
NewTextData2['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [Stemmer2.stem(y) for y in x]) # Stem every word.

export_csv = NewTextData2.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/NewTextDataTS_porter.csv')

#Join stemmed strings
NewTextData2['CommentsTokenizedStemmed'] = NewTextData2['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))

export_csv = NewTextData2.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/NewTextData-Joined_porter.csv')

In [60]:
#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts2 = count_vect.fit_transform(NewTextData2.CommentsTokenizedStemmed)
#print(TD_counts2.shape)
TD_counts2.dtype
print(count_vect.get_feature_names())
print("No. of stem words in PORTER stemmer is : ",len(count_vect.get_feature_names()))
#print(TD_counts1)
DF_TD_Counts=pd.DataFrame(TD_counts2.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/TD_counts-TokenizedStemmed_porter.csv')

['3399', '3g', 'As', 'CC', 'He', 'If', 'In', 'Is', 'It', 'We', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constanli', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont',

**LANCASTER STEMMER**

In [0]:
# Use English stemmer.
Stemmer3 = LancasterStemmer()
#Tokenize - Split the sentences to lists of words
textData['CommentsTokenized'] = textData['Comments'].apply(word_tokenize)

export_csv = textData.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/TextDataTokenized_lancaster.csv')

#Now do stemming - create a new dataframe to store stemmed version
NewTextData=pd.DataFrame()
NewTextData=textData.drop(columns=["CommentsTokenized","Comments"])
NewTextData['CommentsTokenizedStemmed'] = textData['CommentsTokenized'].apply(lambda x: [Stemmer3.stem(y) for y in x]) # Stem every word.

export_csv = NewTextData.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/NewTextDataTS_lancaster.csv')

#Join stemmed strings
NewTextData['CommentsTokenizedStemmed'] = NewTextData['CommentsTokenizedStemmed'].apply(lambda x: " ".join(x))

export_csv = NewTextData.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/NewTextData-Joined_lancaster.csv')

In [62]:
#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.
#count_vect = CountVectorizer(stop_words=None)
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts3 = count_vect.fit_transform(NewTextData.CommentsTokenizedStemmed)
#print(TD_counts3.shape)
TD_counts3.dtype
print(count_vect.get_feature_names())
print("No. of stem words in LANCASTER stemmer is : ",len(count_vect.get_feature_names()))
#print(TD_counts3)
DF_TD_Counts=pd.DataFrame(TD_counts3.toarray())
print(DF_TD_Counts)
export_csv = DF_TD_Counts.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/TD_counts-TokenizedStemmed_lancaster.csv')

['3399', '3g', 'abysm', 'access', 'ad', 'adapt', 'addit', 'additon', 'address', 'adit', 'adress', 'advert', 'afraid', 'aft', 'al', 'alway', 'angel', 'angry', 'anoth', 'anyth', 'anytim', 'ar', 'asap', 'ask', 'bad', 'bas', 'batery', 'battery', 'becaus', 'believ', 'bet', 'big', 'bil', 'book', 'bought', 'brain', 'bring', 'built', 'busy', 'button', 'buy', 'cal', 'cancel', 'car', 'carry', 'caus', 'cc', 'cel', 'certain', 'chang', 'charg', 'check', 'chip', 'city', 'claim', 'clear', 'cold', 'comapr', 'comp', 'company', 'competit', 'complain', 'complaint', 'conceiv', 'connect', 'consisit', 'consist', 'const', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cov', 'cre', 'credit', 'cstmer', 'cstmr', 'cur', 'cust', 'custom', 'customr', 'dat', 'day', 'dead', 'dec', 'defect', 'deo', 'did', 'die', 'diff', 'difficult', 'digit', 'direct', 'dis', 'doe', 'don', 'dont', 'drop', 'dur', 'dying', 'easy', 'effect', 'encount', 'end', 'enemy', 'equip', 'ev', 'everytim', 'everywh', 'evrey', 'exact'

**TF-IDF Matrix on best Stemmer method**

In [63]:
#Compute TF-IDF Matrix
tfidf_transformer = TfidfTransformer()
X_Train_tfidf = tfidf_transformer.fit_transform(TD_counts1)
print(X_Train_tfidf.shape)
DF_TF_IDF=pd.DataFrame(X_Train_tfidf.toarray())
print(DF_TF_IDF)
export_csv= DF_TF_IDF.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/TFIDF_counts_porter.csv')


(2070, 354)
      0    1    2    3        4    5    ...  348  349  350  351  352  353
0     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
1     0.0  0.0  0.0  0.0  0.27568  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
3     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
4     0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
...   ...  ...  ...  ...      ...  ...  ...  ...  ...  ...  ...  ...  ...
2065  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2066  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2067  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2068  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0
2069  0.0  0.0  0.0  0.0  0.00000  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0

[2070 rows x 354 columns]


In [64]:
#Merge files

print(CustInfoData.shape)

combined=pd.concat([CustInfoData,DF_TF_IDF_SelectedFeatures], axis=1)
print(combined.shape)
print(combined)
#export_csv= combined.to_csv(r'/gdrive/My Drive/CIS508/Combined-Cust+TFIDF+SelectedFeatures.csv')



(2070, 17)
(2070, 67)
        ID Sex Status  Children  Est_Income  ...   45        46   47   48   49
0        1   F      S         1    38000.00  ...  0.0  0.000000  0.0  0.0  0.0
1        6   M      M         2    29616.00  ...  0.0  0.000000  0.0  0.0  0.0
2        8   M      M         0    19732.80  ...  0.0  0.000000  0.0  0.0  0.0
3       11   M      S         2       96.33  ...  0.0  0.000000  0.0  0.0  0.0
4       14   F      M         2    52004.80  ...  0.0  0.348322  0.0  0.0  0.0
...    ...  ..    ...       ...         ...  ...  ...       ...  ...  ...  ...
2065  3821   F      S         0    78851.30  ...  0.0  0.000000  0.0  0.0  0.0
2066  3822   F      S         1    17540.70  ...  0.0  0.000000  0.0  0.0  0.0
2067  3823   F      M         0    83891.90  ...  0.0  0.000000  0.0  0.0  0.0
2068  3824   F      M         2    28220.80  ...  0.0  0.000000  0.0  0.0  0.0
2069  3825   F      S         0    28589.10  ...  0.0  0.000000  0.0  0.0  0.0

[2070 rows x 67 columns]


In [65]:
#Do one Hot encoding for categorical features
categorical_features = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = combined.select_dtypes(exclude=['int','float64'])
print(categorical_features)
combined_one_hot = pd.get_dummies(combined,columns=categorical_features)
print(combined_one_hot.shape)
export_csv= combined_one_hot.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/combined_one_hot.csv')



['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 75)


**Feature selection: Filter method**

In [66]:
#Feature selection
new_DF_TF_IDF = SelectKBest(score_func=chi2, k=50).fit_transform(DF_TF_IDF,Y_Train)
new_DF_TF_IDF.shape

DF_TF_IDF_SelectedFeatures= pd.DataFrame(new_DF_TF_IDF)
print(DF_TF_IDF_SelectedFeatures)
#print(DF_TF_IDF_SelectedFeatures.columns)


#export_csv= DF_TF_IDF_SelectedFeatures.to_csv(r'/gdrive/My Drive/CIS508/TFIDF_counts-Selected Features.csv')


            0    1    2    3         4         5   ...   44   45   46   47   48   49
0     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
1     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
3     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
4     0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
...        ...  ...  ...  ...       ...       ...  ...  ...  ...  ...  ...  ...  ...
2065  0.000000  0.0  0.0  0.0  0.000000  0.446161  ...  0.0  0.0  0.0  0.0  0.0  0.0
2066  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2067  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2068  0.772949  0.0  0.0  0.0  0.545354  0.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0
2069  0.000000  0.0  0.0  0.0  0.000000  0.000000  ...  0.0  0.0 

**Feature selection: Filter method: Splitting Train and Test data**

In [67]:
X_Train1, X_Test1, Y_Train1, Y_Test1 = train_test_split(combined_one_hot.drop(columns=["TARGET"]), 
                                                        combined_one_hot["TARGET"], 
                                                        test_size=0.20, random_state=42)
print('Training dataset shape:', X_Train1.shape, Y_Train1)
print('Testing dataset shape:', X_Test1.shape, Y_Test1)


Training dataset shape: (1656, 74) 849       Current
1043    Cancelled
175       Current
1228      Current
538     Cancelled
          ...    
1638      Current
1095      Current
1130      Current
1294      Current
860       Current
Name: TARGET, Length: 1656, dtype: object
Testing dataset shape: (414, 74) 1181      Current
69        Current
351     Cancelled
1163    Cancelled
429       Current
          ...    
1532      Current
1671      Current
416       Current
2023      Current
1428      Current
Name: TARGET, Length: 414, dtype: object


In [68]:
#Construct a Random Forest Classifier on text data
clf=RandomForestClassifier()
RF_text = clf.fit(X_Train1,Y_Train1)
print("Accuracy score (training): {0:.6f}".format(clf.score(X_Test1, Y_Test1)))
rf_predictions = clf.predict(X_Test1)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test1, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test1, rf_predictions))

Accuracy score (training): 0.842995
Confusion Matrix:
[[124  33]
 [ 32 225]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.79      0.79      0.79       157
     Current       0.87      0.88      0.87       257

    accuracy                           0.84       414
   macro avg       0.83      0.83      0.83       414
weighted avg       0.84      0.84      0.84       414



In [69]:
#Construct a  Decision Tree Classifier on text data
clf=DecisionTreeClassifier()
RF_text = clf.fit(X_Train1,Y_Train1)
print("Accuracy score (training): {0:.6f}".format(clf.score(X_Test1, Y_Test1)))
rf_predictions = clf.predict(X_Test1)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test1, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test1, rf_predictions))

Accuracy score (training): 0.821256
Confusion Matrix:
[[122  35]
 [ 39 218]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.76      0.78      0.77       157
     Current       0.86      0.85      0.85       257

    accuracy                           0.82       414
   macro avg       0.81      0.81      0.81       414
weighted avg       0.82      0.82      0.82       414



In [70]:
#Construct a GRADIENT BOOSTING Classifier on text data
clf=GradientBoostingClassifier()
RF_text = clf.fit(X_Train1,Y_Train1)
print("Accuracy score (training): {0:.6f}".format(clf.score(X_Test1, Y_Test1)))
rf_predictions = clf.predict(X_Test1)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test1, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test1, rf_predictions))

Accuracy score (training): 0.845411
Confusion Matrix:
[[115  42]
 [ 22 235]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.84      0.73      0.78       157
     Current       0.85      0.91      0.88       257

    accuracy                           0.85       414
   macro avg       0.84      0.82      0.83       414
weighted avg       0.84      0.85      0.84       414



**Feature selection: Wrapper method - Random Forest**

In [71]:
# Build RF classifier to use in feature selection
clf = RandomForestClassifier()

# Build step forward feature selection
sfs1 = sfs(clf,
           k_features=5,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

# Perform SFFS
sfs1 = sfs1.fit(DF_TF_IDF,Y_Train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 354 out of 354 | elapsed:   28.9s finished

[2019-12-14 06:19:51] Features: 1/5 -- score: 0.6159469643320449[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 353 out of 353 | elapsed:   29.2s finished

[2019-12-14 06:20:20] Features: 2/5 -- score: 0.6202889643988454[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 352 out of 352 | elapsed:   29.3s finished

[2019-12-14 06:20:50] Features: 3/5 -- score: 0.6236682787577302[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [72]:
# Which features?
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)


[14, 30, 155, 295, 342]


In [73]:
SF=DF_TF_IDF.iloc[:,feat_cols]
print(SF)

      14   30   155  295  342
0     0.0  0.0  0.0  0.0  0.0
1     0.0  0.0  0.0  0.0  0.0
2     0.0  0.0  0.0  0.0  0.0
3     0.0  0.0  0.0  0.0  0.0
4     0.0  0.0  0.0  0.0  0.0
...   ...  ...  ...  ...  ...
2065  0.0  0.0  0.0  0.0  0.0
2066  0.0  0.0  0.0  0.0  0.0
2067  0.0  0.0  0.0  0.0  0.0
2068  0.0  0.0  0.0  0.0  0.0
2069  0.0  0.0  0.0  0.0  0.0

[2070 rows x 5 columns]


In [74]:
#Merge files

print(CustInfoData.shape)

combined=pd.concat([CustInfoData,SF], axis=1)
print(combined.shape)
print(combined)
#export_csv= combined.to_csv(r'/gdrive/My Drive/CIS508/Combined-Cust+TFIDF+SelectedFeatures.csv')


(2070, 17)
(2070, 22)
        ID Sex Status  Children  Est_Income  ...   14   30  155  295  342
0        1   F      S         1    38000.00  ...  0.0  0.0  0.0  0.0  0.0
1        6   M      M         2    29616.00  ...  0.0  0.0  0.0  0.0  0.0
2        8   M      M         0    19732.80  ...  0.0  0.0  0.0  0.0  0.0
3       11   M      S         2       96.33  ...  0.0  0.0  0.0  0.0  0.0
4       14   F      M         2    52004.80  ...  0.0  0.0  0.0  0.0  0.0
...    ...  ..    ...       ...         ...  ...  ...  ...  ...  ...  ...
2065  3821   F      S         0    78851.30  ...  0.0  0.0  0.0  0.0  0.0
2066  3822   F      S         1    17540.70  ...  0.0  0.0  0.0  0.0  0.0
2067  3823   F      M         0    83891.90  ...  0.0  0.0  0.0  0.0  0.0
2068  3824   F      M         2    28220.80  ...  0.0  0.0  0.0  0.0  0.0
2069  3825   F      S         0    28589.10  ...  0.0  0.0  0.0  0.0  0.0

[2070 rows x 22 columns]


In [75]:
#Do one Hot encoding for categorical features
categorical_features = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]
#X_cat = combined.select_dtypes(exclude=['int','float64'])
print(categorical_features)
combined_one_hot = pd.get_dummies(combined,columns=categorical_features)
print(combined_one_hot.shape)
export_csv= combined_one_hot.to_csv(r'/gdrive/My Drive/CIS508/Assignment-5/combined_one_hot.csv')

['Sex', 'Status', 'Car_Owner', 'Paymethod', 'LocalBilltype', 'LongDistanceBilltype']
(2070, 30)


**Feature selection: Wrapper method:  Splitting Train and Test data**

In [76]:
X_Train2, X_Test2, Y_Train2, Y_Test2 = train_test_split(combined_one_hot.drop(columns=["TARGET"]), 
                                                        combined_one_hot["TARGET"], 
                                                        test_size=0.20, random_state=42)
print('Training dataset shape:', X_Train2.shape, Y_Train2)
print('Testing dataset shape:', X_Test2.shape, Y_Test2)

Training dataset shape: (1656, 29) 849       Current
1043    Cancelled
175       Current
1228      Current
538     Cancelled
          ...    
1638      Current
1095      Current
1130      Current
1294      Current
860       Current
Name: TARGET, Length: 1656, dtype: object
Testing dataset shape: (414, 29) 1181      Current
69        Current
351     Cancelled
1163    Cancelled
429       Current
          ...    
1532      Current
1671      Current
416       Current
2023      Current
1428      Current
Name: TARGET, Length: 414, dtype: object


In [77]:
#Construct a Random Forest Classifier on text data
clf=RandomForestClassifier()
RF_text = clf.fit(X_Train2,Y_Train2)
print("Accuracy score (training): {0:.6f}".format(clf.score(X_Test2, Y_Test2)))
rf_predictions = clf.predict(X_Test2)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test2, rf_predictions))
print("Classification Report")
print(classification_report(Y_Test2, rf_predictions))
#run cross-validation on best hyperparameters
clf_cv_score = cross_val_score(clf, X_Train2, Y_Train2, cv=8, scoring="roc_auc")
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ",clf_cv_score.mean())

Accuracy score (training): 0.859903
Confusion Matrix:
[[128  29]
 [ 29 228]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.82      0.82      0.82       157
     Current       0.89      0.89      0.89       257

    accuracy                           0.86       414
   macro avg       0.85      0.85      0.85       414
weighted avg       0.86      0.86      0.86       414

=== All AUC Scores ===
[0.9140663  0.9170096  0.91999804 0.93337253 0.9356261  0.94503233
 0.91024887 0.89489087]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.9212805810298018


In [78]:
#Construct a Decision Tree Classifier on text data
clf=DecisionTreeClassifier()
DT_text = clf.fit(X_Train2,Y_Train2)
print("Accuracy score (training): {0:.6f}".format(clf.score(X_Test2, Y_Test2)))
dt_predictions = clf.predict(X_Test2)
print("Confusion Matrix:")
print(confusion_matrix(Y_Test2, dt_predictions))
print("Classification Report")
print(classification_report(Y_Test2, dt_predictions))
#run cross-validation on best hyperparameters
clf_cv_score = cross_val_score(clf, X_Train2, Y_Train2, cv=8, scoring="roc_auc")
print("=== All AUC Scores ===")
print(clf_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Decision Tree: ",clf_cv_score.mean())

Accuracy score (training): 0.801932
Confusion Matrix:
[[119  38]
 [ 44 213]]
Classification Report
              precision    recall  f1-score   support

   Cancelled       0.73      0.76      0.74       157
     Current       0.85      0.83      0.84       257

    accuracy                           0.80       414
   macro avg       0.79      0.79      0.79       414
weighted avg       0.80      0.80      0.80       414

=== All AUC Scores ===
[0.81578692 0.84435626 0.85493827 0.82495591 0.84082892 0.77910053
 0.81305115 0.85525794]


=== Mean AUC Score ===
Mean AUC Score - Decision Tree:  0.8285344865745948
