In [1]:
import pandas as pd
import numpy as np
import math
import operator
import sklearn

In [2]:
# Importing data 
feature_names=['id',
'age',
'job',
'marital',
'education',
'default',
'balance',
'housing',
'loan',
'contact',
'day',
'month',
'duration',
'campaign',
'pdays',
'previous',
'poutcome',
'y']

data = pd.read_csv('trainingset.txt', delimiter=",", names=feature_names)

data = data[data.pdays != -1] # the prediction is "how likely the client will subscribe a term deposit *after* they have been contacted by phone. -1 signals never been contacted and therefore it is not *after*
data = data.drop('id', axis='columns') # id is not needed to predict the value of y

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

data = data.apply(le.fit_transform)
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
12989,15,2,1,2,0,971,0,0,1,20,10,0,0,136,2,0,0
12995,18,7,1,2,0,1628,1,0,1,21,10,0,0,71,3,1,0
12998,38,10,1,1,0,763,1,0,2,22,10,0,0,132,1,2,1
13017,33,2,2,1,0,1800,0,0,1,4,9,0,0,161,0,0,0
13033,15,1,0,1,0,1050,1,0,1,9,9,0,0,159,1,0,0


In [3]:
# outlier check
firstQ_cont = data.quantile(0.25)
thirdQ_cont = data.quantile(0.75)
cont_result = thirdQ_cont - firstQ_cont
print(cont_result)

age            15.0
job             4.0
marital         1.0
education       1.0
default         0.0
balance      1042.0
housing         1.0
loan            0.0
contact         0.0
day            12.0
month           5.0
duration        0.0
campaign        1.0
pdays         190.0
previous        3.0
poutcome        1.0
y               0.0
dtype: float64


In [4]:
print(data.shape)

(4560, 17)


In [5]:
# shows how uneven our dataset is
data['y'].value_counts()

0    3496
1    1064
Name: y, dtype: int64

In [6]:
# Upsampling TYPE B, creates synthetic data
from sklearn.utils import resample

data_majority = data[data.y==0]
data_minority = data[data.y==1]

data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=2261,    # to match majority class
                                 random_state=123) # reproducible results

data_upsampled = pd.concat([data_majority, data_minority_upsampled])

data_upsampled.y.value_counts()

0    3496
1    2261
Name: y, dtype: int64

In [7]:
# Downsampling TYPE A, deleted data
# deleted data, for target_name_balanced to be equal in length to dataset
data_majority = data_upsampled[data_upsampled.y==0]
data_minority = data_upsampled[data_upsampled.y==1]
 

data_majority_downsampled = resample(data_majority, 
                                 replace=True,     # sample with replacement
                                 n_samples=2262,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
data_balanced = pd.concat([data_majority_downsampled, data_minority])
 
# Display new class counts
data_balanced.y.value_counts()

0    2262
1    2261
Name: y, dtype: int64

In [8]:
dataset = data_balanced[[
'age',
'job',
'marital',
'education',
'default',
'balance',
'housing',
'loan',
'contact',
'day',
'month',
'duration',
'campaign',
'pdays',
'previous',
'poutcome']].copy()

target_name = data_balanced[['y']].copy()

In [9]:
from sklearn import preprocessing
names = dataset.columns
# creating scaler object
scaler = preprocessing.StandardScaler()
scaled_dataset = scaler.fit_transform(dataset)
scaled_dataset = pd.DataFrame(scaled_dataset, columns=names)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [10]:
from sklearn.model_selection import train_test_split
# Splitting Data into 80/20 
X_train, X_test, y_train, y_test = train_test_split(scaled_dataset, target_name, test_size=0.2,random_state=42)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
#Create KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)

#Train the model using the training sets
knn.fit(X_train, y_train.values.ravel())

#Predict the response for test dataset
y_pred = knn.predict(X_test)

In [12]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7038674033149172


In [13]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

[[322 155]
 [113 315]]
              precision    recall  f1-score   support

           0       0.74      0.68      0.71       477
           1       0.67      0.74      0.70       428

   micro avg       0.70      0.70      0.70       905
   macro avg       0.71      0.71      0.70       905
weighted avg       0.71      0.70      0.70       905



In [14]:
testData = pd.read_csv('queries.txt', delimiter=',', header=None, names=feature_names)

testData = testData[testData.pdays != -1]
testData = testData.drop('id', axis='columns')

testData.head()

testData = testData.apply(le.fit_transform)
testData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1449,8,7,2,2,0,252,1,0,1,12,9,0,0,70,0,0,0
1455,13,10,0,2,0,291,1,1,0,16,9,0,0,64,14,1,0
1456,28,10,2,1,0,98,1,0,0,16,9,0,0,48,1,0,0
1457,17,7,1,2,0,158,1,0,0,16,9,0,0,74,1,1,0
1463,27,5,1,2,0,112,1,0,0,16,9,0,1,84,1,0,0


In [15]:
testDataset = testData[[
'age',
'job',
'marital',
'education',
'default',
'balance',
'housing',
'loan',
'contact',
'day',
'month',
'duration',
'campaign',
'pdays',
'previous',
'poutcome']].copy()

test_target_name = data[['y']].copy()

testDataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
1449,8,7,2,2,0,252,1,0,1,12,9,0,0,70,0,0
1455,13,10,0,2,0,291,1,1,0,16,9,0,0,64,14,1
1456,28,10,2,1,0,98,1,0,0,16,9,0,0,48,1,0
1457,17,7,1,2,0,158,1,0,0,16,9,0,0,74,1,1
1463,27,5,1,2,0,112,1,0,0,16,9,0,1,84,1,0


In [16]:
a_test = testDataset
a_pred = knn.predict(a_test)

In [17]:
print ("\n".join(str("Type A" if  a == 0 else "        Type B") for a in a_pred))

        Type B
Type A
Type A
        Type B
Type A
Type A
Type A
Type A
Type A
Type A
        Type B
        Type B
Type A
Type A
Type A
Type A
        Type B
Type A
        Type B
        Type B
        Type B
Type A
Type A
        Type B
Type A
        Type B
        Type B
Type A
        Type B
        Type B
        Type B
Type A
Type A
Type A
Type A
Type A
        Type B
        Type B
        Type B
        Type B
Type A
        Type B
        Type B
        Type B
Type A
Type A
Type A
Type A
Type A
Type A
Type A
Type A
Type A
Type A
        Type B
Type A
Type A
Type A
Type A
        Type B
Type A
Type A
        Type B
        Type B
        Type B
        Type B
        Type B
Type A
Type A
        Type B
        Type B
Type A
        Type B
Type A
Type A
Type A
Type A
Type A
Type A
Type A
Type A
Type A
        Type B
Type A
Type A
Type A
Type A
Type A
        Type B
Type A
Type A
Type A
Type A
        Type B
Type A
Type A
Type A
Type A
Type A
Type A
Type A
Type A
Type A
       

In [18]:
np_array = np.asarray(["Type A" if  a == 0 else "Type B" for a in a_pred])

In [19]:
myfile = open('KNNfixes.txt', 'w')

count = 0
for index, item in enumerate(np_array):
    count += 1
    myfile.writelines("Test" + str(count) + "," + str(item)+ '\n')

myfile.close()