<img src="mmu_logo.png" style="height: 80px;" align=left> 

# Learning Objectives

By the end of this lesson, you should be able to:
- understand and implement Logistic Regression, k-NN, and Naive Bayes classifiers in Python
- determine the optimal value of k for k-NN
- perform prediction on new data


# Load Python Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict

import matplotlib.pyplot as plt 
import seaborn as sns
# Configure the seaborn theme in a single call. Each sns.set() call re-applies
# the default value of every argument it is not given, so a separate rc-only
# call would silently replace "whitegrid" with the default "darkgrid" style.
sns.set(style="whitegrid", color_codes=True, rc={'figure.figsize': (11, 6)})
plt.rc("font", size=14)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from scipy.stats import spearmanr 

import missingno as msno

from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)

# If missingno is not installed, run "%pip install missingno" in its own cell before running this cell.

# Logistic Regression

### Read Dataset

In [None]:
# load dataset

df = pd.read_csv('banking.csv')
df.head()

In [None]:
df.shape

In [None]:
df.columns

### Exploratory Data Analysis

In [None]:
# plot scatterplot for age vs. duration
# your codes here...



In [None]:
# check the correlation between age and duration

# NOTE(review): .loc slices by index label and includes both endpoints, so
# these lines select the rows labelled 1 through 100 and leave out label 0 --
# confirm that skipping the first row is intended.
# The double brackets keep each selection as a one-column DataFrame rather
# than a Series.
age = df.loc[1:100,['age']]
duration = df.loc[1:100,['duration']]

# your codes here...


### Check missing values

In [None]:
# Count the missing values in every column. Only the last expression of a cell
# is displayed automatically, so the counts are printed explicitly; otherwise
# they would be computed and then discarded.
print(df.isnull().sum())

# missingno bar chart of the same frame; the trailing semicolon hides the
# returned object's repr so only the figure is shown.
msno.bar(df);

In [None]:
# Drop every row that contains at least one missing value.
# dropna() and fillna() are alternative strategies: once the incomplete rows
# are dropped there is nothing left to fill, so only dropna() is applied.
df = df.dropna()

### Check the class label distribution 'y'


In [None]:
df.y.value_counts()

In [None]:
# Bar chart of how many rows fall into each class of 'y'.
b = sns.countplot(x='y', data=df)

# Write each bar's count just above the bar, centred horizontally on it.
for bar in b.patches:
    bar_height = bar.get_height()
    bar_centre_x = bar.get_x() + bar.get_width() / 2.
    b.annotate(
        "%.0f" % bar_height,
        (bar_centre_x, bar_height),
        ha='center', va='center', rotation=0,
        xytext=(0, 18), textcoords='offset points',
    )

### Basic Transformation

In [None]:
# before transformation

df['education'].unique()

In [None]:
# You need to replace basic.9y, basic.6y, basic.4y to "Basic"
# your codes here...

# Collapse the three "basic" schooling levels into a single 'Basic' category
# in one pass, instead of rewriting the whole column once per level.
df['education'] = df['education'].replace(
    ['basic.9y', 'basic.6y', 'basic.4y'], 'Basic'
)

In [None]:
# after transformation

df['education'].unique()

In [None]:
# get the distribution of y

# Tally each class by summing its boolean mask (True counts as 1).
count_no_sub = int((df['y'] == 0).sum())
count_sub = int((df['y'] == 1).sum())
total_labelled = count_no_sub + count_sub

# Share of each class among the rows labelled 0 or 1.
pct_of_no_sub = count_no_sub / total_labelled
pct_of_sub = count_sub / total_labelled

print("percentage of no subscription is", pct_of_no_sub*100)
print("percentage of subscription", pct_of_sub*100)

In [None]:
# get the mean for each attribute grouped by y
# your codes here...
 

### Visualizing attributes with respect to y

In [None]:
# Cross-tabulate job against the target, then draw one group of bars per job.
job_by_y = pd.crosstab(df.job, df.y)
a = job_by_y.plot(kind='bar')
a.set_title('Purchase Frequency for Job Title')
a.set_xlabel('Job')
a.set_ylabel('Frequency of Purchase')


In [None]:
# build a crosstab of marital against y and plot it as a bar chart

# your codes here...



In [None]:
# build a crosstab of education against y and plot it as a bar chart

# your codes here...



In [None]:
# plot the histogram of age

# Draw the histogram and label the axes object it returns.
age_axes = df['age'].hist()
age_axes.set_title('Histogram of Age')
age_axes.set_xlabel('Age')
age_axes.set_ylabel('Frequency')

### Dummifying the Variables (one-hot encoding)

In [None]:
# Dummifying only the variables with object data type

# Categorical columns that are to be one-hot encoded.
cat_vars = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# your codes here...

# Every column currently in df, minus the raw categorical ones listed above.
df_vars = df.columns.tolist()
to_keep = [column for column in df_vars if column not in cat_vars]

In [None]:
# Restrict the frame to the retained columns, then show their names.
df_final = df.loc[:, to_keep]
df_final.columns.to_numpy()

In [None]:
df_final

### Over-sampling using SMOTE

In [None]:
# Determine the X and y...

X = df_final.loc[:, df_final.columns != 'y']
y = df_final.loc[:, df_final.columns == 'y']

# construct the SMOTE model
# your codes here...

# train-test-split with test size 30% and random state=10
# your codes here...

# fit the smote model with training data only
# your codes here...

# Wrap the resampled data in DataFrames. The feature names are taken from X
# (built at the top of this cell), so this step does not depend on a separately
# defined `columns` variable.
os_data_X = pd.DataFrame(data=os_data_X, columns=X.columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Check the class balance of the oversampled data. Each class is counted once
# and the counts are reused for the proportions.
n_oversampled = len(os_data_X)
n_no_sub = len(os_data_y[os_data_y['y'] == 0])
n_sub = len(os_data_y[os_data_y['y'] == 1])

print("length of oversampled data is ",n_oversampled)
print("Number of no subscription in oversampled data",n_no_sub)
print("Number of subscription",n_sub)
print("Proportion of no subscription data in oversampled data is ",n_no_sub/n_oversampled)
print("Proportion of subscription data in oversampled data is ",n_sub/n_oversampled)

### Using statsmodels for Logistic Regression

In [None]:
import statsmodels.api as sm

# create the logistic regression model using statsmodels. The max iteration = 200
# your codes here...


print(result.summary2())

### Using sklearn for Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Use solver='lbfgs' and max iteration = 200 for your logistic regression model
# your codes here...

# getting the model accuracy
# your codes here...


In [None]:
# Prediction

# your codes here...

print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
# get the predicted y values

# your codes here...


In [None]:
# Accuracy of the model

# your codes here...


# K-NN

### Load Dataset

In [None]:
# Load dataset

dataset = pd.read_csv('iris.csv')

In [None]:
dataset.info()

In [None]:
dataset.head(10)

### Train-Test-Split

In [None]:
# Features are every column except 'Species'; 'Species' is the class label to predict
X = dataset.drop('Species', axis=1)
y = dataset['Species']

In [None]:
# default is 75% / 25% train-test split. Random state = 10

# your codes here...

In [None]:
# Report the dimensions of each split: training features and labels first,
# then the test features and labels.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

### Create the k-NN model

In [None]:
# Create the k-NN model with 5 neighbours

# your codes here...

**Use the trained k-NN classifier model to classify new, previously unseen objects:**

In [None]:
# Perform prediction

# your codes here...

print('species name is '+species_prediction[0])


In [None]:
# Perform prediction

# your codes here...

print('species name is '+species_prediction[0])

**Estimate the accuracy of the classifier on future data, using the test data:**

In [None]:
#accuracy

# your codes here...

### How sensitive is k-NN classification accuracy to the choice of the 'k' parameter?

In [None]:
k_range = range(1,20)
scores = []

# Use a loop to study how k influences the score; use append() to add each score to the list


# your codes here...

# Plot accuracy against k: one point per score, joined by a dashed green line.
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.title('Accuracy by n_neighbors')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20])
# The line is drawn without markers (the scatter call supplies the points), so
# no markersize is passed. The trailing semicolon hides the Line2D repr.
plt.plot(k_range, scores, color='green', linestyle='dashed', linewidth=1);

# Naive Bayes

In [None]:
df=pd.read_csv('iris.csv')
df.info()

In [None]:
df.shape

### Train-Test-Split dataset

In [None]:
# Features are every column except 'Species'; 'Species' is the class label to predict
X = df.drop('Species', axis=1)
y = df['Species']

In [None]:
# train-test-split with 30% test size and random state=10

# your codes here...


In [None]:
# Construct the NB model

# your codes here...


In [None]:
# Perform prediction on X_test

# your codes here...


In [None]:
# get the predicted output for [2, 3.2, 1.5, 0.5]

# your codes here...

print('species name is '+species_prediction[0])

In [None]:
# get the predicted output for [2.5, 5, 4, 4]

# your codes here...

print('species name is '+species_prediction[0])

In [None]:
# accuracy of the constructed NB model

# your codes here...
