# Train and Test datasets in Machine Learning

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
# NOTE(review): `x` and `y` are not defined before this cell in the visible notebook — confirm their source.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# two modes in Jupyter Notebook

In [None]:
# See installed package
!pip list
# See the directiory
%pwd
# list of the folder in the directory
%ls
# See availabe line magics command
%lsmagic

# Covariance Analysis

In [None]:
# Build two positively related samples and measure how they vary together.
from numpy import cov, mean, std
from numpy.random import randn, seed

# Fix the global NumPy seed so both samples are reproducible.
seed(1)

# data1 is 1000 normal draws scaled by 20 and shifted by 100;
# data2 is data1 plus an independent noise term (scale 10, shift 50).
data1 = 100 + 20 * randn(1000)
data2 = data1 + (50 + 10 * randn(1000))

# 2x2 covariance matrix of the two samples.
covariance = cov(data1, data2)
print(covariance)

# Correlation Analysis With Heatmap

In [None]:
# Correlation between data1 and data2 (both created in the covariance cell).
from scipy.stats import pearsonr, spearmanr

# Pearson's correlation. pearsonr returns (r, p-value); unpacking into
# `corr, _` keeps r and discards the p-value.
corr, _ = pearsonr(data1, data2)
print('Pearsons correlation: %.3f' % corr)

# Spearman's rank correlation between the two variables, unpacked the same way.
corr, _ = spearmanr(data1, data2)
print('Spearmans correlation: %.3f' % corr)

# Linear Regression Analysis

In [None]:
# Fit a simple linear regression of y on x in two ways and plot the fitted line.
# NOTE(review): `x` and `y` must already exist as 1-D NumPy arrays (the code calls
# x.ravel() and multiplies x by a scalar) — confirm which cell is meant to define them.
import matplotlib.pyplot as plt
import scipy.stats
import statsmodels.api as sm

# Ordinary least squares via statsmodels; add_constant supplies the intercept column.
X = sm.add_constant(x.ravel())
results = sm.OLS(y, X).fit()
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(results.summary())

# Alternatively, the same fit with scipy.stats.linregress
slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
print(slope, intercept, r, p, stderr)

# Legend text describing the fitted line.
line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
print(line)

# Scatter of the observations with the fitted line on top.
fig, ax = plt.subplots()
ax.plot(x, y, linewidth=0, marker='s', label='Data points')
ax.plot(x, intercept + slope * x, label=line)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.legend(facecolor='white')
plt.show()

# Polynomial Regression

In [None]:
# Cubic polynomial regression on a small hand-entered data set.
import numpy
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

x = [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
y = [100,90,80,60,60,55,60,65,70,70,75,76,78,79,90,99,99,100]

# Raw observations.
plt.scatter(x, y)
plt.show()

# Least-squares cubic, wrapped in poly1d so it can be called like a function.
coefficients = numpy.polyfit(x, y, 3)
mymodel = numpy.poly1d(coefficients)
myline = numpy.linspace(1, 22, 100)

# Observations again, with the fitted curve drawn over them.
plt.scatter(x, y)
plt.plot(myline, mymodel(myline))
plt.show()

# R^2 of the fit on the observed points.
fitted = mymodel(x)
print(r2_score(y, fitted))

# Evaluate the fitted polynomial at x = 17.
speed = mymodel(17)
print(speed)

# Multiple Regression

In [None]:
# Multiple regression of CO2 on Weight and Volume with statsmodels.
import pandas
import statsmodels.api as sm

# The cell used to point at an absolute path on the author's machine, and the string
# literal was split across lines (a SyntaxError). A path relative to the notebook keeps
# the cell runnable elsewhere; adjust it if cars.csv lives in another folder.
df = pandas.read_csv("cars.csv")
X = df[['Weight', 'Volume']]
y = df['CO2']

# Add the intercept column only for the statsmodels fit and leave X as the two raw
# predictors, so X still matches the two-value row passed to predict() in the next cell.
results = sm.OLS(y, sm.add_constant(X)).fit()
results.summary()

In [None]:
# Alternatively, fit the same model with scikit-learn (a machine learning package).
from sklearn import linear_model

# Fit on the two raw predictors only. An X that carries a statsmodels constant column
# would make the model expect three inputs and reject the two-value row passed to
# predict() below.
predictors = df[['Weight', 'Volume']]
regr = linear_model.LinearRegression()
regr.fit(predictors, y)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# Predict the CO2 emission of a car where the weight is 2300kg and the volume is 1300ccm.
predictedCO2 = regr.predict([[2300, 1300]])
print(predictedCO2)

# Python Code: OLS for Linear Regression Model

In [None]:
# Simple linear regression: MEDV explained by LSTAT plus an intercept.
# Relies on the `boston` DataFrame, which this notebook only builds in the later CART section.
import statsmodels.api as sm

design = sm.add_constant(boston.LSTAT)
model = sm.OLS(boston.MEDV, design)
result = model.fit()
print(result.summary())

# OLS for Multiple Linear Regression Model

In [None]:
# Multiple linear regression of MEDV on LSTAT and CRIM, fitted three ways.
# Relies on the `boston` DataFrame, which this notebook only builds in the later CART section.
import statsmodels.api as sm
import statsmodels.formula.api as smf

X = boston[['LSTAT', 'CRIM']]
design = sm.add_constant(X)

# 1) Array interface: response vector plus a design matrix with an intercept column.
model = sm.OLS(boston.MEDV, design)
result = model.fit()
print(result.summary())

# 2) Formula interface: response ~ predictor + predictor
est = smf.ols(formula='MEDV ~ LSTAT + CRIM', data=boston).fit()
print(est.summary())

# 3) GLM framework with a Gaussian family.
gaussian_family = sm.families.Gaussian()
Gaussian_model = sm.GLM(boston.MEDV, design, family=gaussian_family).fit()
print(Gaussian_model.summary())

# Python code for Naive Bayes Classification

In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score

# Data loading: iris measurements as a CSV hosted in a GitHub gist.
url = 'https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv'
df = pd.read_csv(url)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df.head())

# Features are the four measurements; the label is the species name.
x = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=3)

# Fit a Gaussian naive Bayes classifier and predict the held-out rows.
nb = GaussianNB()
nb.fit(x_train, y_train)
predictionsNB = nb.predict(x_test)

# Classification report, confusion matrix and accuracy on the test split.
print('Classification Report:\n', classification_report(y_test, predictionsNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, predictionsNB))
print('Accuracy Score:', accuracy_score(y_test, predictionsNB))

# CART

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Keep columns 1-5 of temp.csv and give them explicit names.
dataset = pd.read_csv('temp.csv')
dataset = pd.DataFrame(
    data=dataset.iloc[:, 1:6].values,
    columns=["outlook", "temprature", "humdity", "windy", "play"],
)

# Non-null counts per column for the rows whose outlook is "Rainy".
# The mask is named is_rainy so it does not shadow the built-in filter().
is_rainy = dataset["outlook"] == "Rainy"
print(dataset.where(is_rainy).count())

# Encode every column as integer codes. Work on a copy so the assignments
# do not write into a slice of `dataset`.
dataset_encoded = dataset.iloc[:, 0:5].copy()
le = LabelEncoder()
for i in dataset_encoded:
    dataset_encoded[i] = le.fit_transform(dataset_encoded[i])

print(dataset_encoded)
print(dataset)

In [None]:
# Feature set: the first four encoded columns.
X = dataset_encoded.iloc[:, 0:4].values
# Label set: the fifth encoded column ("play").
y = dataset_encoded.iloc[:, 4].values

# An integer test_size is an absolute row count, so exactly one row is held out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=2)

# Decision tree using the Gini impurity criterion.
model = DecisionTreeClassifier(criterion='gini')
model.fit(X_train, y_train)

# predict() returns an array; take its single element before comparing with the code 1.
if model.predict([[0, 1, 0, 1]])[0] == 1:
    print("yes you can play")
else:
    print("no you cant")

# Plot for the Decision Tree

In [None]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Data loading: iris measurements as a CSV hosted in a GitHub gist.
url='https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv'
df = pd.read_csv(url)
df.head()

# Four measurement columns as features, species as the label.
x = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']

# 75/25 train/test split with a fixed seed.
# train_test_split is not imported in this cell; it is taken from an earlier cell's import.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=3,
)

# Create the decision tree classifier and train it in one step (fit returns the estimator).
clf = DecisionTreeClassifier().fit(x_train, y_train)

# Predict the response for the test data set.
y_pred = clf.predict(x_test)

# Model accuracy: share of test rows classified correctly.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Decision Tree

In [None]:
# Render the fitted decision tree `clf` with Graphviz.
# sklearn.externals.six has been removed from scikit-learn; the standard-library
# StringIO provides the same in-memory text buffer.
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

x = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
class_names = ['setosa', 'versicolor', 'virginica']

# Write the tree in DOT format into an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=feature_cols,
    class_names=class_names,
)

# Convert the DOT text to a PNG, save it, and show it inline.
# NOTE(review): the file name says "diabetes" although the tree is built from the iris columns — confirm.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
Image(graph.create_png())

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd

# Create a random forest classifier with 100 trees.
clf = RandomForestClassifier(n_estimators=100)

# Train on the iris training split, then predict the matching test split.
# predict() must come after fit(); calling it first on X_test raised an error, and
# X_test (upper case) belongs to a different data set than x_train.
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Predict the class of one new observation, values in the column order of x_train.
print(clf.predict([[3, 5, 4, 2]]))

# Feature importances, largest first. They are labelled with the training columns
# because `iris` is not loaded until later in the notebook.
feature_imp = pd.Series(
    clf.feature_importances_, index=x_train.columns
).sort_values(ascending=False)
feature_imp

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

# Classification and Regression Tree (CART)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Boston Housing data set.
# load_boston was removed in scikit-learn 1.2. When the import fails, read the raw
# file from its original source instead, following the recipe given in the
# scikit-learn deprecation notice.
try:
    from sklearn.datasets import load_boston
except ImportError:
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+", skiprows=22, header=None,
    )
    # Each record spans two physical rows of the raw file: the even rows hold the
    # first features, the odd rows hold the last two features and then the target.
    features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    names = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                      'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])
else:
    boston_dataset = load_boston()
    features = boston_dataset.data
    target = boston_dataset.target
    names = boston_dataset.feature_names

# One DataFrame with the 13 features plus the target column MEDV.
boston = pd.DataFrame(features, columns=names)
boston['MEDV'] = target
print(boston.head())
print(names)
print(boston.shape)

# Plain arrays for scikit-learn: the first 13 columns are features, column 13 is MEDV.
array = boston.values
X = array[:, 0:13]
Y = array[:, 13]

# 70/30 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1234)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import random as rnd

# Regression tree limited to 20 leaf nodes, fitted on the training split.
model = DecisionTreeRegressor(max_leaf_nodes=20)
rt = model.fit(X_train, Y_train)
rt

# Pick one row of X with a seeded random index and predict its price.
rnd.seed(123458)
row_index = rnd.randrange(X.shape[0])
X_new = X[row_index].reshape(1, 13)
YHat = model.predict(X_new)

# Show the chosen row together with its prediction.
df = pd.DataFrame(X_new, columns=names)
df["Predicted Price"] = YHat
df.head(1)

# R-squared of the tree on the held-out split.
YHat = model.predict(X_test)
r2 = r2_score(Y_test, YHat)
print("R-Squared = ", r2)

In [None]:
from IPython.display import Image
from sklearn import tree
import matplotlib.image as mpimg
import pydotplus
import io
import matplotlib.pyplot as plt
import os
import sys

# Make the Graphviz executables discoverable. Executables are looked up through the
# PATH environment variable; sys.path only affects Python imports, so appending the
# directory there has no effect on finding `dot`.
str1 = "/usr/local/Cellar/graphviz/2.40.1_1/bin/"
current_path = os.environ.get("PATH", "")
if os.path.isdir(str1) and str1 not in current_path.split(os.pathsep):
    os.environ["PATH"] = current_path + os.pathsep + str1

# Export the fitted regression tree `rt` as DOT text into an in-memory buffer.
# class_names is left out: it only applies to classifiers, and a bare string is not
# a valid list of class names. feature_names is passed as a plain list.
dot_data = io.StringIO()
tree.export_graphviz(rt, out_file=dot_data, filled=True,
                     feature_names=list(names))

# Draw the graph: write it as a PNG, then read the file back and show it.
pydotplus.graph_from_dot_data(dot_data.getvalue()).write_png('dt.png')
plt.figure(figsize=(100, 100))
img = mpimg.imread('dt.png')
imgplot = plt.imshow(img)
plt.show()

# KNN algorithm

In [None]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:
# Data loading: read the iris measurements from a CSV hosted in a GitHub gist.
url='https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv'
df = pd.read_csv(url)
# Preview the first rows; shown because it is the last expression of the cell.
df.head()

In [None]:
# Pairwise scatter plots of all measurements, coloured and marked by species.
# pairplot creates its own figure, so a preceding plt.figure() would only leave an
# empty extra figure in the output.
sns.pairplot(df, hue="species", markers=["o", "s", "D"])
plt.show()

In [None]:
# Features are the four measurements; the label is the species name.
x = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=3)

# Fit a K-NN classifier on the training set.
# metric='minkowski' with p=2 is the Euclidean distance.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn.fit(x_train, y_train)
predictionsKNN = knn.predict(x_test)

# Classification report, confusion matrix and accuracy on the test split.
print('Classification Report:\n', classification_report(y_test, predictionsKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, predictionsKNN))
print('Accuracy Score:', accuracy_score(y_test, predictionsKNN))

In [None]:
from sklearn.model_selection import cross_val_score

# Candidate values of K for KNN: the odd numbers from 1 to 49.
k_list = list(range(1, 50, 2))

# Mean cross-validated accuracy for each K, in the same order as k_list.
cv_scores = []

# Perform 10-fold cross validation on the training split for every candidate K.
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

In [None]:
# Convert each mean accuracy into a misclassification error (1 - accuracy).
# The list keeps its name MSE, but it holds misclassification errors, not mean squared errors.
MSE = [1 - score for score in cv_scores]

# Set the seaborn style before the figure and axes are created so the grid style is
# picked up, and open a single figure (a second plt.figure() call would leave an
# empty figure behind).
sns.set_style("whitegrid")
plt.figure(figsize=(15, 10))
plt.title('The optimal number of neighbors', fontsize=20, fontweight='bold')
plt.xlabel('Number of Neighbors K', fontsize=15)
plt.ylabel('Misclassification Error', fontsize=15)
plt.plot(k_list, MSE)
plt.show()

# Best K: the first candidate with the smallest misclassification error.
best_k = k_list[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d." % best_k)

# k fold cross-validation.

In [None]:
# Import KFold from sklearn.model_selection: the sklearn.cross_validation module
# has been removed from scikit-learn.
from sklearn.model_selection import KFold

# Value of K is 10.
kfold = KFold(n_splits=10)

# split() yields one (train_indices, test_indices) pair per fold. The old
# indices=False option (boolean masks) no longer exists; integer index arrays are
# returned instead.
# NOTE(review): `train_set` is not defined anywhere in the visible notebook — confirm its source.
data = kfold.split(train_set)


# k fold cross-validation for Logistic Classification

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Cross-validate a default logistic regression on the bundled iris data,
# leaving the number of folds at cross_val_score's default.
iris = load_iris()
logreg = LogisticRegression()
scores = cross_val_score(estimator=logreg, X=iris.data, y=iris.target)

message = "Cross-validation scores: {}".format(scores)
print(message)

In [None]:
# Same evaluation with the number of folds set explicitly to five.
scores = cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=5)
message = "Cross-validation scores: {}".format(scores)
print(message)

A common way to summarize the cross-validation accuracy is to compute
the mean:

In [None]:
# Summarize the folds by their mean accuracy, shown with two decimals.
print("Average cross-validation score: {:.2f}".format(scores.mean()))

# OLS for Linear Regression Model

In [None]:
# Simple linear regression: MEDV explained by LSTAT plus an intercept.
# Uses the `boston` DataFrame built in the CART section of this notebook.
import statsmodels.api as sm

design = sm.add_constant(boston.LSTAT)
model = sm.OLS(boston.MEDV, design)
result = model.fit()
print(result.summary())

# OLS for Multiple Linear Regression Model

In [None]:
# Multiple linear regression of MEDV on LSTAT and CRIM, fitted three ways.
# Uses the `boston` DataFrame built in the CART section of this notebook.
import statsmodels.api as sm
import statsmodels.formula.api as smf

X = boston[['LSTAT', 'CRIM']]
design = sm.add_constant(X)

# 1) Array interface: response vector plus a design matrix with an intercept column.
model = sm.OLS(boston.MEDV, design)
result = model.fit()
print(result.summary())

# 2) Formula interface: response ~ predictor + predictor
est = smf.ols(formula='MEDV ~ LSTAT + CRIM', data=boston).fit()
print(est.summary())

# 3) GLM framework with a Gaussian family.
gaussian_family = sm.families.Gaussian()
Gaussian_model = sm.GLM(boston.MEDV, design, family=gaussian_family).fit()
print(Gaussian_model.summary())

# Logistic Regression

In [None]:
# Multiple linear regression of MEDV on LSTAT and CRIM, fitted three ways.
# NOTE(review): this cell repeats the multiple-linear-regression cell although the
# section is titled "Logistic Regression" — confirm whether it belongs here.
import statsmodels.api as sm
import statsmodels.formula.api as smf

X = boston[['LSTAT', 'CRIM']]
design = sm.add_constant(X)

# 1) Array interface: response vector plus a design matrix with an intercept column.
model = sm.OLS(boston.MEDV, design)
result = model.fit()
print(result.summary())

# 2) Formula interface: response ~ predictor + predictor
est = smf.ols(formula='MEDV ~ LSTAT + CRIM', data=boston).fit()
print(est.summary())

# 3) GLM framework with a Gaussian family.
gaussian_family = sm.families.Gaussian()
Gaussian_model = sm.GLM(boston.MEDV, design, family=gaussian_family).fit()
print(Gaussian_model.summary())

In [None]:
# Creating and fitting the logistic regression model (scikit-learn).
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Build the small Age / Smoking / HeartAttack sample here so the cell runs
# top-to-bottom: the notebook only creates this data in the cell that follows, so a
# fit on whatever X and y were left in the kernel would use unrelated data.
heart_data = pd.DataFrame({
    'Age': [40, 55, 48, 60, 35, 50, 45, 58, 42],
    'Smoking': [0, 1, 1, 0, 0, 1, 0, 1, 0],
    'HeartAttack': [0, 1, 0, 1, 0, 1, 0, 1, 0]
})

# Fit on the two raw predictors only (no constant column), so that coef_[0][0] is
# the Age coefficient and coef_[0][1] the Smoking coefficient; the intercept is read
# from intercept_.
model = LogisticRegression()
model.fit(heart_data[['Age', 'Smoking']], heart_data['HeartAttack'])

# Intercept and coefficients
intercept = model.intercept_[0]
coeff_age = model.coef_[0][0]
coeff_smoking = model.coef_[0][1]
print(f'Intercept (beta0): {intercept}')
print(f'Coefficient for Age (beta1): {coeff_age}')
print(f'Coefficient for Smoking (beta2): {coeff_smoking}')

In [None]:
import pandas as pd
import statsmodels.api as sm

# Step 1: Data Preparation
# NOTE(review): in this sample every subject aged 50 or more has HeartAttack = 1 and
# every younger subject has 0, so Age alone separates the outcome; statsmodels may
# warn or raise about perfect separation — confirm the data is as intended.
data = pd.DataFrame({
    'Age': [40, 55, 48, 60, 35, 50, 45, 58, 42],
    'Smoking': [0, 1, 1, 0, 0, 1, 0, 1, 0],
    'HeartAttack': [0, 1, 0, 1, 0, 1, 0, 1, 0]
})

# Step 2: Model Training
# add_constant adds a constant term for the intercept.
X = sm.add_constant(data[['Age', 'Smoking']])
y = data['HeartAttack']
model = sm.Logit(y, X)
result = model.fit()

# Step 3: Model Summary
print(result.summary())

# Logistic Regression Classifier

In [None]:
# Load the iris data set bundled with scikit-learn.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()

# Tabular view of the measurements, one column per feature name.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

# Categories of the target variable.
iris.target_names

# General notation: X holds the first four feature columns, Y the target.
X = iris.data[:, :4]
Y = iris.target

In [None]:
# Split the data set into training and test parts.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1234)

# Create an instance of the logistic regression classifier and fit it on the
# training split only. Fitting on the full X and Y would let the model see the test
# rows, so the score below would no longer measure performance on unseen data.
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, Y_train)

# See the accuracy rate on the held-out test split.
logreg.score(X_test, Y_test)

In [None]:
# Predict for the test data set and generate the classification report.
predictionsModel = logreg.predict(X_test)
print('Classification Report:\n', classification_report(Y_test, predictionsModel))
print('Confusion Matrix:\n', confusion_matrix(Y_test, predictionsModel))
print('Accuracy Score:', accuracy_score(Y_test, predictionsModel))

# SVM

In [None]:
# Load iris and prepare a train/test split for the SVM.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score

iris = load_iris()

# Feature names of the data set.
iris.feature_names

# Tabular view of the measurements.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

# Attach the target variable as an extra column.
df['target'] = iris.target
df.head()

# Categories of the target variable.
iris.target_names

# General notation: X holds the first four feature columns, Y the target.
X = iris.data[:, :4]
Y = iris.target

# 70/30 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

In [None]:
# Support vector machine classifier with default settings.
from sklearn.svm import SVC
model = SVC()
# model=SVC(C=10)

# Model fitting on the training split.
model.fit(X_train, Y_train)

# See the accuracy rate on the test split.
model.score(X_test, Y_test)

# Predict for the test data set and generate the classification report.
predictionsModel = model.predict(X_test)
print('Classification Report:\n', classification_report(Y_test, predictionsModel))
print('Confusion Matrix:\n', confusion_matrix(Y_test, predictionsModel))
print('Accuracy Score:', accuracy_score(Y_test, predictionsModel))

# Boruta Algorithm

In [None]:
!pip install boruta
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
import numpy as np
###initialize Boruta
forest = RandomForestRegressor(
n_jobs = -1,
max_depth = 5
)
boruta = BorutaPy(
estimator = forest,
n_estimators = 'auto',
max_iter = 100 # number of trials to perform
)
### fit Boruta (it accepts np.array, not pd.DataFrame)
boruta.fit(np.array(X), np.array(y))
### print results
green_area = X.columns[boruta.support_].to_list()
blue_area = X.columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)