### Evaluating Training Algorithms

In [None]:
# Import all libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [None]:
# Data Cleaning
# Load the raw dataset from disk.
DATA_FRAME = pd.read_csv('../data/breast_cancer.csv')

# Target is the 'diagnosis' column; the features are every column except
# 'id' and 'diagnosis'.
y = DATA_FRAME['diagnosis']
X = DATA_FRAME.drop(columns=['id', 'diagnosis'])

# Replace missing feature values with the median of their column.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Bare expression at the end of the cell: shows the loaded frame as the cell output.
DATA_FRAME

In [None]:
# Data Modeling
# Compare four classifiers on a single held-out split (20% of the rows).

# Fixed seed so the split and the randomized models produce the same
# accuracies on every run; without it the comparison is not repeatable.
RANDOM_STATE = 42

# Split the raw features first and fit the imputer on the training rows only,
# so the medians used for imputation never see the test rows (no data leakage).
X_train_raw, x_test_raw, y_for_train, y_for_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
split_imputer = SimpleImputer(strategy='median')
X_for_train = split_imputer.fit_transform(X_train_raw)
x_for_test = split_imputer.transform(x_test_raw)

Decision_Tree_Model = DecisionTreeClassifier(random_state=RANDOM_STATE)
Logistic_Regression_Model = LogisticRegression(solver='lbfgs', max_iter=10000)
Support_Vector_Machine_Model = svm.SVC(kernel='linear')
Random_Forest_Model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# Training and measuring accuracy
for classifier in (
    Decision_Tree_Model,
    Logistic_Regression_Model,
    Support_Vector_Machine_Model,
    Random_Forest_Model,
):
    classifier.fit(X_for_train, y_for_train)

DT_prediction = Decision_Tree_Model.predict(x_for_test)
LR_prediction = Logistic_Regression_Model.predict(x_for_test)
SVM_prediction = Support_Vector_Machine_Model.predict(x_for_test)
RF_prediction = Random_Forest_Model.predict(x_for_test)

# Calculating accuracy (fraction of test rows predicted correctly)
DT_score = accuracy_score(y_for_test, DT_prediction)
LR_score = accuracy_score(y_for_test, LR_prediction)
SVM_score = accuracy_score(y_for_test, SVM_prediction)
RF_score = accuracy_score(y_for_test, RF_prediction)

# Report each accuracy as a percentage.
for label, score in (
    ("Decision Tree", DT_score),
    ("Logistic Regression", LR_score),
    ("Support Vector Machine", SVM_score),
    ("Random Forest", RF_score),
):
    print(label + " accuracy =", score * 100, "%")

### Creating Persistent Model

In [None]:
# Import libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.impute import SimpleImputer

# Data cleaning
DATA_FRAME = pd.read_csv('../data/breast_cancer.csv')
imputer = SimpleImputer(strategy='median')

# Features are every column except 'id' and 'diagnosis'; 'diagnosis' is the target.
X = DATA_FRAME.drop(columns=['id', 'diagnosis'])
y = DATA_FRAME['diagnosis']
X_imputed = imputer.fit_transform(X)

# Data Modeling
# Train on the whole dataset (no hold-out split in this cell). The seed keeps
# the saved artifact identical from one run to the next.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_imputed, y)

# Creating a Persisting model
# The dump used to be commented out, so the file loaded by the next cell was
# never produced on a fresh checkout. Create the target directory if needed
# and write the model.
# NOTE(review): only the classifier is saved; the fitted imputer is not, so
# the saved object does not impute missing values in new inputs.
import os

MODEL_PATH = 'model/breast_cancer_prediction_model.joblib'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(model, MODEL_PATH)

In [None]:
# Testing if persistent model works accurately
# Reload the saved model from disk and classify one hand-entered sample.

model = joblib.load('model/breast_cancer_prediction_model.joblib')

# One sample of 30 feature values, in the order the model was trained on.
test_data = [
    9.173, 13.86, 59.2, 260.9, 0.07721, 0.08751, 0.05988, 0.0218, 0.2341, 0.06963,
    0.4098, 2.265, 2.608, 23.52, 0.008738, 0.03938, 0.04312, 0.0156, 0.04192, 0.005822,
    10.01, 19.23, 65.59, 310.1, 0.09836, 0.1678, 0.1397, 0.05087, 0.3282, 0.0849,
]

# predict() takes a batch of rows, so wrap the single sample in a list.
single_row_batch = [test_data]
predictions = model.predict(single_row_batch)
print("The Prediction:", predictions[0])
