# Import Dependencies

In [3]:
#import the dependencies
import numpy as np # for numpy arrays
import pandas as pd # provides dataset features: loading data into data frames
from sklearn.model_selection import train_test_split #this function is used to split test and train data automatically
from sklearn.linear_model import LogisticRegression #imports the machine learning algorithm model
from sklearn.metrics import accuracy_score # finds accuracy of our model

# Data collection and processing

In [5]:
# Read the sonar dataset from disk into a pandas DataFrame named sonar_data
sonar_data = pd.read_csv(
    'sonar data.csv',
    header=None,  # the CSV has no header row, so pandas labels the columns 0, 1, 2, ...
)

In [6]:
# Take a quick look at the dataset to confirm that it loaded correctly
sonar_data.head() # shows the first 5 rows (the default for head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [8]:
# Number of rows and columns
# shape is an attribute of the pandas DataFrame; it evaluates to a (rows, columns) tuple
sonar_data.shape

(208, 61)

In [9]:
# Get summary statistics for this data
# describe() reports the count, mean, std, min, quartiles and max of each numeric column
sonar_data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [10]:
# Count how many rock (R) and mine (M) samples the dataset contains
sonar_data[60].value_counts() #[60] is the label of the column that holds the R/M class
# the two classes are roughly balanced (111 M vs 97 R), so plain accuracy is a reasonable metric here
# note that the dataset is small: only 208 instances, which is very little for ML

M    111
R     97
Name: 60, dtype: int64

In [11]:
# Compute the mean of every feature column separately for the M and R classes
sonar_data.groupby(60).mean()
# the per-class means differ (e.g. column 0: 0.034989 for M vs 0.022498 for R); such differences are what the prediction relies on

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
60,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M,0.034989,0.045544,0.05072,0.064768,0.086715,0.111864,0.128359,0.149832,0.213492,0.251022,...,0.019352,0.016014,0.011643,0.012185,0.009923,0.008914,0.007825,0.00906,0.008695,0.00693
R,0.022498,0.030303,0.035951,0.041447,0.062028,0.096224,0.11418,0.117596,0.137392,0.159325,...,0.012311,0.010453,0.00964,0.009518,0.008567,0.00743,0.007814,0.006677,0.007078,0.006024


In [12]:
# Separate the features from the labels
# The features are the numerical columns, while the label is the last column (with R and M)

# ============= DATA ==================
# X holds every column except the label column 60, i.e. columns 0 to 59.
# columns= already says which axis to drop from, so no axis argument is needed.
X = sonar_data.drop(columns=60)

# ============= LABEL ================
# Y holds only the label column 60
Y = sonar_data[60]

print(X)
print(Y)

         0       1       2       3       4       5       6       7       8   \
0    0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   
1    0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   
2    0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   
3    0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   
4    0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   
..      ...     ...     ...     ...     ...     ...     ...     ...     ...   
203  0.0187  0.0346  0.0168  0.0177  0.0393  0.1630  0.2028  0.1694  0.2328   
204  0.0323  0.0101  0.0298  0.0564  0.0760  0.0958  0.0990  0.1018  0.1030   
205  0.0522  0.0437  0.0180  0.0292  0.0351  0.1171  0.1257  0.1178  0.1258   
206  0.0303  0.0353  0.0490  0.0608  0.0167  0.1354  0.1465  0.1123  0.1945   
207  0.0260  0.0363  0.0136  0.0272  0.0214  0.0338  0.0655  0.1400  0.1843   

         9   ...      50      51      52      53   

# Split into Train and Test data

In [13]:
# Split the features and labels into a training part and a held-out test part.
#   X_train / Y_train : features and labels used to fit the model
#   X_test  / Y_test  : features and labels kept aside for evaluation
# Arguments:
#   test_size=0.1  -> 10% of the rows go to the test set, 90% to the training set
#   stratify=Y     -> keep the rock/mine proportions of Y in both the train and the test set
#   random_state=1 -> fix the seed of the shuffle so the same split is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=1,
)

In [14]:
# Confirm the split by comparing shapes.
# Each line shows: original shape, training-set shape, test-set shape
# (features on the first line, labels on the second).
print(*(part.shape for part in (X, X_train, X_test)))
print(*(part.shape for part in (Y, Y_train, Y_test)))

(208, 60) (187, 60) (21, 60)
(208,) (187,) (21,)


In [17]:
# Show the training features followed by the training labels, one after the other
print(X_train, Y_train, sep="\n")

         0       1       2       3       4       5       6       7       8   \
115  0.0414  0.0436  0.0447  0.0844  0.0419  0.1215  0.2002  0.1516  0.0818   
38   0.0123  0.0022  0.0196  0.0206  0.0180  0.0492  0.0033  0.0398  0.0791   
56   0.0152  0.0102  0.0113  0.0263  0.0097  0.0391  0.0857  0.0915  0.0949   
123  0.0270  0.0163  0.0341  0.0247  0.0822  0.1256  0.1323  0.1584  0.2017   
18   0.0270  0.0092  0.0145  0.0278  0.0412  0.0757  0.1026  0.1138  0.0794   
..      ...     ...     ...     ...     ...     ...     ...     ...     ...   
140  0.0412  0.1135  0.0518  0.0232  0.0646  0.1124  0.1787  0.2407  0.2682   
5    0.0286  0.0453  0.0277  0.0174  0.0384  0.0990  0.1201  0.1833  0.2105   
154  0.0117  0.0069  0.0279  0.0583  0.0915  0.1267  0.1577  0.1927  0.2361   
131  0.1150  0.1163  0.0866  0.0358  0.0232  0.1267  0.2417  0.2661  0.4346   
203  0.0187  0.0346  0.0168  0.0177  0.0393  0.1630  0.2028  0.1694  0.2328   

         9   ...      50      51      52      53   

# Model Training --- Logistic Regression model

In [19]:
# Create the logistic regression classifier imported from sklearn (default settings)
# and keep it in a variable called model
model = LogisticRegression()


In [20]:
# Train the logistic regression model on the training data
# model.fit() learns the model parameters from the X_train features and the Y_train labels
model.fit(X_train, Y_train)

LogisticRegression()

# Model Evaluation

In [21]:
# Accuracy on the training data: predict the training rows, then compare with their true labels.
# accuracy_score expects (y_true, y_pred), so the true labels go first and the predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)


In [22]:
# Report the fraction of training samples that the model classified correctly
print("Accuracy on training data: ", training_data_accuracy  )

Accuracy on training data:  0.8342245989304813


In [23]:
# Accuracy on the held-out test data: predict the test rows, then compare with their true labels.
# accuracy_score expects (y_true, y_pred), so the true labels go first and the predictions second.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [25]:
# Report the fraction of held-out test samples that the model classified correctly
print("Accuracy on test data: ", test_data_accuracy  )

Accuracy on test data:  0.7619047619047619


# Making a predictive system
<!-- Tells whether it's a mine or a rock -->

In [32]:
# Feature values of one sonar reading (60 numbers, held in a tuple) that the model
# should classify as a mine or a rock
input_data = (
    0.0635, 0.0709, 0.0453, 0.0333, 0.0185, 0.1260, 0.1015, 0.1918, 0.3362, 0.3900,
    0.4674, 0.5632, 0.5506, 0.4343, 0.3052, 0.3492, 0.3975, 0.3875, 0.5280, 0.7198,
    0.7702, 0.8562, 0.8688, 0.9236, 1.0000, 0.9662, 0.9822, 0.7360, 0.4158, 0.2918,
    0.3280, 0.3690, 0.3450, 0.2863, 0.0864, 0.3724, 0.4649, 0.3488, 0.1817, 0.1142,
    0.1220, 0.2621, 0.4461, 0.4726, 0.3263, 0.1423, 0.0390, 0.0406, 0.0311, 0.0086,
    0.0154, 0.0048, 0.0025, 0.0087, 0.0072, 0.0095, 0.0086, 0.0085, 0.0040, 0.0051,
)

# Turn the tuple into a NumPy array
input_data_as_numpy = np.asarray(input_data)

# The model predicts on a 2D array with one row per sample, so add a leading axis:
# the single reading becomes one row of shape (1, 60)
input_data_reshaped = input_data_as_numpy[np.newaxis, :]

# Ask the trained model for the label of this reading and show the raw result,
# which is an array holding one predicted label
prediction = model.predict(input_data_reshaped)
print(prediction)

# prediction[0] is the predicted label: 'R' is reported as a rock, anything else as a mine
print("The object is a Rock" if prediction[0] == 'R' else "The object is a Mine")

['M']
The object is a Mine
