# LOGISTIC REGRESSION

Logistic regression is used when we have to draw binary decision from the dataset, that is either 'TRUE' or 'FALSE' OR 'YES' of 'NO' etc.

Example include :  detecting patients survival, detecting cancer in patience, checking spam email.

In [13]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

#Reading dataset
dataset = pd.read_csv("E:\\ML Zero to Hero\\titanic.csv")
data = dataset[["pclass","survived", "sex", "age"]]

#Check number of rows and column in data
print(data.shape)

#Check Data
print(data.describe())

#Find number or missing values in each column and returning its sum, Here axis=0 means sum values of column
print(data.isna().sum(axis=0))

#Drop unknown values
dataset = data.dropna(subset=["sex", "pclass", "survived"])

#Now Check number of NaN in each column
print(dataset.isna().sum(axis=0))

#Transform male and female value of sex column into integer as Machine can't process text
dataset["sex"] = dataset["sex"].map({"male":0, "female":1})

#Extract feautures(x and y values) from dataset
x = dataset[["sex","age","pclass"]]
y = dataset[["survived"]]

#Preprocessing of missing values in dataset using Imputation
imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
x = imp.fit_transform(x)

#Transform data into test and training dataset
x_train, x_test, y_train, y_test = train_test_split(x, y)

#Train model using Logistic Regression
LR = LogisticRegression()
LR.fit(x_train, y_train)

#Drawing prediction from model
prediction = LR.predict(x_test)

#Finding confusion matrix
CM = confusion_matrix(y_test, prediction)

#Finding accuracy
acc = accuracy_score(y_test, prediction)

#Printing the results
print("Confusion Matrix = \n", CM)
print("\nAccuracy = \n" + str(round(acc * 100)) + "%")


(1310, 4)
            pclass     survived          age
count  1309.000000  1309.000000  1046.000000
mean      2.294882     0.381971    29.881135
std       0.837836     0.486055    14.413500
min       1.000000     0.000000     0.166700
25%       2.000000     0.000000    21.000000
50%       3.000000     0.000000    28.000000
75%       3.000000     1.000000    39.000000
max       3.000000     1.000000    80.000000
pclass        1
survived      1
sex           1
age         264
dtype: int64
pclass        0
survived      0
sex           0
age         263
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y = column_or_1d(y, warn=True)


Confusion Matrix = 
 [[178  32]
 [ 32  86]]

Accuracy = 
80.0%
