## **Importing Libraries**

In [132]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## **Reading the Dataset**

In [133]:
data = pd.read_csv("/content/titanic.csv")
data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,?,C,?,328,?
1305,3,0,"Zabour, Miss. Thamine",female,?,1,0,2665,14.4542,?,C,?,?,?
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,?,C,?,304,?
1307,3,0,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,?,C,?,?,?


## **Analyzing the Data**

In [134]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [135]:
data.describe()

Unnamed: 0,pclass,survived,sibsp,parch
count,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,0.498854,0.385027
std,0.837836,0.486055,1.041658,0.86556
min,1.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0
50%,3.0,0.0,0.0,0.0
75%,3.0,1.0,1.0,0.0
max,3.0,1.0,8.0,9.0


In [136]:
data.describe(include = 'object')

Unnamed: 0,name,sex,age,ticket,fare,cabin,embarked,boat,body,home.dest
count,1309,1309,1309,1309,1309.0,1309,1309,1309,1309,1309
unique,1307,2,99,929,282.0,187,4,28,122,370
top,"Connolly, Miss. Kate",male,?,CA. 2343,8.05,?,S,?,?,?
freq,2,843,263,11,60.0,1014,914,823,1188,564


## **Checking for Null values**

In [137]:
data.isnull().sum()

pclass       0
survived     0
name         0
sex          0
age          0
sibsp        0
parch        0
ticket       0
fare         0
cabin        0
embarked     0
boat         0
body         0
home.dest    0
dtype: int64

## **Removing unwanted columns permanently OR Feature Selection**

In [138]:
data.drop(columns= ['body','boat','home.dest','cabin','name','ticket','embarked'], inplace = True)

In [139]:
data

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,female,29,0,0,211.3375
1,1,1,male,0.9167,1,2,151.55
2,1,0,female,2,1,2,151.55
3,1,0,male,30,1,2,151.55
4,1,0,female,25,1,2,151.55
...,...,...,...,...,...,...,...
1304,3,0,female,14.5,1,0,14.4542
1305,3,0,female,?,1,0,14.4542
1306,3,0,male,26.5,0,0,7.225
1307,3,0,male,27,0,0,7.225


## **Cleaning the data and Type Casting of Columns**

In [171]:
data['age'] # the column has two values : "?"

0       29.000000
1        0.916700
2        2.000000
3       30.000000
4       25.000000
          ...    
1304    14.500000
1305    23.877515
1306    26.500000
1307    27.000000
1308    29.000000
Name: age, Length: 1309, dtype: float64

In [141]:
data['age'] = data['age'].replace("?", 0)

In [142]:
data['age']= data['age'].astype('float')

In [143]:
data['age']

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305     0.0000
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64

In [144]:
data.age.mean()

23.877514667685258

### Replacing the age = 0 value, with the mean of the age column

In [147]:
data['age'] = data['age'].replace(0,data.age.mean())

In [148]:
data.age

0       29.000000
1        0.916700
2        2.000000
3       30.000000
4       25.000000
          ...    
1304    14.500000
1305    23.877515
1306    26.500000
1307    27.000000
1308    29.000000
Name: age, Length: 1309, dtype: float64

In [149]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   object 
 3   age       1309 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   fare      1309 non-null   object 
dtypes: float64(1), int64(4), object(2)
memory usage: 71.7+ KB


## Fare column

In [150]:
data.fare = data.fare.replace('?',0.0)

In [151]:
data['fare'] = data.fare.astype("float")

In [152]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   object 
 3   age       1309 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   fare      1309 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 71.7+ KB


###  ~Memory usage also get reduced by changing the data types

In [154]:
data.head(7)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,1,1,female,29.0,0,0,211.3375
1,1,1,male,0.9167,1,2,151.55
2,1,0,female,2.0,1,2,151.55
3,1,0,male,30.0,1,2,151.55
4,1,0,female,25.0,1,2,151.55
5,1,1,male,48.0,0,0,26.55
6,1,1,female,63.0,1,0,77.9583


## **Now tha data is cleaned for applying Logistic Regression**

## APPLYING THE MODEL

## **Encoding of categorical variables (because ML models accepts integers)**

#### The get_dummies function is used to convert categorical variables into dummy or indicator variables.                                                                                                                                                        By dropping one of the one-hot encoded columns from each categorical feature, we ensure there are no "reference" columns—the remaining columns become linearly independent.

In [155]:
sex= pd.get_dummies(data["sex"],drop_first=True)

In [156]:
pclass = pd.get_dummies(data["pclass"],drop_first=True)


In [157]:
data = pd.concat([data,pclass,sex],axis=1)  # concatinating the original data with new dummy columns

In [158]:
data

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,2,3,male
0,1,1,female,29.000000,0,0,211.3375,0,0,0
1,1,1,male,0.916700,1,2,151.5500,0,0,1
2,1,0,female,2.000000,1,2,151.5500,0,0,0
3,1,0,male,30.000000,1,2,151.5500,0,0,1
4,1,0,female,25.000000,1,2,151.5500,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1304,3,0,female,14.500000,1,0,14.4542,0,1,0
1305,3,0,female,23.877515,1,0,14.4542,0,1,0
1306,3,0,male,26.500000,0,0,7.2250,0,1,1
1307,3,0,male,27.000000,0,0,7.2250,0,1,1


In [159]:
data.drop(columns= ["sex",'pclass'],axis=1,inplace=True) # dropping the reference columns


In [160]:
data.head(7)

Unnamed: 0,survived,age,sibsp,parch,fare,2,3,male
0,1,29.0,0,0,211.3375,0,0,0
1,1,0.9167,1,2,151.55,0,0,1
2,0,2.0,1,2,151.55,0,0,0
3,0,30.0,1,2,151.55,0,0,1
4,0,25.0,1,2,151.55,0,0,0
5,1,48.0,0,0,26.55,0,0,1
6,1,63.0,1,0,77.9583,0,0,0


In [161]:
X = data.drop("survived",axis=1)  # Independent variables
y = data["survived"] # Dependent variable "Survived column"

In [162]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  1309 non-null   int64  
 1   age       1309 non-null   float64
 2   sibsp     1309 non-null   int64  
 3   parch     1309 non-null   int64  
 4   fare      1309 non-null   float64
 5   2         1309 non-null   uint8  
 6   3         1309 non-null   uint8  
 7   male      1309 non-null   uint8  
dtypes: float64(2), int64(3), uint8(3)
memory usage: 55.1 KB


## Splitting the data into train and test where test ratio = 20%

In [163]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split (X,y, test_size = 0.2, random_state = 1)

## **making an object named 'model' of LogisticRegression Class**

In [164]:
model = LogisticRegression ()

## **Fitting the model**

In [165]:
model.fit (X_train.values, Y_train) # .values is used to get only the values of dependent features and not their names, otherwise an error woukd be generated

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## **Making predictions on test data**

In [124]:
predictions = model.predict (X_test.values)

## Comparing the model predictions with Y_test or ground truth of testing data

In [167]:
summary = classification_report (Y_test, predictions)
print(summary)

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       156
           1       0.84      0.71      0.77       106

    accuracy                           0.83       262
   macro avg       0.83      0.81      0.82       262
weighted avg       0.83      0.83      0.83       262



## **Accuracy of the Results:**

In [168]:
accuracy_score (Y_test, predictions)

0.8282442748091603

## The model is 82.82% accuracte!

## **Confusion Matrix for more clarification:**

In [169]:
print (confusion_matrix (Y_test, predictions))

[[142  14]
 [ 31  75]]


**True positive: 142** (We predicted a positive result and it was positive)                                                                      
**True negative: 75** (We predicted a negative result and it was negative)                                                             
**False positive: 14** (We predicted a positive result and it was negative)                                                                       
**False negative: 31** (We predicted a negative result and it was positive)