In [1]:
import os 
import pandas as pd 
import numpy as np 
import plotly.express as px 
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm 

In [2]:
df = pd.read_csv('/kaggle/input/titanic-dataset/titanic.csv')

In [3]:
df.shape

(891, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
df['Age'].fillna(30,inplace=True)
df['Age'] = df['Age'].astype(int)


In [7]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
features = df[['Survived', 'Pclass','Sex', 'Age', 'SibSp',
       'Parch','Fare']]

In [9]:
dummy = pd.get_dummies(features)
dummy

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,0,3,22,1,0,7.2500,False,True
1,1,1,38,1,0,71.2833,True,False
2,1,3,26,0,0,7.9250,True,False
3,1,1,35,1,0,53.1000,True,False
4,0,3,35,0,0,8.0500,False,True
...,...,...,...,...,...,...,...,...
886,0,2,27,0,0,13.0000,False,True
887,1,1,19,0,0,30.0000,True,False
888,0,3,30,1,2,23.4500,True,False
889,1,1,26,0,0,30.0000,False,True


In [10]:
columns_to_convert = ['Sex_female', 'Sex_male']
dummy[columns_to_convert] = dummy[columns_to_convert].astype(int)


In [11]:
dummy

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,0,3,22,1,0,7.2500,0,1
1,1,1,38,1,0,71.2833,1,0
2,1,3,26,0,0,7.9250,1,0
3,1,1,35,1,0,53.1000,1,0
4,0,3,35,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...
886,0,2,27,0,0,13.0000,0,1
887,1,1,19,0,0,30.0000,1,0
888,0,3,30,1,2,23.4500,1,0
889,1,1,26,0,0,30.0000,0,1


In [12]:
dummy.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
Survived,1.0,-0.338481,-0.070649,-0.035322,0.081629,0.257307,0.543351,-0.543351
Pclass,-0.338481,1.0,-0.329822,0.083081,0.018443,-0.5495,-0.1319,0.1319
Age,-0.070649,-0.329822,1.0,-0.232201,-0.180581,0.090782,-0.084241,0.084241
SibSp,-0.035322,0.083081,-0.232201,1.0,0.414838,0.159651,0.114631,-0.114631
Parch,0.081629,0.018443,-0.180581,0.414838,1.0,0.216225,0.245489,-0.245489
Fare,0.257307,-0.5495,0.090782,0.159651,0.216225,1.0,0.182333,-0.182333
Sex_female,0.543351,-0.1319,-0.084241,0.114631,0.245489,0.182333,1.0,-1.0
Sex_male,-0.543351,0.1319,0.084241,-0.114631,-0.245489,-0.182333,-1.0,1.0


In [13]:
fig = px.histogram(df,x='Sex',
                  title = 'Total Number on Board by Gender',
               width=300,
                  height=400)
fig.update_layout(font=dict(size=7))
fig.show()

fig = px.histogram(df,x='Sex',y='Survived',
                  title='Survivors By Gender',
                  width=300,
                  height=400,
                  labels={'Survived':'Survivors'})
fig.update_layout(font=dict(size=7))
fig

## Comment
The analysis reveals that there were more men on board than women on board having total of 577 men and 314 women, men had 64.76% population while women had 35.24%

Looking at the total number of survivors , more women  survived compared to that of the male where the total number of women who survived was 233 and the total number of men that survived was 109.

Men made up 64.76% of the total population but only 18.89% of all the men on baord survived while the women made up a total of 35.24% and 74.20% of the total woemn on baord survived 

In [14]:
fig = px.histogram(df,x='Pclass',
                  title = 'Total Number on Board by Class',
               width=300,
                  height=400)
fig.update_layout(font=dict(size=7))
fig.show()

fig = px.histogram(df,x='Pclass',y='Survived',
                  title='Survivors By Class',
                  width=300,
                  height=400,
                  labels={'Survived':'Survivors'})
fig.update_layout(font=dict(size=7))
fig


## Comment
Analysing the class of the passengers, analysis shows that out of all 891 passengers on the ship , 491 got the third class ticket which only means that 55% of passengers got the third class ticket, and 184 passenger got the second class ticket which makes it 21% of the total passengers on board while 216 passenger got the first class ticket which was  24% of the passengers who were on board 

Out of all 891 passengers who were on board only 342 passengers survived, 136 passengers(40% of survivors) who boarded the first class survived, 87 passengers(25% of survivors) boarded the second class while 119 passengers(35% of survivors) boarded third class 

In [15]:
fig = px.histogram(df,x='Sex',y='Fare',
                  title = 'Total Number on Board by Gender and Fare',
               width=250,
                  height=400)
fig.update_layout(font=dict(size=7))
fig.show()


fig = px.histogram(df,x='Fare',y='Survived',
                  title = 'Total Number Survivors by Fares',
               width=400,
                  height=400)
fig.update_layout(font=dict(size=7))
fig





In [16]:
dummy.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male'],
      dtype='object')

In [17]:
x = dummy[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male']]
y = dummy['Survived']

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state = 32)

In [19]:
model = linear_model.LinearRegression()
model

In [20]:
model.fit(x_train,y_train)

In [21]:
coefficient = model.coef_
coefficient

array([-0.16743027, -0.00605202, -0.05148265, -0.01517782,  0.00051191,
        0.251468  , -0.251468  ])

In [22]:
r2 = model.score(x_test,y_test)
r2

0.40641354091326587

In [29]:
prediction = model.predict()

## The Linear Regression model explains 40.64% in the varaibility of the Total number of survivvors which means that further investigation can be carried out to find out other variables that might have affected the vaiability of the survivors  

In [23]:
X = sm.add_constant(x)

In [24]:
model = sm.OLS(y,X).fit()

In [25]:
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               Survived   R-squared:                       0.395
Model:                            OLS   Adj. R-squared:                  0.391
Method:                 Least Squares   F-statistic:                     96.21
Date:                Wed, 29 Nov 2023   Prob (F-statistic):           5.19e-93
Time:                        13:17:56   Log-Likelihood:                -398.06
No. Observations:                 891   AIC:                             810.1
Df Residuals:                     884   BIC:                             843.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6957      0.046     15.158      0.0

# Results

### R-squared: 
The model explains about 39.5% of the variation in 'Survived', meaning it captures a good portion of the trends in the data.

### Adjusted R-squared: 
This considers the number of predictors, giving a more realistic view of model performance. It's around 39.1%.

### Coefficients: 
Each variable has a coefficient indicating its impact on 'Survivors'. For instance, 'Passnger class' has a coefficient of -0.1699, suggesting that as 'Passenger class' increases, the survivors tends to decrease.

### p-values: 
These tell us if the coefficients are statistically significant. Smaller values (typically < 0.05) indicate significance. For example, 'Passenger class' and 'Age' seem significant, while 'Parent and children' and 'Fare' might not be.

## Interpretation Tips

### Negative Coefficients: 
A negative coefficient suggests that as the variable increases, 'Survivors' tends to decrease.

### Positive Coefficients: 
A positive coefficient suggests that as the variable increases, 'Survivors' tends to increase.

## Model Diagnostics

### Multicollinearity: 
The results mention possible multicollinearity issues. Check if any variables are highly correlated, as this can affect the stability of coefficient estimates.

## Recommendations

Examine variables with high p-values for potential exclusion or further analysis.

Investigate multicollinearity issues and consider addressing them.

These results provide insights into how different factors may influence the likelihood of survival. Keep in mind the nuances of interpreting coefficients and the need for further exploration.

In [27]:
df['Survival Probability']= prediction

In [28]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survival Probability
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S,0.108003
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,0.892659
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S,0.640486
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S,0.902769
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S,0.075236
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0000,,S,0.294220
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0000,B42,S,1.030587
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,30,1,2,W./C. 6607,23.4500,,S,0.539815
889,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0000,C148,C,0.477043


pls upvote if you find this intersting 