In [1]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score

st.title("Logistic Regression Assignment")

# Load dataset
df = pd.read_csv(r"C:\Users\Welcome\Desktop\Titanic_train.csv")
st.write(df.head())

# Display dataset info
st.write(df.info())
st.write(df.describe())
st.write("Shape of dataset:", df.shape)

# Correlation heatmap
z = df.corr(numeric_only=True)
fig, ax = plt.subplots()
sns.heatmap(z, annot=True, ax=ax)
fig, ax = plt.subplots()
st.pyplot(fig)

# Drop unnecessary columns and handle missing values
df = df.dropna()
df = df.drop(columns=["Cabin", "Embarked", "Name", "Sex", "Ticket", "PassengerId"])
st.write("Cleaned Data Shape:", df.shape)

# Summary statistics
summary = df.describe().T
st.write(summary)

# Histograms
fig, ax = plt.subplots(figsize=(15, 10))
df.hist(bins=20, color='skyblue', edgecolor='black', ax=ax)
plt.suptitle("Histogram of numerical values", fontsize=16)
fig, ax = plt.subplots()
st.pyplot(fig)

# Pairplot
fig = sns.pairplot(df)
fig, ax = plt.subplots()
st.pyplot(fig)

# Define features and target variable
X = df.drop('Survived', axis=1)
y = df['Survived']

# Train Logistic Regression model
classifier = LogisticRegression()
classifier.fit(X, y)

# Predictions
y_pred = classifier.predict(X)
y_pred_df = pd.DataFrame({'actual': y, 'predicted': y_pred})
st.write(y_pred_df.head())

# Confusion Matrix
conf_matrix = confusion_matrix(y, y_pred)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1'])
fig, ax = plt.subplots()
st.pyplot(fig)

# ROC Curve
fpr, tpr, thresholds = roc_curve(y, classifier.predict_proba(X)[:, 1])
auc = roc_auc_score(y, y_pred)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='red', label='Logit model (AUC = %0.2f)' % auc)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
ax.set_ylabel('True Positive Rate')
ax.legend()
fig, ax = plt.subplots()
st.pyplot(fig)


2025-03-16 14:37:23.215 
  command:

    streamlit run C:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


  df.hist(bins=20, color='skyblue', edgecolor='black', ax=ax)


DeltaGenerator()