<a href="https://www.kaggle.com/code/jaysreeborgohain/titanic-linear-regression-notebook1?scriptVersionId=161204334" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/d/jaysreeborgohain/titanic-machine-learning-from-disaster/train.csv
/kaggle/input/d/jaysreeborgohain/titanic-machine-learning-from-disaster/test.csv


In [2]:
# Read the training and test CSV files into DataFrames
train_data = pd.read_csv("/kaggle/input/d/jaysreeborgohain/titanic-machine-learning-from-disaster/train.csv")
test_data = pd.read_csv("/kaggle/input/d/jaysreeborgohain/titanic-machine-learning-from-disaster/test.csv")

# Preview the first rows of each frame (train first, then test)
for frame in (train_data, test_data):
    print(frame.head())

# Count the missing values in each column of the test set
missing_per_column = test_data.isna().sum()
print(missing_per_column)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [3]:
# Remove leading and trailing whitespace from column names
train_data.columns = train_data.columns.str.strip()
test_data.columns = test_data.columns.str.strip()

# Handle missing values.
# The filled column is assigned back rather than calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place call operates on
# an intermediate object, emits a FutureWarning on pandas 2.x and leaves the
# frame untouched once Copy-on-Write is active (the default in pandas 3.0).
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# Drop the 'Cabin' column due to its large number of missing values
train_data = train_data.drop(columns=['Cabin'])
test_data = test_data.drop(columns=['Cabin'])

# Convert categorical variables to numerical columns (one-hot encoding).
# Each frame is encoded on its own, so both must contain the same categories
# for the resulting dummy columns to line up.
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'])
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])


In [4]:
# Feature engineering: FamilySize is the sum of the SibSp and Parch columns,
# added to both the training and the test frame.
for frame in (train_data, test_data):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Features and target variable
features = ['Pclass', 'Sex_female','Sex_male', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize']
X = train_data[features]
y = train_data['Survived']

# Split BEFORE scaling so the held-out rows play no part in preprocessing.
# The same random_state yields the same row partition as splitting the scaled
# matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that fitted transform to
# the held-out rows. Fitting on the full matrix would leak the hold-out mean
# and variance into the training features and bias the accuracy estimate.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept so any code that still refers to X_scaled keeps working
X_scaled = scaler.transform(X)

# Initialize and train the model
model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8044692737430168


In [6]:
# Hyperparameter search with GridSearchCV

from sklearn.model_selection import GridSearchCV

# Base estimator handed to the search
model = LogisticRegression()

# Candidate values for the solver's iteration limit
param_grid = {'max_iter': [100, 500, 1000, 1500]}

# 5-fold cross-validated search scored on accuracy, fitted on the training split
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Predict the held-out split with the best estimator found by the search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Accuracy on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Best Parameters: {'max_iter': 100}
Accuracy: 0.8044692737430168


In [7]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Run the 5-fold accuracy grid search over param_grid on the training split
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Keep the winning parameters and the corresponding estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Score the best estimator with 5-fold cross-validation on the training split
cv_scores = cross_val_score(best_model, X_train, y_train, scoring='accuracy', cv=5)
print("Cross-Validation Mean Accuracy:", cv_scores.mean())


Best Parameters: {'max_iter': 100}
Cross-Validation Mean Accuracy: 0.7906628582684921


In [8]:
# Fit the estimator on the training split before predicting the competition rows
model.fit(X_train, y_train)

# Scale the competition test features with the already-fitted scaler.
# The result gets its own name so that X_test (the validation matrix paired
# with y_test) is not overwritten; reusing that name would make any earlier
# evaluation cell fail on a length mismatch if it were re-run afterwards.
X_submission = scaler.transform(test_data[features])

# Make predictions on the competition test data
y_pred_test = model.predict(X_submission)

# One row per passenger: identifier plus predicted label
result_df = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': y_pred_test
})

# Save the DataFrame to a CSV file in the current working directory
# ("/kaggle/working" on Kaggle)
result_df.to_csv('titanic_predictions.csv', index=False)