In [3]:
import pandas as pd
import plotly.express as px

# Assuming your CSV file is named 'your_data.csv'
df = pd.read_csv('stroke-data.csv')

# 1. Age vs. Avg Glucose Level with color-coded points based on 'smoking_status'
fig1 = px.scatter(df, x='age', y='avg_glucose_level', color='smoking_status', title='Age vs. Avg Glucose Level',
                  labels={'age': 'Age', 'avg_glucose_level': 'Avg Glucose Level'},
                  color_discrete_map={'smokes': 'blue', 'formerly smoked': 'orange', 'never smoked': 'green'})

# 2. BMI vs. Avg Glucose Level with color-coded points based on 'stroke' status
fig2 = px.scatter(df, x='bmi', y='avg_glucose_level', color='stroke', title='BMI vs. Avg Glucose Level',
                  labels={'bmi': 'BMI', 'avg_glucose_level': 'Avg Glucose Level'},
                  color_discrete_map={0: 'green', 1: 'red'})

# 3. Hypertension vs. Heart Disease with color-coded bars based on 'heart_disease' status
fig3 = px.bar(df, x='hypertension', color='heart_disease', title='Hypertension vs. Heart Disease',
              labels={'hypertension': 'Hypertension', 'count': 'Count'},
              color_discrete_map={0: 'green', 1: 'red'})

# 4. Age vs. Stroke with color-coded points based on 'stroke' status
fig4 = px.scatter(df, x='age', y='stroke', color='stroke', title='Age vs. Stroke',
                  labels={'age': 'Age', 'stroke': 'Stroke'},
                  color_discrete_map={0: 'green', 1: 'red'})

# 5. Gender Distribution with different colors for each gender
fig5 = px.bar(df, x='gender', color='gender', title='Gender Distribution',
              labels={'gender': 'Gender', 'count': 'Count'})

# 6. Work Type and Smoking Status with color-coded bars based on 'smoking_status'
fig6 = px.bar(df, x='work_type', color='smoking_status', title='Work Type and Smoking Status',
              labels={'work_type': 'Work Type', 'count': 'Count'},
              color_discrete_map={'smokes': 'blue', 'formerly smoked': 'orange', 'never smoked': 'green'})

# 7. Residence Type and Average Glucose Level with color-coded bars based on 'Residence_type'
fig7 = px.bar(df, x='Residence_type', y='avg_glucose_level', color='Residence_type',
              title='Residence Type and Avg Glucose Level',
              labels={'Residence_type': 'Residence Type', 'avg_glucose_level': 'Avg Glucose Level'},
              color_discrete_map={'Urban': 'blue', 'Rural': 'green'})

# Show the plots
fig1.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()
fig6.show()
fig7.show()


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming your CSV file is named 'your_data.csv'
df = pd.read_csv('stroke-data.csv')

# Selecting predictor variables (X) and the target variable (y)
X = df.drop('stroke', axis=1)
y = df['stroke']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Defining categorical and numerical columns
categorical_cols = ['gender', 'work_type', 'Residence_type', 'smoking_status']
numerical_cols = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

# Creating transformers for preprocessing
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Creating a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Creating a pipeline with preprocessing and logistic regression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(random_state=42))])



# Training the model
model.fit(X_train, y_train)


# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# classification_rep = classification_report(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)




# Displaying the results
print(f'Accuracy: {accuracy:.2f}')
print('\nConfusion Matrix:')
print(conf_matrix)
print('\nClassification Report:')
print(classification_rep)


Accuracy: 0.94

Confusion Matrix:
[[960   0]
 [ 62   0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       1.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.97      0.50      0.48      1022
weighted avg       0.94      0.94      0.91      1022

