In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile

In [None]:
# Unpack the uploaded archive next to it, then load the spreadsheet it contains.
archive_path = "/content/full_data (1).xlsx.zip"
with zipfile.ZipFile(archive_path, mode='r') as zip_ref:
    zip_ref.extractall(path="/content/")

data = pd.read_excel(io="/content/full_data (1).xlsx")

In [None]:
# Quick look at the frame: first rows, numeric summary statistics,
# and the number of missing values in each column.
first_rows = data.head()
numeric_summary = data.describe()
missing_per_column = data.isna().sum()

for report in (first_rows, numeric_summary, missing_per_column):
    print(report)

   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1    Male  80.0             0              1          Yes        Private   
2  Female  49.0             0              0          Yes        Private   
3  Female  79.0             1              0          Yes  Self-employed   
4    Male  81.0             0              0          Yes        Private   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  
0          Urban             228.69  36.6  formerly smoked       1  
1          Rural             105.92  32.5     never smoked       1  
2          Urban             171.23  34.4           smokes       1  
3          Rural             174.12  24.0     never smoked       1  
4          Urban             186.21  29.0  formerly smoked       1  
               age  hypertension  heart_disease  avg_glucose_level  \
count  4981.000000   4981.000000    4981.000000        4981

In [None]:
# Visualize the distribution of the target variable (stroke).
# The column is passed by keyword on purpose: seaborn >= 0.12 takes the first
# positional argument as `data`, so `sns.countplot(data['stroke'])` is no
# longer interpreted as "count the values of this column along x".
sns.countplot(x='stroke', data=data)
plt.title('Distribution of Stroke')
plt.show()

In [None]:
# Expand every categorical column into indicator columns so that the whole
# frame can take part in the correlation computation.
data_encoded = pd.get_dummies(data)
correlation_matrix = data_encoded.corr()

# Annotated heatmap of the pairwise correlations.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Collect the names of the columns stored as object or category dtype.
non_numeric_frame = data.select_dtypes(include=['object', 'category'])
categorical_columns = non_numeric_frame.columns

# Show which columns were detected.
print("Categorical columns:", categorical_columns)


In [None]:
# List the distinct values found in each column, one column per line.
for column_name, column_values in data.items():
    print(column_name, column_values.unique())


In [None]:
# One-hot encode exactly the columns detected as categorical.
data_encoded = pd.get_dummies(data=data, columns=list(categorical_columns))


In [None]:
# Report the dtype pandas assigned to every column.
column_dtypes = data.dtypes
print(column_dtypes)


In [None]:
# One-hot encode the categorical variables, naming them explicitly.
data_encoded = pd.get_dummies(
    data,
    columns=[
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ],
)


In [None]:
# Columns that hold labels rather than measurements.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# Re-type each of them as pandas 'category', updating `data` column by column.
for col in categorical_columns:
    as_category = data[col].astype('category')
    data[col] = as_category

# Build the one-hot encoded copy of the frame from those columns.
data_encoded = pd.get_dummies(data=data, columns=categorical_columns)


In [None]:
# Preprocessing: build a fully numeric feature matrix for the classifier.

# 'stroke' is the target variable; every other column is a feature.
# The raw frame still contains text/categorical columns (gender, work_type,
# ...). Fitting the random forest on them directly fails because the values
# cannot be converted to float, so the features are one-hot encoded here
# instead of being taken straight from `data`.
X = pd.get_dummies(data.drop('stroke', axis=1))
y = data['stroke']

In [None]:
# 60/40 train/test partition (seeded so the split is repeatable).
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# 70/30 train/test partition with the same seed.
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
def train_evaluate_model(X_train, X_test, y_train, y_test, n_estimators, n_jobs, random_state=42):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : target labels matching the two feature matrices.
    n_estimators : number of trees, forwarded to ``RandomForestClassifier``.
    n_jobs : parallelism setting, forwarded to ``RandomForestClassifier``.
    random_state : seed forwarded to ``RandomForestClassifier``. Defaults to
        42, the value that was previously hard-coded, so existing calls keep
        producing the same model.

    Returns
    -------
    tuple
        ``(accuracy, mse, rmse, mae)`` computed from the test labels and the
        model's predictions on ``X_test``.
    """
    rf_classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # NOTE(review): the error metrics below are computed on predicted class
    # labels; if the target is 0/1 they presumably all mirror the
    # misclassification rate - confirm they add information beyond accuracy.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    return accuracy, mse, rmse, mae

In [None]:
def plot_performance_variation(x_values, y_values, xlabel, ylabel, title):
    """Draw ``y_values`` against ``x_values`` as a marked line chart and show it.

    ``xlabel``, ``ylabel`` and ``title`` are used as the axis labels and the
    chart title. A new 8x6 figure with a grid is created for every call.
    """
    _, axes = plt.subplots(figsize=(8, 6))
    axes.plot(x_values, y_values, marker='o', linestyle='-')
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_title(title)
    axes.grid(True)
    plt.show()

In [None]:
# Experiment grid: every (train, test) ratio is combined with every
# (n_estimators, n_jobs) setting.
train_test_ratios = [(0.6, 0.4), (0.7, 0.3)]
estimators_jobs_combinations = [(50, -1), (100, -1), (50, 2)]

# One list per metric; entries are appended in grid order
# (outer loop over ratios, inner loop over model settings).
accuracies, mses, rmes, maes = [], [], [], []

for train_ratio, test_ratio in train_test_ratios:
    # Only the test fraction is handed to the splitter.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=42
    )
    for n_estimators, n_jobs in estimators_jobs_combinations:
        accuracy, mse, rmse, mae = train_evaluate_model(
            X_train, X_test, y_train, y_test, n_estimators, n_jobs
        )
        for history, score in zip((accuracies, mses, rmes, maes), (accuracy, mse, rmse, mae)):
            history.append(score)


In [None]:
# One chart per recorded metric, in this order: MSE, RMSE, MAE, accuracy.
# The x axis is simply the position of each run in the experiment grid.
metric_charts = [
    (mses, 'Mean Squared Error', 'Mean Squared Error Fluctuations'),
    (rmes, 'Root Mean Squared Error', 'Root Mean Squared Error Shifts'),
    (maes, 'Mean Absolute Error', 'Mean Absolute Error Oscillations'),
    (accuracies, 'Accuracy', 'Accuracy Changes'),
]
for metric_values, metric_label, chart_title in metric_charts:
    plot_performance_variation(
        range(len(metric_values)), metric_values, 'Combinations', metric_label, chart_title
    )

# Human-readable label for each (n_estimators, n_jobs) setting.
estimators_jobs_labels = [
    f"Estimators: {tree_count}, Jobs: {job_count}"
    for tree_count, job_count in estimators_jobs_combinations
]

# Plotting mean squared error variation with estimators and jobs:
plot_performance_variation(estimators_jobs_labels, mses[-len(estimators_jobs_combinations):], 'Estimators & Jobs', 'Mean Squared Error', 'Mean Squared Error Cha
