In [None]:
import pickle
import warnings

import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

px.defaults.template = "plotly_dark"

warnings.filterwarnings('ignore')



In [None]:
# Load the raw dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('dataset_skizofrenia.csv')


In [None]:
df.head()

Unnamed: 0,Name,Age,Gender,Marital_Status,Fatigue,Slowing,Pain,Hygiene,Movement,Schizophrenia
0,Leslie Goodwin,68,Female,Single,0.698075,0.123064,0.375303,0.234639,0.251869,Elevated Proneness
1,Dr. Troy Castaneda,88,Male,Married,0.049245,-0.04208,0.432807,0.501238,0.379948,Moderate Proneness
2,Chelsey Allen,67,Female,Married,0.651995,0.187117,,0.301942,0.302588,Elevated Proneness
3,Dr. Devin Skinner DVM,95,Female,Widowed,0.036324,0.580808,0.005356,0.306968,0.813618,Moderate Proneness
4,Megan Mendez,81,Female,Widowed,0.926727,0.484202,0.702405,0.736054,0.579448,High Proneness


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            5000 non-null   object 
 1   Age             5000 non-null   int64  
 2   Gender          5000 non-null   object 
 3   Marital_Status  5000 non-null   object 
 4   Fatigue         4756 non-null   float64
 5   Slowing         4771 non-null   float64
 6   Pain            4758 non-null   float64
 7   Hygiene         5000 non-null   float64
 8   Movement        5000 non-null   float64
 9   Schizophrenia   5000 non-null   object 
dtypes: float64(5), int64(1), object(4)
memory usage: 390.8+ KB


In [None]:
# One-hot encode Gender into indicator columns (Gender_Female / Gender_Male for this dataset).
# NOTE(review): neither indicator appears in the feature list used for modelling further down - confirm that is intended.
# This cell is not idempotent: after it runs, `df` no longer has a 'Gender' column.
df = pd.get_dummies(df, columns=['Gender'])



In [None]:
# Percentage of missing entries per column, restricted to the columns that have any.
missing_share = df.isnull().sum().div(len(df)).mul(100)
nas = missing_share.loc[missing_share.gt(0)]
labels, values = nas.index, nas.values

# One pie slice per affected column.
missing_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[missing_pie])
fig.update_layout(title='Percentage of Missing Values by Column', template='plotly_dark')

fig.show()


In [None]:
# Number of rows per original Schizophrenia label (before the labels are mapped to 0/1).
target = df['Schizophrenia'].value_counts()

In [None]:
# Bar chart of the label counts: one bar per distinct Schizophrenia label.
class_bars = go.Bar(x=target.index, y=target.values)
fig = go.Figure(data=class_bars)

In [None]:
# Title and axis labels for the label-count bar chart.
# The original strings were misspelled ('schizophernia', 'Shizo') and the y axis was unlabelled.
fig.update_layout(
    title='Schizophrenia Status',
    xaxis=dict(title='Schizophrenia proneness'),
    yaxis=dict(title='Count'),
    template='plotly_dark'
)


fig.show()

In [None]:
# Binarise the five proneness labels:
#   0 <- Low / Moderate
#   1 <- High / Very High / Elevated
mapping = dict(
    [(label, 0) for label in ('Low Proneness', 'Moderate Proneness')]
    + [(label, 1) for label in ('High Proneness', 'Very High Proneness', 'Elevated Proneness')]
)

In [None]:
# Replace the textual labels with the 0/1 codes from `mapping`.
# Series.map turns any label that is missing from `mapping` into NaN without warning, so check first
# and fail loudly instead. This also catches an accidental second run of this cell, when the column
# already holds 0/1 and would otherwise be wiped to NaN.
_unmapped_labels = set(df['Schizophrenia'].dropna().unique()) - set(mapping)
if _unmapped_labels:
    raise ValueError(
        f"Schizophrenia labels without an entry in `mapping`: {sorted(_unmapped_labels, key=str)}"
    )
df['Schizophrenia'] = df['Schizophrenia'].map(mapping)


In [None]:
# Recount after the 0/1 mapping to inspect the class balance.
target = df['Schizophrenia'].value_counts()


In [None]:
target

Schizophrenia
1    4043
0     957
Name: count, dtype: int64

In [None]:
# Rename the label column to 'Target', the name the helper functions below read.
df = df.rename(columns={'Schizophrenia': 'Target'})

In [None]:
# Drop Name and Marital_Status; no later cell shown in this notebook reads them.
df = df.drop(columns=['Name','Marital_Status'])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            5000 non-null   int64  
 1   Fatigue        4756 non-null   float64
 2   Slowing        4771 non-null   float64
 3   Pain           4758 non-null   float64
 4   Hygiene        5000 non-null   float64
 5   Movement       5000 non-null   float64
 6   Target         5000 non-null   int64  
 7   Gender_Female  5000 non-null   bool   
 8   Gender_Male    5000 non-null   bool   
dtypes: bool(2), float64(5), int64(2)
memory usage: 283.3 KB


In [None]:
def fill_missing_values(df, columns=('Fatigue', 'Slowing', 'Pain'), target='Target'):
    """Fill NaNs in ``columns`` with the mean of the row's ``target`` class.

    For every column in ``columns`` the mean is computed separately for each
    distinct value of ``df[target]`` (NaNs are ignored by the mean) and used
    to fill the missing entries of the rows belonging to that class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target`` and every column listed in ``columns``.
    columns : iterable of str, optional
        Columns to impute. Defaults to the three columns this notebook imputes.
    target : str, optional
        Name of the class column used for grouping.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, row order and index as ``df``;
        the input frame is not modified. A value stays NaN when its whole
        class has no observed value for that column.

    Notes
    -----
    NOTE(review): the fill value depends on the label, so the imputed
    features carry label information. This cannot be reproduced for
    unlabeled data and can inflate evaluation metrics.
    """
    filled = df.copy()
    for col in columns:
        # transform('mean') broadcasts each class mean back onto the rows of that
        # class, aligned on the original index - no merge / helper columns needed.
        class_means = df.groupby(target)[col].transform('mean')
        filled[col] = df[col].fillna(class_means)
    return filled

In [None]:
# Impute the missing Fatigue / Slowing / Pain values.
# NOTE(review): fill_missing_values fills with per-Target-class means, so the label informs the imputed features.
df = fill_missing_values(df)

In [None]:
def balancer(df):
    """Return a class-balanced copy of ``df`` produced with SMOTE.

    The 'Target' column is split off as the label, every remaining column is
    passed to ``SMOTE(random_state=42).fit_resample`` as a feature, and the
    resampled features and labels are reassembled into one frame with
    'Target' as the last column. ``df`` itself is not modified.
    """
    features = df.drop(columns='Target')
    labels = df['Target']

    # Fixed random_state so the synthetic rows are the same on every run.
    oversampler = SMOTE(random_state=42)
    features_resampled, labels_resampled = oversampler.fit_resample(features, labels)

    # Rebuild a single frame: feature columns first, label appended last.
    balanced_df = pd.DataFrame(features_resampled, columns=features.columns)
    balanced_df['Target'] = labels_resampled
    return balanced_df

In [None]:
# Oversample with SMOTE so both classes have the same number of rows.
# NOTE(review): this runs before the train/test split further down, so the held-out rows
# include SMOTE-synthesised samples; metrics computed on that split are likely optimistic.
train = balancer(df)

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8086 entries, 0 to 8085
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            8086 non-null   int64  
 1   Fatigue        8086 non-null   float64
 2   Slowing        8086 non-null   float64
 3   Pain           8086 non-null   float64
 4   Hygiene        8086 non-null   float64
 5   Movement       8086 non-null   float64
 6   Gender_Female  8086 non-null   bool   
 7   Gender_Male    8086 non-null   bool   
 8   Target         8086 non-null   int64  
dtypes: bool(2), float64(5), int64(2)
memory usage: 458.1 KB


In [None]:
def neuro_features(df, seed=42):
    """Add simulated brain-volume columns to ``df`` in place and return it.

    Six columns are drawn from normal distributions with hard-coded
    (mean, std) pairs: ``{Schizophrenic,Comparison}_{Gray,White,CSF}_Volume``.
    Sixteen further columns (four regions x four measures) are constants
    derived from the same means plus a per-region offset.

    The draws depend only on ``seed`` and ``len(df)``. Every row receives both
    the "Schizophrenic" and the "Comparison" columns; no existing column of
    ``df`` (including 'Target') is read.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    seed : int, optional
        Seed of the private random generator. The default reproduces the
        values of the previous ``np.random.seed(42)`` implementation.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # A private generator yields the same stream as np.random.seed(seed) followed by
    # np.random.normal(...), but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(seed)
    n_samples = len(df)

    # (mean, std) of the whole-brain volumes per group and tissue type.
    volume_stats = {
        'Schizophrenic': {'Gray': (713, 77), 'White': (415, 56), 'CSF': (331, 54)},
        'Comparison': {'Gray': (810, 81), 'White': (441, 51), 'CSF': (321, 45)},
    }

    # Draw order matters for reproducibility: Schizophrenic Gray/White/CSF, then Comparison.
    for group, tissues in volume_stats.items():
        for tissue, (mean, std) in tissues.items():
            df[f'{group}_{tissue}_Volume'] = rng.normal(mean, std, n_samples)

    schizophrenic_gray_mean = volume_stats['Schizophrenic']['Gray'][0]
    schizophrenic_white_mean = volume_stats['Schizophrenic']['White'][0]
    comparison_gray_mean = volume_stats['Comparison']['Gray'][0]
    comparison_white_mean = volume_stats['Comparison']['White'][0]

    # region -> (gray offset, white offset) applied to the schizophrenic means.
    # Gray: decrease in gray matter volume in schizophrenic patients.
    # White: no significant difference in white matter volume.
    regional_offsets = {
        'Mediodorsal Thalamus': (-50, 0),
        'Occipitoparietal Cortex': (-20, -5),
        'Premotor Cortex': (-15, 0),
        'Inferolateral Temporal Lobe': (-10, 0),
    }

    # The regional columns are scalars broadcast to every row, not random draws.
    for region, (gray_offset, white_offset) in regional_offsets.items():
        df[f'{region}_Schizophrenic_Gray_Volume'] = schizophrenic_gray_mean + gray_offset
        df[f'{region}_Comparison_Gray_Volume'] = comparison_gray_mean
        df[f'{region}_Schizophrenic_White_Volume'] = schizophrenic_white_mean + white_offset
        df[f'{region}_Comparison_White_Volume'] = comparison_white_mean

    return df

In [None]:
def demo_features(df, seed=42):
    """Add simulated demographic columns to ``df`` in place and return it.

    Columns added (all derived from 'Age' plus random noise):

    * ``Family_History_of_Psychosis`` - random 0/1 flag.
    * ``Education`` - ``0.5 * Age`` plus N(0, 2) noise.
    * ``Social_Class`` - ``0.3 * Education + 0.2 * Age`` plus N(0, 1) noise.
    * ``Duration_of_Illness`` - ``0.1 * Age`` plus N(0, 5) noise.

    None of the columns reads 'Target'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Age' column. It is modified in place.
    seed : int, optional
        Seed of the private random generator. The default reproduces the
        values of the previous ``np.random.seed(42)`` implementation.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # Private generator: same stream as the legacy global functions after
    # np.random.seed(seed), without resetting the global random state.
    rng = np.random.RandomState(seed)
    n_samples = len(df)

    df['Family_History_of_Psychosis'] = rng.randint(0, 2, size=n_samples)  # Binary variable
    # Assuming education is somewhat correlated with age
    df['Education'] = 0.5 * df['Age'] + rng.normal(0, 2, n_samples)
    # Assuming social class is correlated with education and age
    df['Social_Class'] = 0.3 * df['Education'] + 0.2 * df['Age'] + rng.normal(0, 1, n_samples)
    # Assuming duration of illness is somewhat correlated with age.
    # NOTE(review): the noise (std 5) is large relative to 0.1 * Age, so negative durations can occur.
    df['Duration_of_Illness'] = 0.1 * df['Age'] + rng.normal(0, 5, n_samples)
    return df

In [None]:
# Append the simulated demographic and brain-volume columns.
# NOTE(review): both helpers generate their columns from 'Age' and/or random draws only; neither reads 'Target'.
train  = demo_features(train)
train = neuro_features(train)

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8086 entries, 0 to 8085
Data columns (total 35 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Age                                                     8086 non-null   int64  
 1   Fatigue                                                 8086 non-null   float64
 2   Slowing                                                 8086 non-null   float64
 3   Pain                                                    8086 non-null   float64
 4   Hygiene                                                 8086 non-null   float64
 5   Movement                                                8086 non-null   float64
 6   Gender_Female                                           8086 non-null   bool   
 7   Gender_Male                                             8086 non-null   bool   
 8   Target                                

In [None]:
# Gray-matter scatter coloured by Target, with white-matter and CSF volumes overlaid as extra traces.
# NOTE(review): the overlays share axes that are labelled as gray-matter volume.
axis_labels = {'Schizophrenic_Gray_Volume': 'Schizophrenic Gray Matter Volume',
               'Comparison_Gray_Volume': 'Comparison Gray Matter Volume'}

fig = px.scatter(
    train,
    x='Schizophrenic_Gray_Volume',
    y='Comparison_Gray_Volume',
    color='Target',
    opacity=0.7,
    title='Scatter Plot: Schizophrenic vs Comparison Gray Matter Volume',
    labels=axis_labels,
    color_continuous_scale=px.colors.sequential.Viridis,
)

# (x column, y column, marker colour, legend name) for each overlaid trace, in drawing order.
overlay_traces = (
    ('Schizophrenic_White_Volume', 'Comparison_White_Volume', 'red', 'White Matter Volume'),
    ('Schizophrenic_CSF_Volume', 'Comparison_CSF_Volume', 'green', 'CSF Volume'),
)
for x_col, y_col, marker_colour, trace_name in overlay_traces:
    fig.add_scatter(x=train[x_col], y=train[y_col],
                    mode='markers', marker=dict(color=marker_colour), name=trace_name)

fig.update_layout(showlegend=True)

fig.show()

In [None]:
# Pairwise scatter plots of the demographic columns, coloured by Target.
demographic_cols = ['Age', 'Education', 'Social_Class', 'Duration_of_Illness']

fig = px.scatter_matrix(
    train[demographic_cols + ['Target']],
    dimensions=demographic_cols,
    color='Target',
    title='Scatter Plot: Demographic Features vs. Target',
    labels={'Target': 'Schizophrenia'},
    color_continuous_scale=px.colors.diverging.Tealrose,
)

# Hide the diagonal panels (each variable against itself).
fig.update_traces(diagonal_visible=False)
fig.update_layout(height=800, width=800, showlegend=True)
fig.show()

In [None]:
class CustomStandardScaler:
    """Standardise selected DataFrame columns to zero mean and unit sample std.

    Statistics are learned with ``fit`` / ``fit_transform`` and re-applied to
    other frames with ``transform``. The standard deviation is pandas'
    default sample standard deviation (``DataFrame.std()``).

    Attributes
    ----------
    columns : list of str
        Columns to scale; all other columns are passed through unchanged.
    means, stds : pandas.Series or None
        Per-column statistics learned by ``fit``; ``None`` until fitted.
    """

    def __init__(self, columns):
        self.columns = columns
        self.means = None
        self.stds = None

    def fit(self, df):
        """Learn the per-column mean and standard deviation from ``df``; returns ``self``."""
        self.means = df[self.columns].mean()
        self.stds = df[self.columns].std()
        return self

    def fit_transform(self, df):
        """Fit on ``df`` and return a scaled copy of it."""
        return self.fit(df).transform(df)

    def transform(self, df):
        """Return a copy of ``df`` with ``columns`` scaled by the fitted statistics.

        Raises
        ------
        RuntimeError
            If the scaler has not been fitted yet.
        """
        if self.means is None or self.stds is None:
            raise RuntimeError(
                "CustomStandardScaler is not fitted; call fit() or fit_transform() first."
            )
        # A constant column has std 0; dividing by it would give NaN/inf.
        # Divide by 1 instead, so such a column is only centred.
        scale = self.stds.where(self.stds != 0, 1.0)
        df_scaled = df.copy()
        df_scaled[self.columns] = (df[self.columns] - self.means) / scale
        return df_scaled

In [None]:
# Columns to standardise: the five symptom scores followed by the simulated demographic columns.
non_norm = (
    ['Fatigue', 'Slowing', 'Pain', 'Hygiene', 'Movement']
    + ['Education', 'Social_Class', 'Duration_of_Illness']
)

In [None]:
# Standardise the columns listed in `non_norm`; all other columns are left as they are.
# NOTE(review): the scaler is fitted on all of `train`, i.e. before the train/test split below.
scaler = CustomStandardScaler(columns=non_norm)
train = scaler.fit_transform(train)

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8086 entries, 0 to 8085
Data columns (total 35 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Age                                                     8086 non-null   int64  
 1   Fatigue                                                 8086 non-null   float64
 2   Slowing                                                 8086 non-null   float64
 3   Pain                                                    8086 non-null   float64
 4   Hygiene                                                 8086 non-null   float64
 5   Movement                                                8086 non-null   float64
 6   Gender_Female                                           8086 non-null   bool   
 7   Gender_Male                                             8086 non-null   bool   
 8   Target                                

**MODEL BUILDING**









In [None]:
# Feature matrix and label vector as NumPy arrays.
# NOTE(review): 'Age' and the two volume columns are not in `non_norm`, so they enter the model unscaled.
X = train.loc[:, [
    'Age', 'Fatigue', 'Slowing', 'Pain', 'Hygiene', 'Movement',
    'Education', 'Social_Class', 'Duration_of_Illness',
    'Schizophrenic_White_Volume', 'Schizophrenic_Gray_Volume',
]].values
y = train.loc[:, 'Target'].values

def train_test_split(X, y, random_state=42, test_size=0.2):
    """Shuffle ``X`` / ``y`` together and split them into train and test parts.

    The split is a plain random shuffle (not stratified). The first
    ``int(n_samples * test_size)`` shuffled rows form the test set and the
    remainder the training set.

    Parameters
    ----------
    X : numpy.ndarray
        Feature array; rows are samples.
    y : numpy.ndarray
        Labels, one per row of ``X``.
    random_state : int, optional
        Seed for the shuffle.
    test_size : float, optional
        Fraction of the samples placed in the test set.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )

    # Private generator: same permutation as np.random.seed(random_state) followed by
    # np.random.permutation(...), without resetting the global random state.
    rng = np.random.RandomState(random_state)
    shuffled_indices = rng.permutation(np.arange(n_samples))

    n_test = int(n_samples * test_size)  # truncates towards zero
    test_indices = shuffled_indices[:n_test]
    train_indices = shuffled_indices[n_test:]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

In [None]:
# Split with the defaults (random_state=42, test_size=0.2) of the train_test_split defined in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X,y)

**Logistic Regression Model**

In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Parameters
    ----------
    z : float or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid; a scalar for scalar input, otherwise an array
        with the shape of ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so it can never overflow, unlike exp(-z) for very negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) - the same function, rearranged.
    sigmoid_result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves n-d arrays untouched.
    return sigmoid_result[()]

In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Weights start at zero, the loss is the mean binary cross-entropy, and
    every iteration takes one gradient step of size ``learning_rate``.

    Attributes
    ----------
    learning_rate : float
        Step size of the gradient descent update.
    W : numpy.ndarray
        Weight vector, one entry per feature (set by ``fit`` / ``load_model``).
    b : float
        Bias term (set by ``fit`` / ``load_model``).
    """

    def __init__(self, learning_rate=0.00000001):
        # No random state is needed: the parameters are initialised to zeros.
        self.learning_rate = learning_rate

    def initialize_parameter(self):
        """Reset the weights and bias to zero for the training data in ``self.X``."""
        self.W = np.zeros(self.X.shape[1])
        self.b = 0.0

    def forward(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        Z = np.matmul(X, self.W) + self.b
        return sigmoid(Z)

    def compute_cost(self, predictions):
        """Mean binary cross-entropy of ``predictions`` against ``self.y``."""
        m = self.X.shape[0]
        eps = 1e-8  # keeps log() finite when a prediction is exactly 0 or 1
        positive_term = -np.log(predictions + eps) * self.y
        negative_term = -np.log(1 - predictions + eps) * (1 - self.y)
        return np.sum(positive_term + negative_term) / m

    def compute_gradient(self, predictions):
        """Store the loss gradients w.r.t. weights (``self.dW``) and bias (``self.db``)."""
        m = self.X.shape[0]
        error = predictions - self.y
        self.dW = np.matmul(self.X.T, error) / m
        self.db = np.sum(error) / m

    def fit(self, X, y, iterations, plot_cost=True):
        """Train on ``X`` / ``y`` for a fixed number of gradient steps.

        Parameters
        ----------
        X : array-like
            Training features; rows are samples.
        y : array-like
            Binary labels (0/1), one per row of ``X``.
        iterations : int
            Number of gradient descent steps.
        plot_cost : bool, optional
            When true, show a plotly line chart of the cost per iteration.
        """
        self.X = np.asarray(X, dtype=float)
        # Flatten so a column-vector y cannot broadcast against the 1-D predictions.
        self.y = np.asarray(y).reshape(-1)
        self.initialize_parameter()
        costs = []
        for i in range(iterations):
            predictions = self.forward(self.X)
            cost = self.compute_cost(predictions)
            costs.append(cost)
            self.compute_gradient(predictions)
            self.W = self.W - self.learning_rate * self.dW
            self.b = self.b - self.learning_rate * self.db
            if i % 10000 == 0:
                print("Cost after iteration {}: {}".format(i, cost))
        if plot_cost:
            fig = px.line(y=costs, title="Cost vs Iteration", template="plotly_dark")
            fig.update_layout(
                title_font_color="#41BEE9",
                xaxis=dict(color="#41BEE9", title="Iterations"),
                yaxis=dict(color="#41BEE9", title="cost")
            )
            fig.show()

    def predict(self, X):
        """Return hard 0.0/1.0 predictions obtained by rounding the probabilities."""
        predictions = self.forward(X)
        return np.round(predictions)

    def save_model(self, filename=None):
        """Pickle the learning rate, weights and bias to ``filename``.

        Raises
        ------
        TypeError
            If ``filename`` is not given.
        """
        if filename is None:
            raise TypeError("save_model() requires a filename")
        model_data = {
            'learning_rate': self.learning_rate,
            'W': self.W,
            'b': self.b
        }
        with open(filename, 'wb') as file:
            pickle.dump(model_data, file)

    @classmethod
    def load_model(cls, filename):
        """Create a model from a file written by ``save_model``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load files
        from a trusted source.
        """
        with open(filename, 'rb') as file:
            model_data = pickle.load(file)
        loaded_model = cls(model_data['learning_rate'])
        loaded_model.W = model_data['W']
        loaded_model.b = model_data['b']
        return loaded_model

In [None]:
# Override the constructor's default learning rate (0.00000001) with 0.00001.
lg = LogisticRegression(learning_rate=0.00001)

In [None]:
# Train for 300000 gradient steps; fit() prints the cost every 10000 iterations and then plots the cost curve.
lg.fit(X=X_train, y=y_train, iterations=300000)

Cost after iteration 0: 0.6931471605599454
Cost after iteration 10000: 0.671570238082864
Cost after iteration 20000: 0.6517005344347452
Cost after iteration 30000: 0.6330421739058439
Cost after iteration 40000: 0.6155128752697887
Cost after iteration 50000: 0.599034066115614
Cost after iteration 60000: 0.5835311964605405
Cost after iteration 70000: 0.5689340380866917
Cost after iteration 80000: 0.5551768256708499
Cost after iteration 90000: 0.5421982718734211
Cost after iteration 100000: 0.5299414863583426
Cost after iteration 110000: 0.518353824985495
Cost after iteration 120000: 0.5073866911657201
Cost after iteration 130000: 0.4969953071333445
Cost after iteration 140000: 0.48713846899907104
Cost after iteration 150000: 0.4777782960622111
Cost after iteration 160000: 0.4688799820385193
Cost after iteration 170000: 0.4604115535820024
Cost after iteration 180000: 0.45234363969152386
Cost after iteration 190000: 0.4446492542252257
Cost after iteration 200000: 0.4373035927242076
Cost af

In [None]:
class ClassificationMetrics:
    """Binary-classification metrics for 0/1 labels.

    Every metric is a static method taking ``(y_true, y_pred)`` as array-likes
    of equal length and returning a Python ``float``. Inputs are flattened
    before comparison. A metric whose denominator is zero returns ``0.0``.
    """

    @staticmethod
    def _flatten(y_true, y_pred):
        """Return both inputs as 1-D arrays so they compare element-wise."""
        return np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()

    @staticmethod
    def _ratio(numerator, denominator):
        """``numerator / denominator`` as a float, or 0.0 when the denominator is zero."""
        return float(numerator / denominator) if denominator else 0.0

    @staticmethod
    def accuracy(y_true, y_pred):
        """Fraction of predictions equal to the true label."""
        y_true, y_pred = ClassificationMetrics._flatten(y_true, y_pred)
        correct_predictions = np.sum(y_true == y_pred)
        return ClassificationMetrics._ratio(correct_predictions, len(y_true))

    @staticmethod
    def precision(y_true, y_pred):
        """TP / (TP + FP): share of predicted positives that are truly positive."""
        y_true, y_pred = ClassificationMetrics._flatten(y_true, y_pred)
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        false_positives = np.sum((y_true == 0) & (y_pred == 1))
        return ClassificationMetrics._ratio(true_positives, true_positives + false_positives)

    @staticmethod
    def recall(y_true, y_pred):
        """TP / (TP + FN): share of true positives that were predicted positive."""
        y_true, y_pred = ClassificationMetrics._flatten(y_true, y_pred)
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        false_negatives = np.sum((y_true == 1) & (y_pred == 0))
        return ClassificationMetrics._ratio(true_positives, true_positives + false_negatives)

    @staticmethod
    def f1_score(y_true, y_pred):
        """Harmonic mean of precision and recall."""
        precision_value = ClassificationMetrics.precision(y_true, y_pred)
        recall_value = ClassificationMetrics.recall(y_true, y_pred)
        return ClassificationMetrics._ratio(
            2 * (precision_value * recall_value), precision_value + recall_value
        )

In [None]:
# Alias the logistic-regression object under the name used by the evaluation cell.
model = lg

In [None]:
# Score the model on the held-out split.
# NOTE(review): X_test was split off after SMOTE oversampling and after scaling on the full frame,
# so these numbers are likely optimistic.
y_pred = model.predict(X_test)

# Evaluate the four metrics in a fixed order and bind each result to its own name.
accuracy, precision, recall, f1_score = (
    metric(y_test, y_pred)
    for metric in (
        ClassificationMetrics.accuracy,
        ClassificationMetrics.precision,
        ClassificationMetrics.recall,
        ClassificationMetrics.f1_score,
    )
)

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1-Score: {f1_score:.2%}")

Accuracy: 93.51%
Precision: 99.86%
Recall: 87.19%
F1-Score: 93.10%
