# **Analysis of Mental Health Data Nepal**


## Importing Necessary Libraries

In [2]:
import pandas as pd  # Importing pandas library for data manipulation
import numpy as np  # Importing numpy library for numerical operations and array handling
import plotly.express as px  # Importing plotly express for interactive plotting and visualization
from sklearn.preprocessing import LabelEncoder  # Importing LabelEncoder for encoding categorical variables
from sklearn.model_selection import train_test_split  # Importing train_test_split for splitting data into training and testing sets
from sklearn.ensemble import RandomForestClassifier  # Importing RandomForestClassifier for classification using ensemble method
from sklearn.linear_model import LogisticRegression  # Importing LogisticRegression for logistic regression classification
from sklearn.neighbors import KNeighborsClassifier  # Importing KNeighborsClassifier for K-nearest neighbors classification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Importing metrics for evaluating classification models.


In [3]:
url = '/content/drive/MyDrive/GroupAssignmentDataAnalyticsandML/Mental_health_dataset.csv'
df = pd.read_csv(url)


## Data Exploration

In [4]:
df.head()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,Female,Age,Married,Unmarried,Education,Employment,lat,long
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Dipression,26,24,19,27,23,"Some College, short continuing education or eq...",no,27.618589,87.856661
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26,,yes,27.618589,87.856661
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety (Neurosis),24,32,21,37,19,#####,yes,27.618589,87.856661
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43,"College degree, bachelor, master",yes,27.618589,87.856661
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Conversive disorder (Hysteria),49,29,60,45,33,#####,retired,27.618589,87.856661


In [5]:
df.tail()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,Female,Age,Married,Unmarried,Education,Employment,lat,long
2620,2620,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Minor,Anxiety (Neurosis),47,41,53,47,41,"College degree, bachelor, master",yes,29.892711,80.741361
2621,2621,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Mental retardation,18,22,38,24,16,"College degree, bachelor, master",no,29.892711,80.741361
2622,2622,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Conversive disorder (Hysteria),27,30,18,5,52,Up to 12 years of school,no,29.892711,80.741361
2623,2623,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Normal,Alcoholism,55,36,60,33,58,"College degree, bachelor, master",#####,29.892711,80.741361
2624,2624,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Normal,Epilesy,39,30,24,48,21,"College degree, bachelor, master",yes,29.892711,80.741361


In [6]:
df.info() #finding basic information of all columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2625 entries, 0 to 2624
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   S.N                 2625 non-null   int64  
 1   District_Name       2625 non-null   object 
 2   Zone                2625 non-null   object 
 3   Ecological_Belt     2625 non-null   object 
 4   Development_Region  2625 non-null   object 
 5   Year_BS             2625 non-null   int64  
 6   Year_AD             2625 non-null   int64  
 7   condition           2625 non-null   object 
 8   type                2625 non-null   object 
 9   Male                2625 non-null   int64  
 10  Female              2625 non-null   int64  
 11  Age                 2625 non-null   int64  
 12  Married             2625 non-null   int64  
 13  Unmarried           2625 non-null   int64  
 14  Education           2625 non-null   object 
 15  Employment          2625 non-null   object 
 16  lat   

In [7]:
df.describe()

Unnamed: 0,S.N,Year_BS,Year_AD,Male,Female,Age,Married,Unmarried,lat,long
count,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0
mean,1312.0,2071.862857,2014.862857,39.217143,38.765333,38.762286,37.357333,40.625143,28.003789,84.254905
std,757.916552,2.568618,2.568618,12.639741,12.423508,12.513433,13.006108,21.568092,0.885817,2.170864
min,0.0,2069.0,2012.0,18.0,18.0,18.0,1.0,-6.0,26.571975,80.38209
25%,656.0,2070.0,2013.0,28.0,28.0,28.0,27.0,25.0,27.21832,82.359404
50%,1312.0,2071.0,2014.0,39.0,38.0,39.0,38.0,40.0,27.948879,84.235742
75%,1968.0,2074.0,2017.0,51.0,50.0,50.0,48.0,55.0,28.692376,86.00722
max,2624.0,2077.0,2020.0,60.0,60.0,60.0,60.0,108.0,30.035133,87.955321


In [8]:
df.dtypes

S.N                     int64
District_Name          object
Zone                   object
Ecological_Belt        object
Development_Region     object
Year_BS                 int64
Year_AD                 int64
condition              object
type                   object
Male                    int64
Female                  int64
Age                     int64
Married                 int64
Unmarried               int64
Education              object
Employment             object
lat                   float64
long                  float64
dtype: object

In [9]:
total_counties = df['District_Name'].nunique()
print("Total Counties Studied:", total_counties)

Total Counties Studied: 75


In [10]:
missing_values = df.isnull().sum()

print("Number of missing values in the columns:")
print(missing_values)

Number of missing values in the columns:
S.N                   0
District_Name         0
Zone                  0
Ecological_Belt       0
Development_Region    0
Year_BS               0
Year_AD               0
condition             0
type                  0
Male                  0
Female                0
Age                   0
Married               0
Unmarried             0
Education             0
Employment            0
lat                   0
long                  0
dtype: int64


In [11]:
#checking unique values of each columns to find out possible values that is missing or that needs to be cleaned
for column in df.columns:
    unique_values = df[column].unique()
    print(f'Unique values in {column} column:')
    print(unique_values)
    print()

Unique values in S.N column:
[   0    1    2 ... 2622 2623 2624]

Unique values in District_Name column:
['Taplejung' 'Panchthar' 'Ilam' 'Jhapa' 'Morang' 'Sunsari' 'Dhankuta'
 'Teharthum' 'Sankhuwasabha' 'Bhojpur' 'Solukhumbu' 'Okhaldhunga'
 'Khotang' 'Udaypur' 'Saptari' 'Siraha' 'Dhanusha' 'Mahottari' 'Sarlahi'
 'Sindhuli' 'Ramechhap' 'Dolkha' 'Sindhupalchowk' 'Kavre' 'Lalitpur'
 'Bhaktapur' 'Kathmandu' 'Nuwakot' 'Rasuwa' 'Dhading' 'Makawanpur'
 'Rautahat' 'Bara' 'Parsa' 'Chitwan' 'Gorkha' 'Lamjung' 'Tanahu' 'Syangja'
 'Kaski' 'Manang' 'Mustang' 'Myagdi' 'Parbat' 'Baglung' 'Gulmi' 'Palpa'
 'Nawalparasi' 'Rupandehi' 'Kapilvastu' 'Arghakhanchi' 'Pyuthan' 'Rolpa'
 'Rukum' 'Salyan' 'Dang' 'Banke' 'Bardiya' 'Surkhet' 'Dailekh' 'Jajarkot'
 'Dolpa' 'Jumla' 'Kalikot' 'Mugu' 'Humla' 'Bajura' 'Bajhang' 'Achham'
 'Doti' 'Kailali' 'Kanchanpur' 'Dadeldhura' 'Baitadi' 'Darchula']

Unique values in Zone column:
['Mechi' 'Kosi' 'Sagarmatha' 'Janakpur' 'Bagmati' 'Narayani' 'Gandaki'
 'Dhawalagiri' 'Lu

## Data Preprocessing

In [12]:
#correcting spellings
df['type'] = df['type'].str.replace('Dipression', 'Depression')
df['type'] = df['type'].str.replace('Epilesy', 'Epilepsy')

In [13]:
# Renaming specific types of disorders for consistency and clarity:
df['type'] = df['type'].replace('Anxiety (Neurosis)', 'Anxiety') # Replacing 'Anxiety (Neurosis)' with 'Anxiety'
df['type'] = df['type'].replace('Conversive disorder (Hysteria)','Hysteria')# Replacing 'Conversive disorder (Hysteria)' with 'Hysteria'

Handling Missing Values

In [14]:
df['Education'] = df['Education'].replace(['#####','######','#######' , '####', 'na/na','Na/na'], np.nan)
df['Education'] = df['Education'].replace('none', 'None')
nan_count_Education = df['Education'].isna().sum()
print(nan_count_Education)

177


In [15]:
df['Employment'] = df['Employment'].replace(['####', '######' ,'#######','#####'], np.nan)
nan_count_Employment= df['Employment'].isna().sum()
print(nan_count_Employment)

360


In [16]:
df.fillna('Unknown', inplace = True)

Handling Unrealistic Values

In [17]:
#finding unrealistic values in Unmarried column

filtered_df = df[df['Unmarried']<0]
print("Unrealistic Values in Unmarried Column (Unmarried < 0):")
print("\nS.N   Unmarried")
print(filtered_df['Unmarried'])

Unrealistic Values in Unmarried Column (Unmarried < 0):

S.N   Unmarried
37     -2
97     -6
822    -3
1159   -2
1612   -2
1672   -6
2397   -3
Name: Unmarried, dtype: int64


In [18]:
#Replacing negative values of unmarried column with mean value of the entire "Unmarried" column.
df['Unmarried'] = df['Unmarried'].apply(lambda x: x if x >= 0 else df['Unmarried'].mean())

Checking Duplicate Values

In [19]:
#finding out duplicate rows if any
duplicate_rows = df.duplicated()
print("Duplicate rows:")
print(df[duplicate_rows])

Duplicate rows:
Empty DataFrame
Columns: [S.N, District_Name, Zone, Ecological_Belt, Development_Region, Year_BS, Year_AD, condition, type, Male, Female, Age, Married, Unmarried, Education, Employment, lat, long]
Index: []


In [20]:
df.head()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,Female,Age,Married,Unmarried,Education,Employment,lat,long
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Depression,26,24,19,27,23.0,"Some College, short continuing education or eq...",no,27.618589,87.856661
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26.0,,yes,27.618589,87.856661
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety,24,32,21,37,19.0,Unknown,yes,27.618589,87.856661
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43.0,"College degree, bachelor, master",yes,27.618589,87.856661
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Hysteria,49,29,60,45,33.0,Unknown,retired,27.618589,87.856661


In [21]:
bins = [0,20,40,float('inf')]
labels = ['young', 'Middle-age', 'old']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
df.head()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,Female,Age,Married,Unmarried,Education,Employment,lat,long,Age_Group
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Depression,26,24,19,27,23.0,"Some College, short continuing education or eq...",no,27.618589,87.856661,young
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26.0,,yes,27.618589,87.856661,old
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety,24,32,21,37,19.0,Unknown,yes,27.618589,87.856661,Middle-age
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43.0,"College degree, bachelor, master",yes,27.618589,87.856661,Middle-age
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Hysteria,49,29,60,45,33.0,Unknown,retired,27.618589,87.856661,old


In [22]:
#Checking number of types in type column
numberOfTypes = df['type'].nunique()
print("Unique values in type column:")
print(df['type'].unique())
print("Total number of types :", numberOfTypes)

Unique values in type column:
['Depression' 'Psychosis' 'Anxiety' 'Mental retardation' 'Hysteria'
 'Alcoholism' 'Epilepsy']
Total number of types : 7


In [23]:
district_distribution = df['District_Name'].value_counts()
print("Frequency distribution of mental health phenomena by district:\n")
print(district_distribution)

Frequency distribution of mental health phenomena by district:

Taplejung         35
Banke             35
Salyan            35
Rukum             35
Rolpa             35
                  ..
Lalitpur          35
Kavre             35
Sindhupalchowk    35
Dolkha            35
Darchula          35
Name: District_Name, Length: 75, dtype: int64


In [24]:
max_district = district_distribution.idxmax()
max_count = district_distribution.max()

print("\nThe district with the highest number of mental health phenomena is:", max_district)
print("Number of mental health phenomena in this district:", max_count)


The district with the highest number of mental health phenomena is: Taplejung
Number of mental health phenomena in this district: 35


In [25]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the condition columnD
df['encoded_Conditions'] = label_encoder.fit_transform(df['condition'])
df['encoded_Types'] = label_encoder.fit_transform(df['type'])
df['encoded_Education'] = label_encoder.fit_transform(df['Education'])
df['encoded_Employment'] = label_encoder.fit_transform(df['Employment'])

df.head()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,...,Unmarried,Education,Employment,lat,long,Age_Group,encoded_Conditions,encoded_Types,encoded_Education,encoded_Employment
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Depression,26,...,23.0,"Some College, short continuing education or eq...",no,27.618589,87.856661,young,3,2,3,1
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,...,26.0,,yes,27.618589,87.856661,old,3,6,1,3
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety,24,...,19.0,Unknown,yes,27.618589,87.856661,Middle-age,0,1,4,3
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,...,43.0,"College degree, bachelor, master",yes,27.618589,87.856661,Middle-age,0,5,0,3
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Hysteria,49,...,33.0,Unknown,retired,27.618589,87.856661,old,0,4,4,2


## Visualization

In [26]:
condition_counts = df['condition'].value_counts().reset_index()

# Plotting pie chart
fig = px.pie(condition_counts, values='condition', names='index', title='Distribution of Conditions')
fig.show()

In [27]:
# Box Plot

fig = px.box(df, x='Employment', y='Age', title='Age distribution by Employment')
fig.show()

In [28]:
# Box Plot

fig = px.box(df, x='type', y='Age', color='condition', title='Age distribution by Mental Health type and their conditions')
fig.show()

In [29]:
# Grouping by education level and counting occurrences
education_counts = df['Education'].value_counts().reset_index()

# Plotting pie chart
fig = px.pie(education_counts, values='Education', names='index', title='Distribution of Education Levels')
fig.show()

In [30]:
time_series = df.groupby(['Year_AD', 'condition']).size().reset_index(name='counts')
fig = px.line(time_series, x='Year_AD', y='counts', color='condition', title='Mental Health Condition Over Time')
fig.show()

In [31]:
# Histogram
fig = px.histogram(df, x='Age', color='condition', title='Distribution of Age among Mental Health Conditions', nbins=100)
fig.show()


In [32]:
fig = px.scatter(df, x='Education', y='Age', color='type', title='Scatter Plot of Age and Education')
fig.show()

In [33]:
depression_data = df[(df['type'] == 'Depression') & (df['Year_AD'] == 2012)]
fig = px.scatter_mapbox(depression_data, lat='lat', lon='long',
                    zoom=5,
                    mapbox_style="open-street-map",
                    color='condition',
                    hover_name='District_Name',
                    title='Condition of Depression by District in 2012 AD')

fig.show()

In [34]:
# Filter depression_data for each condition
severe_depression = depression_data[depression_data['condition'] == 'Severe']
normal_depression = depression_data[depression_data['condition'] == 'Normal']
major_depression = depression_data[depression_data['condition'] == 'Major']
minor_depression = depression_data[depression_data['condition'] == 'Minor']

# Count the unique districts for each condition
num_severe = len(severe_depression['District_Name'].unique())
num_normal = len(normal_depression['District_Name'].unique())
num_major = len(major_depression['District_Name'].unique())
num_minor = len(minor_depression['District_Name'].unique())



# Print the results
print("Condition of Depression by District in 2012 AD")
print("Number of districts with severe depression:", num_severe)
print("Number of districts with normal depression:", num_normal)
print("Number of districts with major depression:", num_major)
print("Number of districts with minor depression:", num_minor)


Condition of Depression by District in 2012 AD
Number of districts with severe depression: 20
Number of districts with normal depression: 18
Number of districts with major depression: 24
Number of districts with minor depression: 13


In [35]:
depression_data_2020 = df[(df['type'] == 'Depression') & (df['Year_AD'] == 2020)]
fig = px.scatter_mapbox(depression_data_2020, lat='lat', lon='long',
                    zoom=5,
                    mapbox_style="open-street-map",
                    color='condition',
                    hover_name='District_Name',
                    title='Condition of Depression by District in 2020 AD')

fig.show()

In [36]:
districts_depression_2020 = depression_data_2020['District_Name'].nunique()
print("Total number of Districts suffering from depression in 2020 AD =", districts_depression_2020)

Total number of Districts suffering from depression in 2020 AD = 15


In [37]:
df.to_csv('/content/drive/MyDrive/GroupAssignmentDataAnalyticsandML/Cleaned_Mental_HealthData.csv', index=False)

## Correlation Matrix

In [38]:
corr_matrix = df.corr()
condition_correlation = corr_matrix['encoded_Conditions']

# Print the correlation values
print(condition_correlation)

S.N                   0.018953
Year_BS               0.020503
Year_AD               0.020503
Male                 -0.005553
Female                0.004533
Age                  -0.024765
Married              -0.007749
Unmarried             0.002819
lat                   0.033840
long                 -0.032860
encoded_Conditions    1.000000
encoded_Types        -0.014775
encoded_Education     0.007947
encoded_Employment   -0.013090
Name: encoded_Conditions, dtype: float64






In [39]:
heatmap_fig = px.imshow(
    corr_matrix,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
)

heatmap_fig.update_layout(title="Correlation Matrix")
heatmap_fig.show()

## Train-Test Split

In [40]:

X=df[['Age','encoded_Types','encoded_Education','encoded_Employment',
      'Male','Female','Married','Unmarried']]
y=df["condition"]

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



## Random Forest Model

In [41]:
# Creating and training the Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)



In [42]:
# Making predictions on the test set
y_pred = random_forest_model.predict(X_test)
y_pred

array(['Major', 'Severe', 'Major', 'Minor', 'Severe', 'Major', 'Normal',
       'Normal', 'Severe', 'Major', 'Severe', 'Severe', 'Severe', 'Major',
       'Major', 'Minor', 'Minor', 'Severe', 'Normal', 'Severe', 'Minor',
       'Normal', 'Severe', 'Minor', 'Severe', 'Normal', 'Major', 'Severe',
       'Normal', 'Normal', 'Severe', 'Minor', 'Minor', 'Normal', 'Major',
       'Normal', 'Normal', 'Major', 'Severe', 'Major', 'Normal', 'Minor',
       'Severe', 'Major', 'Normal', 'Severe', 'Normal', 'Major', 'Normal',
       'Minor', 'Severe', 'Minor', 'Minor', 'Major', 'Normal', 'Normal',
       'Minor', 'Major', 'Normal', 'Normal', 'Severe', 'Normal', 'Minor',
       'Severe', 'Severe', 'Normal', 'Normal', 'Normal', 'Severe',
       'Minor', 'Severe', 'Minor', 'Severe', 'Severe', 'Major', 'Major',
       'Severe', 'Normal', 'Minor', 'Major', 'Severe', 'Normal', 'Normal',
       'Normal', 'Minor', 'Severe', 'Normal', 'Normal', 'Major', 'Minor',
       'Major', 'Minor', 'Minor', 'Major', 'M

In [43]:
# Create a DataFrame for visualization
viz_df = X_test.copy()
viz_df['Predicted_Conditions'] = y_pred

# Create a histogram to visualize the distribution of predicted conditions
fig = px.histogram(viz_df, x='Predicted_Conditions',nbins= 7, title='Distribution of Predicted Conditions By Random Forest Classifier')
fig.show()

In [44]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Displaying additional evaluation metrics
print("\nClassification Report(Random Forest):\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

Accuracy: 0.7561904761904762

Classification Report(Random Forest):
               precision    recall  f1-score   support

       Major       0.70      0.74      0.72       135
       Minor       0.82      0.74      0.78       126
      Normal       0.83      0.76      0.79       145
      Severe       0.69      0.79      0.74       119

    accuracy                           0.76       525
   macro avg       0.76      0.76      0.76       525
weighted avg       0.76      0.76      0.76       525


Confusion Matrix:
[[100  10   7  18]
 [ 13  93   8  12]
 [ 14   9 110  12]
 [ 15   2   8  94]]


In [45]:
fig = px.imshow(conf_matrix,
                labels=dict(x="Predicted", y="Actual", color="Count"),
                x=['Major', 'Minor', 'Normal', 'Severe'],
                y=['Major', 'Minor', 'Normal', 'Severe'],title="Confusion Matrix (Random Forest)")

# Show the plot
fig.show()

In [46]:
cr_data_randomforest= {
    "Class": ["Major", "Minor", "Normal", "Severe"],
    "Precision": [0.70, 0.82, 0.83, 0.69],
    "Recall": [0.74, 0.74, 0.76, 0.79],
    "F1-score": [0.72, 0.78, 0.79, 0.74]
}

df_metrics = pd.DataFrame(cr_data_randomforest)

fig = px.bar(df_metrics, x='Class', y=['Precision', 'Recall', 'F1-score'],
             title='Visualization of Random Forest Classification Report',
             labels={'value': 'Score', 'variable': 'Metric', 'Class': 'Class'},
             barmode='group')

fig.show()

## Logistic Regression Model

In [47]:

logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

In [48]:
y_pred = logistic_model.predict(X_test)
y_pred

array(['Severe', 'Normal', 'Major', 'Minor', 'Minor', 'Normal', 'Severe',
       'Severe', 'Major', 'Severe', 'Normal', 'Severe', 'Major', 'Minor',
       'Major', 'Minor', 'Minor', 'Minor', 'Normal', 'Severe', 'Normal',
       'Severe', 'Severe', 'Severe', 'Normal', 'Severe', 'Normal',
       'Major', 'Minor', 'Normal', 'Severe', 'Minor', 'Minor', 'Normal',
       'Severe', 'Minor', 'Normal', 'Major', 'Severe', 'Major', 'Major',
       'Normal', 'Major', 'Major', 'Minor', 'Severe', 'Major', 'Minor',
       'Major', 'Severe', 'Minor', 'Major', 'Severe', 'Minor', 'Normal',
       'Severe', 'Minor', 'Major', 'Normal', 'Normal', 'Minor', 'Minor',
       'Major', 'Major', 'Severe', 'Severe', 'Major', 'Major', 'Major',
       'Major', 'Severe', 'Normal', 'Severe', 'Severe', 'Normal', 'Minor',
       'Severe', 'Major', 'Minor', 'Minor', 'Minor', 'Normal', 'Major',
       'Major', 'Major', 'Severe', 'Major', 'Severe', 'Minor', 'Minor',
       'Major', 'Minor', 'Minor', 'Severe', 'Major', 'Min

In [49]:
# Create a DataFrame for visualization
viz_df = X_test.copy()
viz_df['Predicted_Conditions'] = y_pred

# Create a histogram to visualize the distribution of predicted conditions
fig = px.histogram(viz_df, x='Predicted_Conditions',nbins= 7, title='Distribution of Predicted Conditions By Logistic regression')
fig.show()

In [50]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Additional evaluation metrics
print("\nClassification Report(Logistic Regresssion):")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

Accuracy: 0.30095238095238097

Classification Report(Logistic Regresssion):
              precision    recall  f1-score   support

       Major       0.29      0.30      0.29       135
       Minor       0.34      0.36      0.35       126
      Normal       0.37      0.23      0.28       145
      Severe       0.24      0.34      0.28       119

    accuracy                           0.30       525
   macro avg       0.31      0.30      0.30       525
weighted avg       0.31      0.30      0.30       525


Confusion Matrix:
[[40 31 20 44]
 [25 45 21 35]
 [33 34 33 45]
 [39 24 16 40]]


In [51]:
fig = px.imshow(conf_matrix,
                labels=dict(x="Predicted", y="Actual", color="Count"),
                x=['Major', 'Minor', 'Normal', 'Severe'],
                y=['Major', 'Minor', 'Normal', 'Severe'],title="Confusion Matrix (logistic Regression)")

# Show the plot
fig.show()

In [52]:
cr_data_logistic_regression = {
    "Class": ["Major", "Minor", "Normal", "Severe"],
    "Precision": [0.29, 0.34, 0.37, 0.24],
    "Recall": [0.30, 0.36, 0.23, 0.34],
    "F1-score": [0.29, 0.35, 0.28, 0.28]
}

df_metrics_logistic_regression = pd.DataFrame(cr_data_logistic_regression)

fig_lr = px.bar(df_metrics_logistic_regression, x='Class', y=['Precision', 'Recall', 'F1-score'],
             title='Visualization of Logistic Regression Classification Report ',
             labels={'value': 'Score', 'variable': 'Metric', 'Class': 'Class'},
             barmode='group')

fig_lr.show()


## K-Nearest Neighbor Model

In [53]:

KNN_model = KNeighborsClassifier(n_neighbors=5)

# Training the KNN model
KNN_model.fit(X_train, y_train)

In [54]:
y_pred = KNN_model.predict(X_test)
y_pred

array(['Major', 'Major', 'Major', 'Major', 'Minor', 'Minor', 'Minor',
       'Severe', 'Major', 'Major', 'Severe', 'Major', 'Severe', 'Major',
       'Normal', 'Minor', 'Severe', 'Severe', 'Minor', 'Major', 'Major',
       'Normal', 'Major', 'Major', 'Minor', 'Major', 'Major', 'Severe',
       'Normal', 'Normal', 'Severe', 'Normal', 'Minor', 'Normal', 'Major',
       'Minor', 'Normal', 'Major', 'Normal', 'Normal', 'Major', 'Minor',
       'Severe', 'Minor', 'Major', 'Minor', 'Normal', 'Major', 'Normal',
       'Severe', 'Normal', 'Normal', 'Minor', 'Major', 'Major', 'Minor',
       'Minor', 'Minor', 'Minor', 'Normal', 'Major', 'Normal', 'Major',
       'Major', 'Major', 'Minor', 'Minor', 'Normal', 'Severe', 'Normal',
       'Major', 'Minor', 'Major', 'Minor', 'Major', 'Normal', 'Normal',
       'Minor', 'Major', 'Major', 'Severe', 'Major', 'Normal', 'Normal',
       'Minor', 'Severe', 'Major', 'Normal', 'Normal', 'Minor', 'Major',
       'Minor', 'Minor', 'Major', 'Major', 'Major', 'Ma

In [55]:
# Create a DataFrame for visualization
viz_df = X_test.copy()
viz_df['Predicted_Conditions'] = y_pred

# Create a histogram to visualize the distribution of predicted conditions
fig = px.histogram(viz_df, x='Predicted_Conditions',nbins= 7, title='Distribution of Predicted Conditions By KNN')
fig.show()

In [56]:

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Additional evaluation metrics
print("\nClassification Report(KNN):")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


Accuracy: 0.3904761904761905

Classification Report(KNN):
              precision    recall  f1-score   support

       Major       0.35      0.47      0.40       135
       Minor       0.42      0.48      0.44       126
      Normal       0.43      0.34      0.38       145
      Severe       0.39      0.28      0.33       119

    accuracy                           0.39       525
   macro avg       0.40      0.39      0.39       525
weighted avg       0.40      0.39      0.39       525


Confusion Matrix:
[[63 22 27 23]
 [32 60 18 16]
 [44 40 49 12]
 [43 22 21 33]]


In [57]:
fig = px.imshow(conf_matrix,
                labels=dict(x="Predicted", y="Actual", color="Count"),
                x=['Major', 'Minor', 'Normal', 'Severe'],
                y=['Major', 'Minor', 'Normal', 'Severe'], title="Confusion Matrix (KNN)")

# Show the plot
fig.show()

In [58]:
cr_data_knn = {
    "Class": ["Major", "Minor", "Normal", "Severe"],
    "Precision": [0.35, 0.42, 0.43, 0.39],
    "Recall": [0.47, 0.48, 0.34, 0.28],
    "F1-score": [0.40, 0.44, 0.38, 0.33]
}

df_metrics_knn = pd.DataFrame(cr_data_knn)

fig_knn = px.bar(df_metrics_knn, x='Class', y=['Precision', 'Recall', 'F1-score'],
             title='Visualization of KNN Classification Report',
             labels={'value': 'Score', 'variable': 'Metric', 'Class': 'Class'},
             barmode='group')

fig_knn.show()
