In [120]:
# data preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Load the raw survey data. The raw-string prefix keeps the Windows backslashes
# literal: "\d" is not a valid escape sequence, so the un-prefixed form only
# worked by accident and triggers a warning on current Python versions.
data = pd.read_csv(r'C:\data_set\depression.csv')

# Keep only the columns used in this analysis (predictors plus the
# "Depression" target).
data = data[[
    "Gender",
    "Age",
    "Education Level",
    "Marital Status",
    "Household Size",
    "Household Income",
    "Full Time Work",
    "Work Type",
    "Time In Current Job",
    "Out Of Work",
    "Trouble Sleeping History",
    "Sleep Hours",
    "Vigorous Recreation",
    "Sedentary Time",
    "Vigorous Work",
    "Moderate Work",
    "Cant Work",
    "Lifetime Alcohol Consumption",
    "Pregnant",
    "Health Problem Heart",
    "Health Problem Diabetes",
    "Current Smoker",
    "Depression",
]]

# In every object-dtype column, trim surrounding whitespace and turn the
# literal 'Missing' marker into a real missing value so that isnull()/dropna()
# can see it. Non-object columns are passed through unchanged.
data = data.apply(lambda x: x.str.strip().replace('Missing', pd.NA) if x.dtype == "object" else x)

# Encode Gender as integer codes.
encoder = LabelEncoder()
data['Gender'] = encoder.fit_transform(data['Gender'])

# Encode the target: "Not Depressed" -> 0, "Depressed" -> 1.
data["Depression"] = data["Depression"].replace({"Not Depressed": 0, "Depressed": 1})

# Display the encoded Gender column as a sanity check.
data['Gender']

0        0
1        1
2        1
3        0
4        1
        ..
36254    1
36255    0
36256    1
36257    0
36258    1
Name: Gender, Length: 36259, dtype: int32

In [121]:
# (rows, columns) of the frame after column selection and encoding.
data.shape

(36259, 23)

In [122]:
# Number of missing values in each column.
data.isnull().sum()

Gender                              0
Age                                 0
Education Level                  2157
Marital Status                   1636
Household Size                      0
Household Income                 1813
Full Time Work                  30492
Work Type                       16284
Time In Current Job                 0
Out Of Work                     21461
Trouble Sleeping History            0
Sleep Hours                         0
Vigorous Recreation                 3
Sedentary Time                      0
Vigorous Work                       0
Moderate Work                       0
Cant Work                           0
Lifetime Alcohol Consumption    22323
Pregnant                        28366
Health Problem Heart                0
Health Problem Diabetes             0
Current Smoker                  20789
Depression                          0
dtype: int64

In [123]:
# Column filter: a column survives only if it has at least
# len(data) - 10000 non-null entries, i.e. at most 10000 missing values.
min_non_null = len(data) - 10000
data = data.dropna(axis="columns", thresh=min_non_null)

# Row filter: keep only rows with no missing value in the surviving columns.
data = data.dropna(axis="index")

In [124]:
# Correlation of every numeric column with the target. The frame still holds
# text columns at this point; selecting the numeric ones explicitly avoids
# relying on corr() silently dropping them, which pandas 2.0+ no longer does
# (it raises instead).
data.select_dtypes(include="number").corr()["Depression"]

Gender                -0.077534
Age                   -0.010262
Household Size        -0.007909
Time In Current Job   -0.087375
Sleep Hours           -0.085884
Sedentary Time         0.033815
Depression             1.000000
Name: Depression, dtype: float64

In [125]:
# Class balance of the target after the null filtering.
data["Depression"].value_counts()

0    29578
1     2838
Name: Depression, dtype: int64

In [126]:


# Shorten the "Some College or AA Degree" label to "College or Degree".
education = data["Education Level"]
data["Education Level"] = education.replace({"Some College or AA Degree": "College or Degree"})

# Row counts per education level after the rename.
data["Education Level"].value_counts()

College or Degree            9715
College Graduate or Above    7528
High School                  7517
9-11th Grade                 4541
Less Than 9th Grade          3115
Name: Education Level, dtype: int64

In [127]:
# Row counts per marital-status category.
data["Marital Status"].value_counts()

Married          16781
Never Married     5765
Divorced          3619
Partner           2643
Widowed           2528
Separated         1080
Name: Marital Status, dtype: int64

In [128]:


# Define a custom function to categorize income
def categorize_income(income):
    """Collapse a household-income label into one of two coarse bins.

    Returns "Below 40k" when ``income`` equals one of the six low-bracket
    labels listed below, and "Above 40k" for every other value (including
    labels that are not recognised at all).
    """
    low_brackets = (
        "Below $5K",
        "Below $10K",
        "Below $15K",
        "Below $20K",
        "Below $25K",
        "Below $35K",
    )
    return "Below 40k" if income in low_brackets else "Above 40k"

# Bucket every household-income label element-wise with categorize_income.
income_bins = data["Household Income"].map(categorize_income)
data["Household Income"] = income_bins

# Row counts per income bucket.
data["Household Income"].value_counts()

Above 40k    17916
Below 40k    14500
Name: Household Income, dtype: int64

In [129]:
# Encode the Yes/No sleep-trouble answers as 1/0 in a new "Trouble Sleeping" column.
yes_no_codes = {"Yes": 1, "No": 0}
data["Trouble Sleeping"] = data["Trouble Sleeping History"].replace(yes_no_codes)


In [130]:
# Encode the Yes/No vigorous-recreation answers as 1/0 in a new "Recreation" column.
yes_no_codes = {"Yes": 1, "No": 0}
data["Recreation"] = data["Vigorous Recreation"].replace(yes_no_codes)


In [131]:
# Encode "Vigorous Work" in place: Yes -> 1, No -> 0.
yes_no_codes = {"Yes": 1, "No": 0}
data["Vigorous Work"] = data["Vigorous Work"].replace(yes_no_codes)

In [132]:
# Encode "Moderate Work" in place: Yes -> 1, No -> 0.
yes_no_codes = {"Yes": 1, "No": 0}
data["Moderate Work"] = data["Moderate Work"].replace(yes_no_codes)

In [133]:
# "Can Work" is the inverse of "Cant Work", so the codes are flipped:
# Yes (cannot work) -> 0, No -> 1.
inverted_yes_no_codes = {"Yes": 0, "No": 1}
data["Can Work"] = data["Cant Work"].replace(inverted_yes_no_codes)

In [134]:
# Binary-encode the two health-problem flags under shorter column names.
yes_no_codes = {"Yes": 1, "No": 0}
data["Heart Problem"] = data["Health Problem Heart"].replace(yes_no_codes)
data["Diabetes"] = data["Health Problem Diabetes"].replace(yes_no_codes)

# Source text columns that now have an encoded counterpart.
columns_to_drop = [
    "Trouble Sleeping History",
    "Vigorous Recreation",
    "Cant Work",
    "Health Problem Diabetes",
    "Health Problem Heart",
]

# NOTE(review): the saved output of this cell is
# "TypeError: 'Index' object is not callable", which none of these statements
# should raise as written - presumably stale output; re-run to confirm.
data = data.drop(columns=columns_to_drop)

TypeError: 'Index' object is not callable

In [None]:
# One-hot encode the three categorical columns; all other columns pass through.
categorical_columns = ["Education Level", "Marital Status", "Household Income"]
data = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Column dtypes and non-null counts after encoding.
data.info()

In [None]:
# Preview the first rows of the encoded frame.
data.head()

In [None]:
# Final list of column names (features plus the "Depression" target).
data.columns

In [None]:
# Separate the predictors from the target.
X = data.drop("Depression", axis=1)  # Independent variables
y = data["Depression"]  # Target variable
print(X)

# Hold out 20% of the rows for testing. The target is heavily imbalanced
# (29578 vs 2838 rows in the value counts earlier in this notebook), so
# stratify on y to keep the same class ratio in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [None]:

# Train a logistic regression model. The features are not scaled anywhere in
# this notebook, so give the solver more than its default 100 iterations to
# reduce the chance of stopping before convergence.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Evaluate the performance of the model on the testing set.
score = logreg.score(X_test, y_test)
print("Accuracy on testing set: {:.2f}%".format(score * 100))

# Accuracy alone says little when one class dominates (always predicting the
# majority class would already score about 91%), so also report per-class
# precision, recall and F1.
print(classification_report(y_test, logreg.predict(X_test)))

In [None]:
import json

# Test accuracy as a percentage, stored under the 'accuracyl' key.
send = {'accuracyl': score * 100}

# Serialise the dict to accuracyl.json in the current working directory.
with open('accuracyl.json', mode='w') as f:
    f.write(json.dumps(send))

In [None]:
# Preview the first rows of the held-out feature matrix.
X_test.head()

In [None]:
import pickle

# Persist the fitted classifier to lr_model.pickle in the working directory.
with open('lr_model.pickle', mode='wb') as f:
    f.write(pickle.dumps(logreg))

print("complete")