## Test data cleaning

In [1]:
import numpy as np
import pandas as pd

test = pd.read_csv("test.csv")

# Most categorical columns in this notebook get the same treatment: values that
# are insignificant or faulty (fewer than 10 rows) are folded into the most
# frequent value of the column.
test_degree = test["Degree"]
degree_counts = test_degree.value_counts()
top_deg = degree_counts.index[0]  # value_counts() is sorted by frequency, most common first

# Every degree label that occurs fewer than 10 times is considered noise.
rare_degrees = degree_counts[degree_counts < 10].index
test.loc[:, "Degree"] = test["Degree"].mask(test["Degree"].isin(rare_degrees), top_deg)
test["Degree"].value_counts()

Degree
Class 12    9896
B.Ed        7762
B.Arch      6037
B.Com       5439
B.Pharm     3987
BCA         3869
M.Ed        3707
MCA         3438
BBA         3387
BSc         3314
LLM         3133
MSc         3096
M.Tech      3017
M.Pharm     2995
LLB         2938
B.Tech      2885
BHM         2776
BA          2490
MBA         2470
ME          2391
MD          2289
BE          2166
MHM         2149
M.Com       2128
PhD         2073
MBBS        2033
MA          1933
Name: count, dtype: int64

In [2]:
# Combine the two pressure columns into a single "Pressure" column.
# combine_first keeps 'Academic Pressure' wherever it is present and falls back
# to 'Work Pressure' where it is missing.
pressure = test['Academic Pressure'].combine_first(test['Work Pressure'])

# Slot of whichever source column comes first; once both are dropped this is
# exactly where the merged column belongs.
position = min(
    test.columns.get_loc('Academic Pressure'),
    test.columns.get_loc('Work Pressure'),
)

# Remove the source columns by name (not by assuming they sit next to each
# other) and put the merged column in their place.
test = test.drop(columns=['Academic Pressure', 'Work Pressure'])
test.insert(position, 'Pressure', pressure)


In [3]:
# Combine the two satisfaction columns into a single "Satisfaction" column.
# combine_first keeps 'Study Satisfaction' wherever it is present and falls back
# to 'Job Satisfaction' where it is missing.
satisfaction = test['Study Satisfaction'].combine_first(test['Job Satisfaction'])

# Slot of whichever source column comes first; once both are dropped this is
# exactly where the merged column belongs.
position = min(
    test.columns.get_loc('Study Satisfaction'),
    test.columns.get_loc('Job Satisfaction'),
)

# Remove the source columns by name (not by assuming they sit next to each
# other) and put the merged column in their place.
test = test.drop(columns=['Study Satisfaction', 'Job Satisfaction'])
test.insert(position, 'Satisfaction', satisfaction)
print(test.head())


       id     Name  Gender   Age           City  \
0  140700   Shivam    Male  53.0  Visakhapatnam   
1  140701    Sanya  Female  58.0        Kolkata   
2  140702     Yash    Male  53.0         Jaipur   
3  140703   Nalini  Female  23.0         Rajkot   
4  140704  Shaurya    Male  47.0         Kalyan   

  Working Professional or Student              Profession  Pressure  CGPA  \
0            Working Professional                   Judge       2.0   NaN   
1            Working Professional  Educational Consultant       2.0   NaN   
2            Working Professional                 Teacher       4.0   NaN   
3                         Student                     NaN       5.0  6.84   
4            Working Professional                 Teacher       5.0   NaN   

   Satisfaction     Sleep Duration Dietary Habits  Degree  \
0           5.0  Less than 5 hours       Moderate     LLB   
1           4.0  Less than 5 hours       Moderate    B.Ed   
2           1.0          7-8 hours       Modera

In [4]:
# Dietary Habits: answers that appear fewer than 10 times are treated as bad
# values and replaced with the most frequent answer.
diet = test["Dietary Habits"]
diet_counts = diet.value_counts()
top_diet = diet_counts.index[0]  # most frequent answer

uncommon_diets = diet_counts[diet_counts < 10].index
test.loc[:, "Dietary Habits"] = test["Dietary Habits"].mask(
    test["Dietary Habits"].isin(uncommon_diets), top_diet
)
test["Dietary Habits"].value_counts()

Dietary Habits
Moderate     33043
Unhealthy    30786
Healthy      29966
Name: count, dtype: int64

In [5]:
# Sleep Duration: answers that appear fewer than 10 times are replaced with
# the most frequent answer.
uned = test["Sleep Duration"]
sleep_counts = uned.value_counts()
top_uni = sleep_counts.index[0]  # most frequent answer

rare_sleep_answers = sleep_counts[sleep_counts < 10].index
test.loc[:, "Sleep Duration"] = test["Sleep Duration"].mask(
    test["Sleep Duration"].isin(rare_sleep_answers), top_uni
)
test["Sleep Duration"].value_counts()

Sleep Duration
Less than 5 hours    25715
7-8 hours            24491
More than 8 hours    22190
5-6 hours            21404
Name: count, dtype: int64

In [6]:
def _replace_rare_values(column, min_count=10, keep=()):
    """Return `column` with infrequent values replaced by its most frequent value.

    Parameters
    ----------
    column : pd.Series
        Categorical column to clean. Missing values are left untouched.
    min_count : int
        Values occurring fewer than `min_count` times are replaced.
    keep : iterable
        Values that must never be replaced, however rare they are.

    Returns
    -------
    pd.Series
        A new series; the input is not modified.
    """
    counts = column.value_counts()  # sorted by frequency, NaN excluded
    if counts.empty:
        # Nothing but missing values: there is no "most frequent value" to use.
        return column.copy()
    rare = [value for value in counts[counts < min_count].index if value not in keep]
    return column.mask(column.isin(rare), counts.index[0])


def _degree_level(deg):
    """Collapse a degree label into 'Bachelor' / 'Master'; other values pass through.

    Labels starting with "B" (plus "LLB") become "Bachelor"; labels starting
    with "M" (plus "LLM") become "Master". Non-string values (e.g. NaN) and any
    other label are returned unchanged.
    """
    if not isinstance(deg, str):
        return deg
    if deg.startswith("B") or deg == "LLB":
        return "Bachelor"
    # NOTE(review): this prefix rule also labels "MBBS" and "MD" (both present in
    # the Degree counts printed earlier in this notebook) as "Master". Kept as-is
    # so the encoding does not change -- confirm this is intended.
    if deg.startswith("M") or deg == "LLM":
        return "Master"
    return deg


# City: fold cities with fewer than 10 rows into the most frequent city.
test.loc[:, "City"] = _replace_rare_values(test["City"])

# Profession: the misspelled "Finanancial Analyst" is always merged into the
# correctly spelled label (never into the most frequent profession); every
# other profession with fewer than 10 rows is folded into the most frequent one.
profession = _replace_rare_values(test["Profession"], keep=("Finanancial Analyst",))
test.loc[:, "Profession"] = profession.replace("Finanancial Analyst", "Financial Analyst")
print(test["Profession"].value_counts())

# Degree: reduce the individual degree labels to their level.
test.loc[:, "Degree"] = test["Degree"].map(_degree_level)

Profession
Teacher                   16429
Content Writer             5187
Architect                  2982
Consultant                 2920
Pharmacist                 2656
HR Manager                 2601
Doctor                     2198
Business Analyst           2186
Chemist                    1967
Financial Analyst          1942
Entrepreneur               1935
Chef                       1844
Educational Consultant     1827
Data Scientist             1582
Lawyer                     1497
Researcher                 1496
Pilot                      1448
Customer Support           1422
Marketing Manager          1284
Judge                      1189
Travel Consultant          1188
Manager                    1155
Sales Executive            1139
Plumber                    1123
Electrician                1121
Software Engineer          1002
Digital Marketer            942
Civil Engineer              938
UX/UI Designer              915
Accountant                  853
Mechanical Engineer         8

In [7]:
# Preview the first rows after the categorical clean-up, before one-hot encoding.
test.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Pressure,CGPA,Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,140700,Shivam,Male,53.0,Visakhapatnam,Working Professional,Judge,2.0,,5.0,Less than 5 hours,Moderate,Bachelor,No,9.0,3.0,Yes
1,140701,Sanya,Female,58.0,Kolkata,Working Professional,Educational Consultant,2.0,,4.0,Less than 5 hours,Moderate,Bachelor,No,6.0,4.0,No
2,140702,Yash,Male,53.0,Jaipur,Working Professional,Teacher,4.0,,1.0,7-8 hours,Moderate,Bachelor,Yes,12.0,4.0,No
3,140703,Nalini,Female,23.0,Rajkot,Student,,5.0,6.84,1.0,More than 8 hours,Moderate,Bachelor,Yes,10.0,4.0,No
4,140704,Shaurya,Male,47.0,Kalyan,Working Professional,Teacher,5.0,,5.0,7-8 hours,Moderate,Bachelor,Yes,3.0,4.0,No


In [8]:
# With cleaning finished, expand each categorical column into one-hot indicator columns.
categorical_columns = [
    "Gender",
    "Working Professional or Student",
    "Dietary Habits",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
    "Sleep Duration",
    "City",
    "Profession",
    "Degree",
]

# drop_first=True omits the first level of every column, so a column with k
# distinct values yields k-1 indicator columns.
# NOTE(review): the indicator columns are derived from the values present in
# this file only; if a model expects a fixed column set, confirm that the two
# line up -- TODO.
test = pd.get_dummies(test, columns=categorical_columns, drop_first=True)
test.head()

Unnamed: 0,id,Name,Age,Pressure,CGPA,Satisfaction,Work/Study Hours,Financial Stress,Gender_Male,Working Professional or Student_Working Professional,...,Profession_Research Analyst,Profession_Researcher,Profession_Sales Executive,Profession_Software Engineer,Profession_Teacher,Profession_Travel Consultant,Profession_UX/UI Designer,Degree_Class 12,Degree_Master,Degree_PhD
0,140700,Shivam,53.0,2.0,,5.0,9.0,3.0,True,True,...,False,False,False,False,False,False,False,False,False,False
1,140701,Sanya,58.0,2.0,,4.0,6.0,4.0,False,True,...,False,False,False,False,False,False,False,False,False,False
2,140702,Yash,53.0,4.0,,1.0,12.0,4.0,True,True,...,False,False,False,False,True,False,False,False,False,False
3,140703,Nalini,23.0,5.0,6.84,1.0,10.0,4.0,False,False,...,False,False,False,False,False,False,False,False,False,False
4,140704,Shaurya,47.0,5.0,,5.0,3.0,4.0,True,True,...,False,False,False,False,True,False,False,False,False,False


In [9]:
# Exporting the cleaned test data
#test.to_csv("test_cleaner2.csv", index=False)