In [None]:
# This is a CORE COURSE project done by Jason and Kathir

# Does time spent on Social Media affect Mental Health? A Correlational Study (2022)

Data set link - https://docs.google.com/spreadsheets/d/1lWFIL7h0F7xtmJHNPJX7ttPkO4v9j3xQ2E9Qb1wjek4/edit#gid=1240299773.

The data set consists of 7 variables and 12 Likert-scale questions whose scores measure either the frequency or the intensity of various aspects of mental health. A low score of 1 generally indicates low frequency or intensity, and a high score of 5 typically indicates high frequency or intensity. The main objective of this study is to investigate whether there is a correlation between social media usage and mental health, and to explore and try to predict, based on multivariate predictive modelling, whether an individual is suffering from mental health symptoms and should be recommended a mental health checkup.

## Data Description

Variables

1. Age
2. Gender
3. Relationship Status
4. Occupation Status
5. Affiliated Organizations
6. Social Medias Used
7. Time spent - social media use, in hours



Measurements of the frequency or intensity of mental health symptoms are taken through Likert-scale questions -

1.   Purposeless use of Social Media [ADHD] - Question 9
2.   Distracted by Social Media [ADHD] - Question 10
3. Restlessness if Social Media not used [Anxiety] - Question 11
4. Ease of Distraction by Social Media [ADHD] - Question 12
5. Bothered by worries [Anxiety] - Question 13
6. Difficulty in concentrating [ADHD] - Question 14
7. Comparison of self to peers [Self Esteem] - Question 15
8. Feelings about above comparison [Self Esteem] - Question 16
9. Validation sought from Social Media [Self Esteem] - Question 17
10. Feelings of Depression [Depression] - Question 18
11. Fluctuation of interest [Depression] - Question 19
12. Sleep Issues [Depression] - Question 20



In [None]:
# Importing necessary packages for the project
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Loading the dataset used for the project
from google.colab import drive

In [None]:
# Mount Google Drive (Colab only) and read the survey CSV stored on it.
drive_mount_point = '/content/gdrive'
dataset_path = drive_mount_point + '/MyDrive/Colab Notebooks/Social Media and Mental Health Dataset.csv'

drive.mount(drive_mount_point)
data = pd.read_csv(dataset_path)

KeyboardInterrupt: 

In [None]:
# Setting to display all columns
pd.set_option("display.max_columns", None)

In [None]:
# Printing first 5 entries in the data set
data.head()

In [None]:
# Printing last 5 entries in the data set
data.tail()

In [None]:
# The dimensions (shape) of the data set are as follows -
data.shape

## Data Pre-processing and Cleaning

### Renaming the Columns

Renaming is done for simplifying the project in other parts of the Research.

In [None]:
# Map each verbose questionnaire header to the short column name used in the
# rest of the notebook. Questions 9-20 are named after the symptom group they
# measure plus their index within that group.
short_column_names = {
    '1. What is your age?': 'Age',
    '2. Gender': 'Sex',
    '3. Relationship Status': 'Relationship Status',
    '4. Occupation Status': 'Occupation',
    '5. What type of organizations are you affiliated with?': 'Affiliations',
    '6. Do you use social media?': 'Social Media User?',
    '7. What social media platforms do you commonly use?': 'Platforms Used',
    '8. What is the average time you spend on social media every day?': 'Time Spent',
    '9. How often do you find yourself using Social media without a specific purpose?': 'ADHD Q1',
    '10. How often do you get distracted by Social media when you are busy doing something?': 'ADHD Q2',
    "11. Do you feel restless if you haven't used Social media in a while?": 'Anxiety Q1',
    '12. On a scale of 1 to 5, how easily distracted are you?': 'ADHD Q3',
    '13. On a scale of 1 to 5, how much are you bothered by worries?': 'Anxiety Q2',
    '14. Do you find it difficult to concentrate on things?': 'ADHD Q4',
    '15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?': 'Self Esteem Q1',
    '16. Following the previous question, how do you feel about these comparisons, generally speaking?': 'Self Esteem Q2',
    '17. How often do you look to seek validation from features of social media?': 'Self Esteem Q3',
    '18. How often do you feel depressed or down?': 'Depression Q1',
    '19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?': 'Depression Q2',
    '20. On a scale of 1 to 5, how often do you face issues regarding sleep?': 'Depression Q3',
}
data.rename(columns=short_column_names, inplace=True)

### Re-arranging the Columns

In [None]:
titles = list(data.columns)
titles

In [None]:
# Rearrange the ADHD and Anxiety question columns so that they are sequential:
# ADHD Q1..Q4 first, then Anxiety Q1..Q2.
#
# The columns are placed by name into the positions these six columns already
# occupy, instead of swapping fixed positions in a list built by an earlier
# cell. The column list is rebuilt from `data` here, so running this cell again
# gives the same order rather than scrambling it further.
ordered_questions = ['ADHD Q1', 'ADHD Q2', 'ADHD Q3', 'ADHD Q4', 'Anxiety Q1', 'Anxiety Q2']

titles = list(data.columns)
# titles.index raises ValueError if a column is missing (e.g. the rename cell was not run).
question_slots = sorted(titles.index(name) for name in ordered_questions)
for slot, name in zip(question_slots, ordered_questions):
    titles[slot] = name

# .copy() gives an independent frame for the in-place edits made by later cells.
data = data[titles].copy()
titles

## Missing Value Detection and Treatment

The following values in a data set are considered to be missing values -

1. Blank Values
2. NaN
3. null

Excluding the 'Affiliations' column, if the number of records is less than 474, we can conclude that there are missing
values. This is unlikely to happen since the questionnaire consisted of required fields for all questions except for
'Affiliations'.

In [None]:
# Check number of records in each column of the data set.
data.info()

There are no missing values as expected. "Affiliations" has null values since in the questionnaire, individuals could leave it blank indicating they are not affiliated with anyone.

# Data Transformation

### Gender

When asking for the participant's gender in the questionnaire, the options that could be selected were "Male", "Female" and "Others". If "Others" was selected, the questionnaire would prompt the participant to type in their gender as a free-text string. Thus, the "Sex" column (the renamed Gender question) is an aspect of this study that might require some manipulation and transformation to avoid complications in graphs.

In [None]:
#List all the unique Gender/Sex entries.

Genders = set(data['Sex'])
print(Genders)

Participants with the answer "There are others???" are deemed to have not filled out the questionnaire seriously. Thus, we will be excluding all entries pertaining to that answer.

In [None]:
# Remove, in place, every row whose 'Sex' answer is the non-serious free-text reply.
# The remaining rows keep their original index labels.
is_unserious_answer = data['Sex'] == 'There are others???'
data.drop(index=data[is_unserious_answer].index, inplace=True)

In [None]:
Genders = set(data['Sex'])
print(Genders)

There are many unique entries in the Gender section that could all be considered under the "Others" type. This seemed to have happened because of the user input nature of selecting "Others" in the Gender section of the questionnaire.

In [None]:
# Combine the free-text answers that all fall under the "Others" category.
#
# The nested-dict form of DataFrame.replace limits the replacement to the 'Sex'
# column. A bare data.replace('NB', 'Others') would rewrite a matching cell in
# ANY column of the frame, not just the gender answers.
#
# The labels (including their trailing spaces) are kept exactly as they were
# written in the original replacement calls.
other_gender_labels = ['Non-binary', 'Nonbinary ', 'NB', 'unsure ', 'Non binary ', 'Trans']
data.replace({'Sex': {label: 'Others' for label in other_gender_labels}}, inplace=True)

In [None]:
Genders = set(data['Sex'])
print(Genders)

We have successfully removed one entry while categorizing many of the unique string names into the 'Others' category.

In [None]:
data.info()

### Age

Note that 'Age' is erroneously detected as float64 value in the above section. This is because of the single data record # 382. We should thus convert the 'Age' column to int64 type.

In [None]:
#Showing the age entry of record #382
data.loc[382,'Age']

In [None]:
#Converting Age from float64 to int64 (the fractional part of an age is discarded)
data['Age'] = data['Age'].astype('int64')

In [None]:
#Age dtype changed from float64 to int64
data.info()

In [None]:
data.loc[382,'Age']

26.7 float value is converted to int value of 26.

In [None]:
data.describe()

In [None]:
data.median(numeric_only=True)

### Scalar Adjustment

Before manipulating columns (example: summing), we must address the issue of Self Esteem question #2. Originally, the question was -

"Following the previous question, how do you feel about these comparisons, generally speaking?".

The problem lies in what the scores represent, which for this question is a bit different from all the other questions.

Very Negative - 1

Slightly Negative - 2

Neutral - 3

Slightly Positive - 4

Very Positive - 5


In this research, a greater accumulation of points for one aspect of mental well-being means that the person is doing poorly in that regard. For that condition to remain true, the scoring system of this question must be altered. The following is taken to be the new system -

Very negative - 4

Slightly negative - 2

Neutral - 0

Slightly Positive - 0

Very Positive - 0

Note that "Slightly Positive" and "Very positive" are assigned 0 values since they are not relevant to this study. We are measuring how mental health is negatively affected, not positively. Therefore, we are only dealing with the "Neutral", "Slightly negative" and "Very negative" options.

In [None]:
# Re-score Self Esteem Q2 so that a higher value means a worse outcome:
#   3 (Neutral), 4 (Slightly Positive), 5 (Very Positive) -> 0
#   1 (Very Negative)                                     -> 4
#   2 (Slightly Negative)                                 stays 2
# Order matters: the 3/4/5 -> 0 step has to run before 1 -> 4, otherwise the
# freshly assigned 4s would be zeroed too. For the same reason this cell must
# be run only once per loaded data set.
is_neutral_or_positive = data['Self Esteem Q2'].isin([3, 4, 5])
data.loc[is_neutral_or_positive, 'Self Esteem Q2'] = 0
data.loc[data['Self Esteem Q2'] == 1, 'Self Esteem Q2'] = 4

In [None]:
# Dataset after the adjustments have been made
data.head(5)

From the above, modified dataset, we can infer that Self Esteem question # 2 has been properly scaled and adjusted.

### Summation of Scores of different aspects of mental well being

One of the requirements for this research to be valid is to calculate the total number of points accrued by the different questions on various aspects of mental health and wellbeing.

Questions measure 4 aspects of mental wellbeing -

1. Attention Deficit Hyperactivity Disorder (ADHD)
2. Anxiety
3. Self Esteem
4. Depression

Therefore, new columns are created for each of the 4 aspects, and another column named "Total Score" is to be created. Since it is assigned the sum total of all the questions as a numerical value, it can have a maximum value of 59.

In [None]:
# Sum the question scores of each mental-health aspect (ADHD, Anxiety,
# Self Esteem, Depression) into one new column per aspect.

ADHD = ['ADHD Q1', 'ADHD Q2', 'ADHD Q3', 'ADHD Q4']
data['ADHD Score'] = data[ADHD].sum(axis=1)

Anxiety = ['Anxiety Q1', 'Anxiety Q2']
data['Anxiety Score'] = data[Anxiety].sum(axis=1)

SelfEsteem = ['Self Esteem Q1', 'Self Esteem Q2','Self Esteem Q3']
data['Self Esteem Score'] = data[SelfEsteem].sum(axis=1)

Depression = ['Depression Q1', 'Depression Q2','Depression Q3']
data['Depression Score'] = data[Depression].sum(axis=1)

# The total is the sum of the four aspect scores computed above.
Total = ['ADHD Score', 'Anxiety Score','Self Esteem Score','Depression Score']
data['Total Score'] = data[Total].sum(axis=1)

# Delete the question columns and the timestamp column as they are no longer used.
# They are dropped by name rather than by position (previously iloc[:, 9:21]),
# so the result does not depend on where those columns sit in the frame.
question_columns = ADHD + Anxiety + SelfEsteem + Depression
data.drop(columns=question_columns + ['Timestamp'], inplace=True)

In [None]:
data.head(5)

### Adding an "Outcome" column

In this section, we consider adding an "Outcome" variable based on the "Total Score" of the participant.

Previously, it was established that the "Total score" indicates magnitude of individual experiencing negative symptoms of mental health. An accumulated total score of 59 is the highest an individual can obtain from the questionnaire, which would indicate that the individual is definitely experiencing negative symptoms in some aspect of mental health, based on binary classification.

The Outcome variable is defined as whether we think that the individual is experiencing mental health disease symptoms in some aspect, and whether we are recommending the participant to get a mental health checkup or not.

A participant who scores 3 out of 5 on every question is experiencing slight to moderate symptoms in every aspect of mental health, but these may not be severe or frequent enough. Such a participant has a total score of 35 (12 questions, with a score of 3 on each question except for Self Esteem question #2, which has a score of 2).

Therefore, we assign a rational value of 40 to be the point where we can reliably say that the individual is very likely to be suffering severely and extremely frequently from some symptoms, and thus we highly recommend a mental health checkup.

An Outcome of 0 means that individual is not confirmed to be experiencing severe mental health symptoms. Therefore we do not think the individual needs to go to get a mental health check up.

An Outcome of 1 means that the individual is definitely experiencing some severe negative symptoms of mental health. They are recommended to get a mental health checkup.

Note that the Total score variable will be dropped later when we use logistic regression to train and predict data.

In [None]:
def map_score(score):
  """Turn a total score into the Outcome label, returned as a string.

  Scores of 40 or more give "1"; scores below 40 give "0". A value that
  satisfies neither comparison (such as NaN) falls through and yields None.
  """
  if score >= 40:
    return "1"
  if score < 40:
    return "0"

# Label every participant from their total score, then store the labels as
# integers (map_score returns the strings "0" / "1").
outcome_labels = data['Total Score'].map(map_score)
data['Outcome'] = outcome_labels.astype('int64')

In [None]:
data.shape

In [None]:
# Checking the data after cleaning and transformation is applied and adding an outcome column
data.describe()

# Data Visualisation

In [None]:
# To start, show how many participants fall into each 'Time Spent' group.

participants_per_group = data.groupby('Time Spent').size()
participants_per_group.plot.bar(xlabel='Time Spent on Social Media', ylabel='Frequency')

From the above plot, we can infer that in the sample, there are less than 40 people who have an average social media use of less than an hour. The other groups each have 60 to 120 people with average social media use of 1 to 5 hours or more.

In [None]:
# Distribution of the participants by gender; each bar is labelled with its
# share of the whole sample.

total = float(len(data))
ax = sns.countplot(x="Sex", data=data)
for bar in ax.patches:
    bar_count = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    # Put the percentage 3 count-units above the bar, centred horizontally.
    ax.text(label_x, bar_count + 3, '{0:.0%}'.format(bar_count / total), ha="center")

Approximately 260 participants out of 473 are female, making up the majority in the sample. 'Others' make up approximately 1% of the sample size, which makes it impossible to make statistical inferences based on the "Other" category specifically.

In [None]:
# Distribution of the participants by "Outcome" (0 = no checkup recommended,
# 1 = checkup recommended); each bar is labelled with its share of the sample.

total = float(len(data))
ax = sns.countplot(x="Outcome", data=data)
for bar in ax.patches:
    bar_count = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    # Put the percentage 3 count-units above the bar, centred horizontally.
    ax.text(label_x, bar_count + 3, '{0:.0%}'.format(bar_count / total), ha="center")

Approximately 37% of the sample of 473 participants meet the criterion of scoring 40 points or above; they are considered to be experiencing severe mental health symptoms and are recommended to have their mental health evaluated by a professional.

In [None]:
# Distribution of the participants by occupation; each bar is labelled with
# its share of the whole sample.
total = float(len(data))
ax = sns.countplot(x="Occupation", data=data)
for bar in ax.patches:
    bar_count = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    # Put the percentage 3 count-units above the bar, centred horizontally.
    ax.text(label_x, bar_count + 3, '{0:.0%}'.format(bar_count / total), ha="center")

The sample is over-represented by university students, who make up an overwhelming 62% of the sample.

In [None]:
# Mean ADHD score of each 'Time Spent' group, with bars ordered from the lowest to the highest mean.

mean_adhd_by_time = data.groupby('Time Spent')['ADHD Score'].mean()
mean_adhd_by_time.sort_values(ascending=True).plot.bar(ylabel='Mean ADHD Score')

In [None]:
# Mean Anxiety score of each 'Time Spent' group, with bars ordered from the lowest to the highest mean.

mean_anxiety_by_time = data.groupby('Time Spent')['Anxiety Score'].mean()
mean_anxiety_by_time.sort_values(ascending=True).plot.bar(ylabel='Mean Anxiety Score')

In [None]:
# Mean Self Esteem score of each 'Time Spent' group, with bars ordered from the lowest to the highest mean.

mean_self_esteem_by_time = data.groupby('Time Spent')['Self Esteem Score'].mean()
mean_self_esteem_by_time.sort_values(ascending=True).plot.bar(ylabel='Mean Self Esteem Score')

In [None]:
# Mean Depression score of each 'Time Spent' group, with bars ordered from the lowest to the highest mean.

mean_depression_by_time = data.groupby('Time Spent')['Depression Score'].mean()
mean_depression_by_time.sort_values(ascending=True).plot.bar(ylabel='Mean Depression Score')

In [None]:
# Mean Total score of each 'Time Spent' group, with bars ordered from the lowest to the highest mean.

mean_total_by_time = data.groupby('Time Spent')['Total Score'].mean()
mean_total_by_time.sort_values(ascending=True).plot.bar(ylabel='Total Score')

### Converting Time Spent category to Numerical Values

Before looking at the heatmap/correlation matrix of our dataset, let us convert the 'Time Spent' column from string to integer. This is necessary because correlations can be computed only from numerical values. Without this step, we will not obtain any correlations between the time spent and the other variables in our study.

This is done by assigning the various 'Time Spent' groups to number based strings, and then converting the whole column from object type to int64.

'Less than an Hour'     = 0

'Between 1 and 2 hours' = 1

'Between 2 and 3 hours' = 2

'Between 3 and 4 hours' = 3

'Between 4 and 5 hours' = 4

'More than 5 hours'     = 5


In [None]:
# Replace each 'Time Spent' answer with its ordinal code (0 = least time, 5 = most).
# Answers that match none of these labels are left unchanged.
time_spent_codes = {
    'Less than an Hour': 0,
    'Between 1 and 2 hours': 1,
    'Between 2 and 3 hours': 2,
    'Between 3 and 4 hours': 3,
    'Between 4 and 5 hours': 4,
    'More than 5 hours': 5,
}
for answer, code in time_spent_codes.items():
    data.loc[data['Time Spent'] == answer, 'Time Spent'] = code

In [None]:
#Converting Time Spent from object type to int64.
data['Time Spent'] = data['Time Spent'].astype('int64')

We will also give the Gender variable numerical values so that they can be used in the correlation plots, heatmaps and machine learning.

In [None]:
# Encode the gender labels as integers: Male -> 0, Female -> 1, Others -> 2,
# then store the column as int64.
sex_codes = {'Male': 0, 'Female': 1, 'Others': 2}
for gender_label, gender_code in sex_codes.items():
    data.loc[data['Sex'] == gender_label, 'Sex'] = gender_code
data['Sex'] = data['Sex'].astype('int64')

In [None]:
data.head(5)

Successfully assigned specific values to specific replies in the "Time Spent" and "Sex" column and converted said columns to integer types.

### Correlation Plot and Heatmap

In [None]:
# Display the Pearson correlation table of the numeric columns.
# NOTE(review): this cell originally contained only a comment ("Drop Total score
# column and display correlation plot") and no code, although the discussion
# below refers to a correlation table. The table is computed on a temporary
# numeric-only selection, so `data` itself is not modified here. 'Total Score'
# is kept in the table because the discussion quotes its correlations - confirm
# this matches the intended output.
data.select_dtypes(include='number').corr(method='pearson')


Note that the "Total Score" variable is dropped since it is essentially the sum of 4 other independent variable columns. Therefore it is a dependent variable that is not meaningful in the machine learning part of this project.

From the above correlation table, it can be inferred that the time spent on various social media platforms has a moderate positive correlation with ADHD, Anxiety and Total Scores, with r values of 0.45, 0.438 and 0.44, respectively.

Correlation between Time Spent on social media and Self esteem scores and Depression scores are on the positive weaker side, with r values of 0.138 and 0.35 respectively.

There is a negative weak correlation between Age and all the other variables. The interpretation may be that the higher the participant's age is, the lower their social media usage and mental health scores will be. Note that this is a weak correlation, with r values between -0.35 and 0 for all variables.

In [None]:
sns.pairplot(data,hue='Outcome',diag_kind='kde')

From the above pairplot, the kernel density plots (diagonal) suggest that the distributions for the No and Yes outcomes (0 and 1) are at times overlapping, and at times not overlapping. It is unclear whether it is possible to differentiate between No and Yes outcome individuals.

The scatterplots suggest low to moderately correlated data. There is a chance that machine learning algorithms may pick out hidden patterns during the training phase of the predictive model. Hence, models built on this data may be able to achieve high levels of accuracy.

In [None]:
# Annotated heatmap of the pairwise Pearson correlations.
# Only numeric columns are correlated: the frame still contains text columns at
# this point, and DataFrame.corr raises on non-numeric data in pandas >= 2.0
# (older versions silently skipped such columns).
f, ax = plt.subplots(figsize=(20, 10))
corr = data.select_dtypes(include='number').corr(method="pearson")
# The former mask=np.zeros_like(corr, dtype=bool) was all False and hid nothing,
# so it is omitted.
sns.heatmap(corr, cmap=sns.dark_palette("#69d", reverse=True, as_cmap=True),
            square=True, ax=ax, annot=True)

From the above correlation table and heatmap, it can be inferred that multicollinearity between the supposedly independent variables exists, ranging from low to medium (0.11 < r < 0.68). The independent variables are Time Spent, Sex, Age, and the scores of ADHD, Anxiety, Self Esteem and Depression.

Age seems to have low negative collinearity values against the other variables.

Sex seems to have low collinearity values for all other variables.

# Predictive Modelling

### Dropping unneeded columns

First off, let us drop unneeded categorical columns before we feed our dataset to the training model.

In [None]:
# Remove the categorical/text columns that are not used as model features.
# They are dropped by name instead of by position (previously iloc[:, 2:7]):
# a positional drop removes whatever currently sits in those positions, so
# re-running the cell would silently delete the next five columns as well.
# errors='ignore' makes a re-run a no-op once the columns are gone.
categorical_columns = ['Relationship Status', 'Occupation', 'Affiliations',
                       'Social Media User?', 'Platforms Used']
data.drop(columns=categorical_columns, errors='ignore', inplace=True)

In [None]:
#printing the new dataset that will be used for machine learning sequence
data.head()

In [None]:
#importing necessary libraries for machine learning models

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()

In [None]:
# Split the data into "Train" and "Test": 70% train, 30% test (test_size=0.30).
#
# 'Total Score' is excluded from the features together with the target:
# 'Outcome' is derived directly from it (1 when Total Score >= 40), so leaving
# it in X would hand the label to the model. errors='ignore' keeps this cell
# working if 'Total Score' has already been removed from the frame.
X = data.drop(columns=['Outcome', 'Total Score'], errors='ignore')
y = data['Outcome']
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=6)

In [None]:
#Create Logistic regression model and GaussianNB model
model = LogisticRegression()
modelNB = GaussianNB()

### Logistic Regression

In [None]:
# Fit the LogisticRegression model on the training data.
model.fit(X_train, y_train)

In [None]:
# Evaluate the fitted logistic regression model on the held-out test set.
from sklearn import metrics

predicted = model.predict(X_test)        # predicted Outcome for every test row
accuracy = model.score(X_test, y_test)   # accuracy on the test set, printed in a later cell
cm = metrics.confusion_matrix(y_test, predicted)  # rows: true class, columns: predicted class
print(cm)

In [None]:
print("Predicted Values using Logistic Regression: ", predicted)

In [None]:
print("Accuracy of Logistic Regression: ",metrics.accuracy_score(y_test, predicted))

In [None]:
# Model Accuracy
print('Accuracy score using the Logistic regression model: ', accuracy*100,'%')

### Gaussian Naive Bayes

In [None]:
#Create GaussianNB model
modelNB = GaussianNB()

In [None]:
# Fit the GaussianNB model on the training data.
modelNB.fit(X_train, y_train)

In [None]:
# Evaluate the fitted Gaussian Naive Bayes model on the held-out test set.
from sklearn import metrics

predicted = modelNB.predict(X_test)        # predicted Outcome for every test row
accuracy = modelNB.score(X_test, y_test)   # accuracy on the test set, printed in a later cell
cm = metrics.confusion_matrix(y_test, predicted)  # rows: true class, columns: predicted class
print(cm)

In [None]:
print("Predicted Values using Gaussian Naive Bayes Model: ", predicted)

In [None]:
print("Accuracy of Gaussian Naive Bayes Model: ",metrics.accuracy_score(y_test, predicted))

In [None]:
# Model Accuracy
print('Accuracy score using the Gaussian Naive Bayes Model: ', accuracy*100,'%')