In [None]:
# Importing the necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the survey CSV into the working dataframe

DATA_PATH = '/content/Students Social Media Addiction (1).csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the dataframe

df.head(n = 5)

In [None]:
# Showing a random sample of 5 records.
# random_state pins the draw so a fresh Restart & Run All displays the same rows.

df.sample(5, random_state = 42)

In [None]:
# Report the dataframe's dimensions: df.shape is (rows, columns)

n_rows, n_cols = df.shape
print('Number of rows:', n_rows)
print('Number of columns:', n_cols)

In [None]:
# Summary statistics of the dataframe, as produced by DataFrame.describe()
# with its default arguments

df.describe()

In [None]:
### Data cleaning

In [None]:
# Count the missing values in each column

missing_per_column = df.isnull().sum()
missing_per_column

In [None]:
# Keep only the rows that have no missing values
df = df.dropna()

In [None]:
# Checking missing values before applying fillna
print("Missing values BEFORE applying fillna:")
print(df.isnull().sum())
print('-' * 50)

# Only numeric columns can be imputed with a mean
numerical_cols = df.select_dtypes(include = np.number).columns

# Assign the filled column back to the frame.
# Calling fillna(inplace = True) on `df[col]` acts on a temporary Series: pandas 2.x
# flags that pattern with a FutureWarning, and with Copy-on-Write enabled it leaves
# `df` unchanged, so the imputation would silently do nothing.
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())

# Checking missing values after applying fillna
print("\nMissing values AFTER applying fillna:")
print(df.isnull().sum())

In [None]:
# Show the dtype of every column so incorrectly typed columns can be spotted

df.dtypes

In [None]:
# Make sure the Age column is stored as an integer dtype
# (a no-op when the column is already integer-typed)

age_as_int = df['Age'].astype(int)
df['Age'] = age_as_int


In [None]:
usage_col = 'Avg_Daily_Usage_Hours'

# Report the dtype the column has going in
print("\nDatatype of 'Avg_Daily_Usage_Hours' BEFORE conversion:")
print(df[usage_col].dtype)
print('-' * 80)

# Coerce the column to numeric; entries that cannot be parsed become NaN
usage_numeric = pd.to_numeric(df[usage_col], errors = 'coerce')
df[usage_col] = usage_numeric

# Count the null entries present after the conversion
print("\nNumber of non-numeric values after converting 'Avg_Daily_Usage_Hours':")
print(df[usage_col].isnull().sum())
print('-' * 80)

# Report the dtype the column has coming out
print("\nDatatype of 'Avg_Daily_Usage_Hours' AFTER conversion:")
print(df[usage_col].dtype)

In [None]:
# Mean daily usage hours for each age
usage_by_age = df.groupby('Age')['Avg_Daily_Usage_Hours'].mean()
print("Average Daily Usage by Age:")
print(str(usage_by_age) + "\n")
print('-' * 50)

# Mean daily usage hours for each gender
print("\nAverage Daily Usage by Gender:")
usage_by_gender = df.groupby('Gender')['Avg_Daily_Usage_Hours'].mean()
print(usage_by_gender)

In [None]:
separator = '-' * 50

# Mean addiction score for each gender
gender_scores = df.groupby('Gender')['Addicted_Score'].mean()
print("Average Addiction Score by Gender:")
print(str(gender_scores) + "\n")
print(separator)

# Mean addiction score for each academic level
level_scores = df.groupby('Academic_Level')['Addicted_Score'].mean()
print("\nAverage Addiction Score by Academic Level:")
print(str(level_scores) + "\n")
print(separator)

# Mean addiction score for each age
age_scores = df.groupby('Age')['Addicted_Score'].mean()
print("\nAverage Addiction Score by Age:")
print(age_scores)

In [None]:
# Mean addiction score per gender, printed under a header and a rule
gender_addiction = df.groupby('Gender')['Addicted_Score'].mean()
print("Average addiction level by Gender:", '-' * 40, gender_addiction, sep = '\n')

Average addiction level by Gender:
----------------------------------------
Gender
Female    6.515581
Male      6.357955
Name: Addicted_Score, dtype: float64


In [None]:
# Mean addiction score per academic level, printed under a header and a rule
level_addiction = df.groupby('Academic_Level')['Addicted_Score'].mean()
print("\nAverage addiction level by Education level:", '-' * 45, level_addiction, sep = '\n')

In [None]:
def classify_risk(usage_hours):
    '''
    Map an average daily usage figure (in hours) to a risk level label.

    Parameters
    ----------
    usage_hours : number, None or NaN
        Average daily usage hours for one student.

    Returns
    -------
    str
        "Low Risk" for usage below 3, "Medium Risk" for usage from 3 up to
        (but not including) 6, "High Risk" for usage of 6 or more, and
        "Unknown" when the value is missing.
    '''

    # A NaN fails every comparison below and would otherwise fall through to
    # "High Risk" (and None would raise a TypeError); report missing input explicitly.
    if pd.isna(usage_hours):
        return "Unknown"

    if usage_hours < 3:
        return "Low Risk"

    # Reaching this point already implies usage_hours >= 3
    if usage_hours < 6:
        return "Medium Risk"

    return "High Risk"

# Classify every student's average daily usage and store the label in Risk_Level
risk_levels = df['Avg_Daily_Usage_Hours'].apply(classify_risk)
df['Risk_Level'] = risk_levels

# Show how many students fall into each risk level
print("Risk Level Classification:", '-' * 30, df['Risk_Level'].value_counts(), sep = '\n')

In [None]:
def suggest_detox(risk_level):
    '''
    Look up the digital detox advice that goes with a risk level.

    "High Risk" and "Medium Risk" each have their own advice; any other value
    receives the advice about keeping up healthy usage habits.
    '''

    advice_by_level = (
        ("High Risk", "Consider significantly reducing usage, setting strict limits, and seeking professional help if needed."),
        ("Medium Risk", "Try setting daily time limits, scheduling screen-free activities, and being mindful of usage."),
    )

    # Return the advice of the first level that matches
    for level, advice in advice_by_level:
        if risk_level == level:
            return advice

    # No listed level matched
    return "Continue healthy usage habits, be aware of potential triggers, and maintain a balanced lifestyle."

# creating a new column for Detox suggestions with the values returned from the above function
df['Detox_Suggestion'] = df['Risk_Level'].apply(suggest_detox)

print("\nDigital Detox Suggestions:")
print('-' * 30)

# taking a random sample of 5 records to show the Detox assessment;
# random_state pins the draw so a rerun prints the same students
for index, row in df.sample(5, random_state = 42).iterrows():
    print(f"For Student ID {row['Student_ID']} ({row['Risk_Level']}): {row['Detox_Suggestion']}\n")

In [None]:
plt.figure(figsize = (10, 6))

# creating a bar plot of the mean Addicted_Score per Academic_Level.
# errorbar = None replaces the deprecated ci = None (no error bars are drawn).
# The palette is tied to an explicit hue (the x variable) with the legend turned off,
# which is seaborn's replacement for passing palette without hue.
# NOTE(review): hue + legend = False on barplot needs seaborn >= 0.13 - confirm the environment.
ax = sns.barplot(
    x = 'Academic_Level', y = 'Addicted_Score', data = df,
    hue = 'Academic_Level', palette = 'Set2', legend = False,
    errorbar = None
)
plt.title('Average Addiction Score by Academic Level', color = 'darkgreen', weight = 'bold')
plt.ylabel('Average Addiction Score', color = 'red')
plt.xlabel('Academic Level', color = 'red')

# Add labels above the bars with padding
for container in ax.containers:
    ax.bar_label(container, fmt = '%.2f', padding = 3, color = 'black', weight = 'bold')

plt.tight_layout()
plt.show()

In [None]:
# Number of students in each risk level
risk_counts = df['Risk_Level'].value_counts()

plt.figure(figsize = (8, 8))

# creating a pie chart with one slice per risk level
# NOTE(review): textprops applies to the slice labels as well as the percentages,
# so on a light background the labels may only be readable via the legend - confirm intended.
slice_colors = sns.color_palette('rocket')
plt.pie(
    risk_counts,
    labels = risk_counts.index,
    colors = slice_colors,
    autopct = '%1.1f%%',
    startangle = 90,
    textprops = dict(color = 'white'),
)
plt.title('Distribution of Risk Levels', color = 'darkgreen', weight = 'bold')
plt.legend(title = "Risk Levels", loc = "upper left", bbox_to_anchor = (0, 1))
plt.show()

In [None]:
plt.figure(figsize = (12, 8))

# Columns that take part in the correlation matrix
corr_columns = [
    'Age',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Conflicts_Over_Social_Media',
    'Addicted_Score',
]
corr_matrix = df[corr_columns].corr()

# creating a heat map with each cell annotated to two decimals
sns.heatmap(corr_matrix, annot = True, cmap = 'Paired', fmt = ".2f")
plt.title('Correlation Matrix of Numerical Columns', color = 'darkgreen', weight = 'bold')
plt.xticks(rotation = 45, ha = 'right')  # slant the x-axis labels
plt.yticks(rotation = 0)
plt.show()

In [None]:
# Mean daily usage hours per age, as a two-column frame (Age, Avg_Daily_Usage_Hours)
data = df.groupby('Age')['Avg_Daily_Usage_Hours'].mean().reset_index()

plt.figure(figsize = (10, 6))

# creating a lineplot with a marker on every age
line_style = dict(marker = 'o', markerfacecolor = 'purple', color = 'blue')
ax = sns.lineplot(data = data, x = 'Age', y = 'Avg_Daily_Usage_Hours', **line_style)

axis_label_color = 'red'
plt.title('Average Daily Usage Hours by Age', color = 'darkgreen', weight = 'bold')
plt.ylabel('Average Daily Usage Hours', color = axis_label_color)
plt.xlabel('Age', color = axis_label_color)
plt.grid(True)

# Label every data point with its value, reusing the aggregate computed above
for age, mean_hours in data.values:
    plt.text(age, mean_hours, f'{mean_hours:.2f}', ha = 'left', va = 'bottom', color = 'black', fontsize = 11)

plt.tight_layout()

plt.show()