In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [74]:
# Load dataset
file_path = "Time-Wasters on Social Media.csv"
df = pd.read_csv(file_path)

In [75]:
# Keep relevant columns
df.columns

Index(['UserID', 'Age', 'Gender', 'Location', 'Income', 'Debt',
       'Owns Property', 'Profession', 'Demographics', 'Platform',
       'Total Time Spent', 'Number of Sessions', 'Video ID', 'Video Category',
       'Video Length', 'Engagement', 'Importance Score', 'Time Spent On Video',
       'Number of Videos Watched', 'Scroll Rate', 'Frequency',
       'ProductivityLoss', 'Satisfaction', 'Watch Reason', 'DeviceType', 'OS',
       'Watch Time', 'Self Control', 'Addiction Level', 'CurrentActivity',
       'ConnectionType'],
      dtype='object')

In [76]:
columns_to_keep = [
    'Age', 'Gender', 'Location', 'Platform', 'Total Time Spent',
    'Video Category', 'Engagement', 'Number of Videos Watched', 'ProductivityLoss',
    'Watch Reason', 'DeviceType', 'Watch Time', 'Self Control', 'Addiction Level',
    'ConnectionType' 
]
df = df[columns_to_keep]

In [77]:
df.columns

Index(['Age', 'Gender', 'Location', 'Platform', 'Total Time Spent',
       'Video Category', 'Engagement', 'Number of Videos Watched',
       'ProductivityLoss', 'Watch Reason', 'DeviceType', 'Watch Time',
       'Self Control', 'Addiction Level', 'ConnectionType'],
      dtype='object')

In [78]:
# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [79]:
df.head()

Unnamed: 0,Age,Gender,Location,Platform,Total Time Spent,Video Category,Engagement,Number of Videos Watched,ProductivityLoss,Watch Reason,DeviceType,Watch Time,Self Control,Addiction Level,ConnectionType
0,56,Male,Pakistan,Instagram,80,Pranks,7867,22,3,Procrastination,Smartphone,9:00 PM,5,5,Mobile Data
1,46,Female,Mexico,Instagram,228,Pranks,5944,31,5,Habit,Computer,5:00 PM,7,3,Wi-Fi
2,32,Female,United States,Facebook,30,Vlogs,8674,7,6,Entertainment,Tablet,2:00 PM,8,2,Mobile Data
3,60,Male,Barzil,YouTube,101,Vlogs,2477,41,3,Habit,Smartphone,9:00 PM,5,5,Mobile Data
4,25,Male,Pakistan,TikTok,136,Gaming,3093,21,8,Boredom,Smartphone,8:00 AM,10,0,Mobile Data


In [80]:
# Encode categorical variables
categorical_columns = ['Gender', 'Location', 'Platform', 'Video Category', 'Watch Reason', 'DeviceType', 'ConnectionType']
le = LabelEncoder()
for col in categorical_columns:
    df[col] = le.fit_transform(df[col])

In [81]:
df.head()

Unnamed: 0,Age,Gender,Location,Platform,Total Time Spent,Video Category,Engagement,Number of Videos Watched,ProductivityLoss,Watch Reason,DeviceType,Watch Time,Self Control,Addiction Level,ConnectionType
0,56,1,6,1,80,6,7867,22,3,3,1,9:00 PM,5,5,0
1,46,0,5,1,228,6,5944,31,5,2,0,5:00 PM,7,3,1
2,32,0,8,0,30,8,8674,7,6,1,2,2:00 PM,8,2,0
3,60,1,0,3,101,8,2477,41,3,2,1,9:00 PM,5,5,0
4,25,1,6,2,136,3,3093,21,8,0,1,8:00 AM,10,0,0


In [82]:
df.head()

Unnamed: 0,Age,Gender,Location,Platform,Total Time Spent,Video Category,Engagement,Number of Videos Watched,ProductivityLoss,Watch Reason,DeviceType,Watch Time,Self Control,Addiction Level,ConnectionType
0,56,1,6,1,80,6,7867,22,3,3,1,9:00 PM,5,5,0
1,46,0,5,1,228,6,5944,31,5,2,0,5:00 PM,7,3,1
2,32,0,8,0,30,8,8674,7,6,1,2,2:00 PM,8,2,0
3,60,1,0,3,101,8,2477,41,3,2,1,9:00 PM,5,5,0
4,25,1,6,2,136,3,3093,21,8,0,1,8:00 AM,10,0,0


In [83]:
# Display info about the preprocessed dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1000 non-null   object
 1   Gender                    1000 non-null   int32 
 2   Location                  1000 non-null   int32 
 3   Platform                  1000 non-null   int32 
 4   Total Time Spent          1000 non-null   object
 5   Video Category            1000 non-null   int32 
 6   Engagement                1000 non-null   object
 7   Number of Videos Watched  1000 non-null   object
 8   ProductivityLoss          1000 non-null   object
 9   Watch Reason              1000 non-null   int32 
 10  DeviceType                1000 non-null   int32 
 11  Watch Time                1000 non-null   object
 12  Self Control              1000 non-null   object
 13  Addiction Level           1000 non-null   object
 14  ConnectionType           

In [84]:
# Show the first few rows of the preprocessed data
print("\
First few rows of preprocessed data:")
df.head()

First few rows of preprocessed data:


Unnamed: 0,Age,Gender,Location,Platform,Total Time Spent,Video Category,Engagement,Number of Videos Watched,ProductivityLoss,Watch Reason,DeviceType,Watch Time,Self Control,Addiction Level,ConnectionType
0,56,1,6,1,80,6,7867,22,3,3,1,9:00 PM,5,5,0
1,46,0,5,1,228,6,5944,31,5,2,0,5:00 PM,7,3,1
2,32,0,8,0,30,8,8674,7,6,1,2,2:00 PM,8,2,0
3,60,1,0,3,101,8,2477,41,3,2,1,9:00 PM,5,5,0
4,25,1,6,2,136,3,3093,21,8,0,1,8:00 AM,10,0,0
