# Preprocessing
Link to data set: https://www.kaggle.com/datasets/muhammadalirazazaidi/screen-time-data-productivity-and-attention-span

In [29]:
import sklearn
print(sklearn.__version__)

1.6.0


In [30]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [31]:
# Load the data

In [32]:
data = pd.read_csv('screen_time.csv')
data = data.drop(columns=['Unnamed: 0'])

In [33]:
data.shape

(200, 15)

In [34]:
data.head()

Unnamed: 0,Age Group,Gender,Education Level,Occupation,Average Screen Time,Device,Screen Activity,App Category,Screen Time Period,Environment,Productivity,Attention Span,Work Strategy,Notification Handling,Usage of Productivity Apps
0,18–24,Male,Undergraduate,Student,More than 10,Smartphone,"Entertainment (gaming, streaming, social media...","Social Media (e.g., Facebook, Instagram, Linke...",Evening (6 PM–10 PM),Quite workplace,Moderately productive,10–30 minutes,Take regular breaks,Check them briefly and resume my work,"Yes, but i did not find them of any help"
1,18–24,Male,Undergraduate,Professional,8-10,Smartphone,"Entertainment (gaming, streaming, social media...","Streaming (e.g., YouTube, Netflix)",Late night (10 PM–6 AM),Quite workplace,Moderately productive,More than 1 hour,"None, i prefer to work without any strategies",Ignore them until my task is completed,"No, i do not use them"
2,45 and above,Female,Graduate,Professional,4–6,Smartphone,Academic/Work-related,"Social Media (e.g., Facebook, Instagram, Linke...",Afternoon (12 PM–6 PM),I can work in any environment,Moderately productive,10–30 minutes,Take regular breaks,Check them briefly and resume my work,"No, i do not use them"
3,25–34,Male,Undergraduate,Professional,8-10,Laptop/PC,Academic/Work-related,"Social Media (e.g., Facebook, Instagram, Linke...",Afternoon (12 PM–6 PM),Quite workplace,"Extremely productive, i efficiently complete m...",10–30 minutes,"None, i prefer to work without any strategies",Ignore them until my task is completed,"No, i do not use them"
4,45 and above,Male,Graduate,Professional,8-10,Laptop/PC,Academic/Work-related,"Productivity (e.g., Microsoft Office, Notion)",Afternoon (12 PM–6 PM),I can work in any environment,"Extremely productive, i efficiently complete m...",30–60 minutes,"None, i prefer to work without any strategies",Check them briefly and resume my work,"No, i do not use them"


In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age Group                   200 non-null    object
 1   Gender                      200 non-null    object
 2   Education Level             200 non-null    object
 3   Occupation                  200 non-null    object
 4   Average Screen Time         200 non-null    object
 5   Device                      200 non-null    object
 6   Screen Activity             200 non-null    object
 7   App Category                200 non-null    object
 8   Screen Time Period          200 non-null    object
 9   Environment                 198 non-null    object
 10  Productivity                200 non-null    object
 11  Attention Span              200 non-null    object
 12  Work Strategy               197 non-null    object
 13  Notification Handling       199 non-null    object

In [36]:
data.describe()

Unnamed: 0,Age Group,Gender,Education Level,Occupation,Average Screen Time,Device,Screen Activity,App Category,Screen Time Period,Environment,Productivity,Attention Span,Work Strategy,Notification Handling,Usage of Productivity Apps
count,200,200,200,200,200,200,200,200,200,198,200,200,197,199,200
unique,5,2,3,2,6,4,2,5,4,4,3,4,4,4,3
top,18–24,Male,Undergraduate,Student,6–8,Smartphone,"Entertainment (gaming, streaming, social media...","Social Media (e.g., Facebook, Instagram, Linke...",Evening (6 PM–10 PM),Quite workplace,Moderately productive,10–30 minutes,"None, i prefer to work without any strategies",Check them briefly and resume my work,"No, i do not use them"
freq,126,132,106,124,51,126,112,100,95,99,107,64,71,72,123


#### Step 1: Check for missing values

In [37]:
data.isnull().sum()

Age Group                     0
Gender                        0
Education Level               0
Occupation                    0
Average Screen Time           0
Device                        0
Screen Activity               0
App Category                  0
Screen Time Period            0
Environment                   2
Productivity                  0
Attention Span                0
Work Strategy                 3
Notification Handling         1
Usage of Productivity Apps    0
dtype: int64

In [38]:
list(data.columns[1:])

['Gender',
 'Education Level',
 'Occupation',
 'Average Screen Time',
 'Device',
 'Screen Activity',
 'App Category',
 'Screen Time Period',
 'Environment',
 'Productivity',
 'Attention Span',
 'Work Strategy',
 'Notification Handling',
 'Usage of Productivity Apps']

In [39]:
for col in (list(data.columns[1:])):
    data[col].fillna(data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [40]:
data.isnull().sum()

Age Group                     0
Gender                        0
Education Level               0
Occupation                    0
Average Screen Time           0
Device                        0
Screen Activity               0
App Category                  0
Screen Time Period            0
Environment                   0
Productivity                  0
Attention Span                0
Work Strategy                 0
Notification Handling         0
Usage of Productivity Apps    0
dtype: int64

#### Step 2: Preparing for Encoding the categorical variables

In [41]:
data.head()

Unnamed: 0,Age Group,Gender,Education Level,Occupation,Average Screen Time,Device,Screen Activity,App Category,Screen Time Period,Environment,Productivity,Attention Span,Work Strategy,Notification Handling,Usage of Productivity Apps
0,18–24,Male,Undergraduate,Student,More than 10,Smartphone,"Entertainment (gaming, streaming, social media...","Social Media (e.g., Facebook, Instagram, Linke...",Evening (6 PM–10 PM),Quite workplace,Moderately productive,10–30 minutes,Take regular breaks,Check them briefly and resume my work,"Yes, but i did not find them of any help"
1,18–24,Male,Undergraduate,Professional,8-10,Smartphone,"Entertainment (gaming, streaming, social media...","Streaming (e.g., YouTube, Netflix)",Late night (10 PM–6 AM),Quite workplace,Moderately productive,More than 1 hour,"None, i prefer to work without any strategies",Ignore them until my task is completed,"No, i do not use them"
2,45 and above,Female,Graduate,Professional,4–6,Smartphone,Academic/Work-related,"Social Media (e.g., Facebook, Instagram, Linke...",Afternoon (12 PM–6 PM),I can work in any environment,Moderately productive,10–30 minutes,Take regular breaks,Check them briefly and resume my work,"No, i do not use them"
3,25–34,Male,Undergraduate,Professional,8-10,Laptop/PC,Academic/Work-related,"Social Media (e.g., Facebook, Instagram, Linke...",Afternoon (12 PM–6 PM),Quite workplace,"Extremely productive, i efficiently complete m...",10–30 minutes,"None, i prefer to work without any strategies",Ignore them until my task is completed,"No, i do not use them"
4,45 and above,Male,Graduate,Professional,8-10,Laptop/PC,Academic/Work-related,"Productivity (e.g., Microsoft Office, Notion)",Afternoon (12 PM–6 PM),I can work in any environment,"Extremely productive, i efficiently complete m...",30–60 minutes,"None, i prefer to work without any strategies",Check them briefly and resume my work,"No, i do not use them"


In [42]:
one_hot_encoder = OneHotEncoder(sparse_output=False) 

In [43]:
categorial_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(len(categorial_cols))

15


In [44]:
ordinal_cols = ["Age Group", "Education Level", "Average Screen Time", "Productivity", "Attention Span"]
ordinal_cols_index = [data.columns.get_loc(col) for col in ordinal_cols]
print(ordinal_cols_index)

[0, 2, 4, 10, 11]


In [45]:
nominal_columns = [
    "Gender", "Occupation", "Device", "Screen Activity", "App Category",
    "Screen Time Period", "Environment", "Work Strategy",
    "Notification Handling", "Usage of Productivity Apps"]
nominal_columns_index = [data.columns.get_loc(col) for col in nominal_columns]

In [46]:
ordinal_categories = [['Below 18','18–24','25–34','35–44','45 and above'],
['High school or below','Undergraduate', 'Graduate'],
['Less than 2','2–4' ,'4–6' ,'6–8' ,'More than 10' ,'8-10'],
['Unproductive, i might not have completed the task and got carried away','Moderately productive','Extremely productive, i efficiently complete my tasks'],
['Less than 10 minutes','10–30 minutes' ,'30–60 minutes','More than 1 hour']]
ordinal_columns = ["Age Group", "Education Level", "Average Screen Time", "Productivity", "Attention Span"]

#### Step 3:  Apply Label Encoding to ordinal columns

In [47]:
for i in ordinal_cols:
    print(data[i].unique())

['18–24' '45 and above' '25–34' 'Below 18' '35–44']
['Undergraduate' 'Graduate' 'High school or below']
['More than 10' '8-10' '4–6' '6–8' '2–4' 'Less than 2']
['Moderately productive'
 'Extremely productive, i efficiently complete my tasks'
 'Unproductive, i might not have completed the task and got carried away']
['10–30 minutes' 'More than 1 hour' '30–60 minutes' 'Less than 10 minutes']


In [48]:
encoder = OrdinalEncoder(categories=ordinal_categories)
data[ordinal_columns] = encoder.fit_transform(data[ordinal_columns])

data[ordinal_columns] = data[ordinal_columns].astype(int)

In [49]:
data.head()

Unnamed: 0,Age Group,Gender,Education Level,Occupation,Average Screen Time,Device,Screen Activity,App Category,Screen Time Period,Environment,Productivity,Attention Span,Work Strategy,Notification Handling,Usage of Productivity Apps
0,1,Male,1,Student,4,Smartphone,"Entertainment (gaming, streaming, social media...","Social Media (e.g., Facebook, Instagram, Linke...",Evening (6 PM–10 PM),Quite workplace,1,1,Take regular breaks,Check them briefly and resume my work,"Yes, but i did not find them of any help"
1,1,Male,1,Professional,5,Smartphone,"Entertainment (gaming, streaming, social media...","Streaming (e.g., YouTube, Netflix)",Late night (10 PM–6 AM),Quite workplace,1,3,"None, i prefer to work without any strategies",Ignore them until my task is completed,"No, i do not use them"
2,4,Female,2,Professional,2,Smartphone,Academic/Work-related,"Social Media (e.g., Facebook, Instagram, Linke...",Afternoon (12 PM–6 PM),I can work in any environment,1,1,Take regular breaks,Check them briefly and resume my work,"No, i do not use them"
3,2,Male,1,Professional,5,Laptop/PC,Academic/Work-related,"Social Media (e.g., Facebook, Instagram, Linke...",Afternoon (12 PM–6 PM),Quite workplace,2,1,"None, i prefer to work without any strategies",Ignore them until my task is completed,"No, i do not use them"
4,4,Male,2,Professional,5,Laptop/PC,Academic/Work-related,"Productivity (e.g., Microsoft Office, Notion)",Afternoon (12 PM–6 PM),I can work in any environment,2,2,"None, i prefer to work without any strategies",Check them briefly and resume my work,"No, i do not use them"


#### Step 4: Apply one-hot encoding to nominal columns

In [50]:
column_transformer = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(sparse_output=False), nominal_columns)  # Apply OneHotEncoder to categorical columns
    ],remainder='passthrough')

df = column_transformer.fit_transform(data)
encoded_col_names = column_transformer.get_feature_names_out()
df = pd.DataFrame(df, columns=encoded_col_names, index=data.index)

In [51]:
df.head()

Unnamed: 0,onehot__Gender_Female,onehot__Gender_Male,onehot__Occupation_Professional,onehot__Occupation_Student,onehot__Device_Laptop/PC,onehot__Device_Smartphone,onehot__Device_Tablet,onehot__Device_Television,onehot__Screen Activity_Academic/Work-related,"onehot__Screen Activity_Entertainment (gaming, streaming, social media, etc.)",...,onehot__Notification Handling_Spend time interacting with the notifications,onehot__Notification Handling_Turn off notifications altogether,"onehot__Usage of Productivity Apps_No, i do not use them","onehot__Usage of Productivity Apps_Yes, but i did not find them of any help","onehot__Usage of Productivity Apps_Yes, they are extremely helpful",remainder__Age Group,remainder__Education Level,remainder__Average Screen Time,remainder__Productivity,remainder__Attention Span
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0
1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,3.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,4.0,2.0,2.0,1.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,2.0,1.0,5.0,2.0,1.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,4.0,2.0,5.0,2.0,2.0


No Normalization is required as no numerical columns are present in the dataset.

#### Step 5: Split the data into training and testing sets

The target variable is "Attention Span"

In [52]:
df.columns

Index(['onehot__Gender_Female', 'onehot__Gender_Male',
       'onehot__Occupation_Professional', 'onehot__Occupation_Student',
       'onehot__Device_Laptop/PC', 'onehot__Device_Smartphone',
       'onehot__Device_Tablet', 'onehot__Device_Television',
       'onehot__Screen Activity_Academic/Work-related',
       'onehot__Screen Activity_Entertainment (gaming, streaming, social media, etc.)',
       'onehot__App Category_Gaming',
       'onehot__App Category_Messaging (e.g., WhatsApp, Messenger)',
       'onehot__App Category_Productivity (e.g., Microsoft Office, Notion)',
       'onehot__App Category_Social Media (e.g., Facebook, Instagram, LinkedIn, Twitter)',
       'onehot__App Category_Streaming (e.g., YouTube, Netflix)',
       'onehot__Screen Time Period_Afternoon (12 PM–6 PM)',
       'onehot__Screen Time Period_Evening (6 PM–10 PM)',
       'onehot__Screen Time Period_Late night (10 PM–6 AM)',
       'onehot__Screen Time Period_Morning (6 AM–12 PM)',
       'onehot__Environmen

In [53]:
X = df.drop(columns=['remainder__Attention Span'])
y = df['remainder__Attention Span']

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
print("Training Set:", X_train.shape, y_train.shape)
print("Testing Set:", X_test.shape, y_test.shape)

Training Set: (160, 38) (160,)
Testing Set: (40, 38) (40,)
