# Preprocessing
Link to data set: https://www.kaggle.com/datasets/muhammadalirazazaidi/screen-time-data-productivity-and-attention-span

In [29]:
# Print the installed scikit-learn version so the environment is recorded with the notebook.
# OneHotEncoder(sparse_output=...) used further down requires scikit-learn >= 1.2.
import sklearn
print(sklearn.__version__)

In [30]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [31]:
# Load the data

In [32]:
# Read the CSV and discard the 'Unnamed: 0' column in a single chained expression,
# so re-running this cell always rebuilds `data` from the file.
data = pd.read_csv('screen_time.csv').drop(columns=['Unnamed: 0'])

In [33]:
# (rows, columns) of the loaded frame.
data.shape

In [34]:
# Preview the first rows of the loaded frame.
data.head()

In [35]:
# Column dtypes and non-null counts.
data.info()

In [36]:
# Per-column summary statistics.
data.describe()

#### Step 1: Check for missing values

In [37]:
# Number of missing values in each column before imputation.
data.isnull().sum()

In [38]:
# Every column except the first; the imputation loop further down iterates over this same slice.
list(data.columns[1:])

In [39]:
# Impute missing values with each column's most frequent value (mode).
#
# The result is assigned back to `data[col]` rather than calling
# `data[col].fillna(..., inplace=True)`: that form is chained assignment on a
# temporary Series, which raises a FutureWarning on pandas 2.x and silently
# leaves `data` unchanged once Copy-on-Write is active (the default in pandas 3.0).
#
# NOTE(review): `data.columns[1:]` leaves the first column untouched — confirm
# that column never contains missing values.
for col in data.columns[1:]:
    most_frequent = data[col].mode()[0]
    data[col] = data[col].fillna(most_frequent)

In [40]:
# Re-count missing values per column after the mode imputation above.
data.isnull().sum()

#### Step 2: Prepare to encode the categorical variables

In [41]:
# Preview the frame after imputation, before any encoding is applied.
data.head()

In [42]:
# Dense-output one-hot encoder.
# NOTE(review): this instance is not referenced in the visible cells — the
# ColumnTransformer in Step 4 constructs its own OneHotEncoder. Confirm whether it is still needed.
one_hot_encoder = OneHotEncoder(sparse_output=False)

In [43]:
# Collect the names of all object/category-typed columns and report how many there are.
_categorical_frame = data.select_dtypes(include=['object', 'category'])
categorial_cols = list(_categorical_frame.columns)
print(len(categorial_cols))

In [44]:
# Columns whose categories have a natural order, plus their positions in `data`.
ordinal_cols = [
    "Age Group",
    "Education Level",
    "Average Screen Time",
    "Productivity",
    "Attention Span",
]
ordinal_cols_index = list(map(data.columns.get_loc, ordinal_cols))
print(ordinal_cols_index)

In [45]:
# Unordered categorical columns (one-hot encoded in Step 4) and their positions in `data`.
nominal_columns = [
    "Gender",
    "Occupation",
    "Device",
    "Screen Activity",
    "App Category",
    "Screen Time Period",
    "Environment",
    "Work Strategy",
    "Notification Handling",
    "Usage of Productivity Apps",
]
nominal_columns_index = list(map(data.columns.get_loc, nominal_columns))

In [46]:
# Ordered levels (lowest -> highest) for every ordinal column.
#
# Keeping each column next to its levels guarantees that `ordinal_columns` and
# `ordinal_categories` stay aligned position-by-position, which is what
# OrdinalEncoder(categories=...) requires.
#
# Fix: in "Average Screen Time", '8-10' now precedes 'More than 10'. Previously
# the two were swapped, so the highest screen-time bucket received a lower
# code than the 8-10 bucket.
#
# NOTE(review): labels must match the raw data byte-for-byte. '8-10' uses a
# plain hyphen while the other ranges use an en dash — confirm against the
# unique values printed in Step 3.
ordinal_levels = {
    "Age Group": ['Below 18', '18–24', '25–34', '35–44', '45 and above'],
    "Education Level": ['High school or below', 'Undergraduate', 'Graduate'],
    "Average Screen Time": ['Less than 2', '2–4', '4–6', '6–8', '8-10', 'More than 10'],
    "Productivity": [
        'Unproductive, i might not have completed the task and got carried away',
        'Moderately productive',
        'Extremely productive, i efficiently complete my tasks',
    ],
    "Attention Span": ['Less than 10 minutes', '10–30 minutes', '30–60 minutes', 'More than 1 hour'],
}
ordinal_columns = list(ordinal_levels)
ordinal_categories = list(ordinal_levels.values())

#### Step 3: Apply ordinal encoding to the ordinal columns

In [47]:
# Show the distinct raw labels of each ordinal column so they can be checked
# against the category lists passed to the encoder.
for column_name in ordinal_cols:
    distinct_labels = data[column_name].unique()
    print(distinct_labels)

In [48]:
# Map each ordinal label to its position in the matching `ordinal_categories` list,
# then store the codes back into `data` as integers.
encoder = OrdinalEncoder(categories=ordinal_categories)
ordinal_codes = encoder.fit_transform(data[ordinal_columns])
data[ordinal_columns] = ordinal_codes.astype(int)

In [49]:
# Preview the frame after the ordinal columns were replaced by integer codes.
data.head()

#### Step 4: Apply one-hot encoding to nominal columns

In [50]:
# One-hot encode the nominal columns; every other column is passed through unchanged.
onehot_step = ('onehot', OneHotEncoder(sparse_output=False), nominal_columns)
column_transformer = ColumnTransformer(
    transformers=[onehot_step],
    remainder='passthrough',
)

# fit_transform returns a plain array, so rebuild a DataFrame using the
# transformer's generated feature names and the original row index.
transformed_values = column_transformer.fit_transform(data)
encoded_col_names = column_transformer.get_feature_names_out()
df = pd.DataFrame(transformed_values, columns=encoded_col_names, index=data.index)

In [51]:
# Preview the fully encoded frame.
df.head()

No normalization is required because the dataset contains no numerical columns.

#### Step 5: Split the data into training and testing sets

The target variable is "Attention Span" (named `remainder__Attention Span` after the column transformer).

In [52]:
# List the encoded column names; the target is selected by its prefixed name in the next cell.
df.columns

In [53]:
# Separate the target from the features. The passthrough step of the
# ColumnTransformer prefixes untouched columns with 'remainder__'.
target_column = 'remainder__Attention Span'
y = df[target_column]
X = df.drop(columns=[target_column])

In [54]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:
# Report the feature/target shapes of each partition as a sanity check on the split.
for split_label, split_X, split_y in (
    ("Training Set:", X_train, y_train),
    ("Testing Set:", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)