In [18]:
import pandas as pd
import numpy as np
from scipy import stats
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# End-to-end cleaning pass over the raw CSV: load, drop columns, drop missing
# rows, cast count/score columns to int, derive ratio features, relabel the
# 'Unknown' activity, de-duplicate, filter z-score outliers, normalise the
# Profession labels, print summary statistics and save the result to
# 'cleaned_time_wasters_data.csv'. The frame's shape is printed after each step.

# 1. Load the dataset
data = pd.read_csv('Time-Wasters on Social Media.csv')
print(f"Number of rows and columns after loading: {data.shape}")

# 2. Drop unnecessary columns
columns_to_drop = [
    'UserID', 'Video ID', 'Video Category',
    'Watch Reason', 'DeviceType', 'OS', 'Watch Time'
]
data.drop(columns=columns_to_drop, inplace=True)
print(f"Number of rows and columns after dropping columns: {data.shape}")

# 3. Drop rows with missing values
data.dropna(inplace=True)
print(f"Number of rows and columns after dropping missing values: {data.shape}")

# 4. Convert data types
# These columns are cast to int; the same list is reused for the outlier
# filter in step 8.
int_columns = [
    'Total Time Spent', 'Number of Sessions', 'Video Length',
    'Engagement', 'Importance Score', 'Time Spent On Video',
    'Number of Videos Watched', 'Scroll Rate',
    'ProductivityLoss', 'Satisfaction', 'Self Control', 'Addiction Level'
]
data[int_columns] = data[int_columns].astype(int)
print(f"Number of rows and columns after converting data types: {data.shape}")

# 5. Feature engineering
# Three ratio columns derived from the integer columns above.
data['Average Video Length'] = data['Time Spent On Video'] / data['Number of Videos Watched']
data['Engagement Rate'] = data['Engagement'] / data['Number of Videos Watched']
data['Time Spent Per Session'] = data['Total Time Spent'] / data['Number of Sessions']
print(f"Number of rows and columns after feature engineering: {data.shape}")

# 6. Handle 'Unknown' CurrentActivity
data['CurrentActivity'] = data['CurrentActivity'].replace('Unknown', 'Missing')

# 7. Remove duplicates
data.drop_duplicates(inplace=True)
print(f"Number of rows and columns after removing duplicates: {data.shape}")

# 8. Handle outliers (using Z-score)
# Keep only rows whose absolute z-score is below 3 in every int column.
z = np.abs(stats.zscore(data[int_columns]))
# The boolean-mask selection is copied explicitly so that the column
# assignment in step 9 writes to an independent frame; without the copy,
# pandas can emit SettingWithCopyWarning once any row has been filtered out.
data = data[(z < 3).all(axis=1)].copy()
print(f"Number of rows and columns after handling outliers: {data.shape}")

# 9. Correct misspelled labels
possible_professions = [
    'Engineer', 'Artist', 'Waiting staff',
    'Students', 'Manager', 'Driver', 'Labor/Worker',
    'Cashier', 'Teacher'
]
# The best match depends only on the label text, so each distinct label is
# fuzzy-matched once and the result is mapped back onto the column instead of
# re-running the matcher for every row.
profession_lookup = {
    label: process.extractOne(label, possible_professions, scorer=fuzz.token_sort_ratio)[0]
    for label in data['Profession'].unique()
}
data['Profession'] = data['Profession'].map(profession_lookup)
print(f"Number of rows and columns after correcting misspellings: {data.shape}")

# 10. Summary Statistics
print("Summary Statistics:\n", data.describe())

# 11. Save the cleaned dataset to a CSV file
data.to_csv('cleaned_time_wasters_data.csv', index=False)
print("Cleaned dataset saved to 'cleaned_time_wasters_data.csv'")


Number of rows and columns after loading: (1000, 31)
Number of rows and columns after dropping columns: (1000, 24)
Number of rows and columns after dropping missing values: (1000, 24)
Number of rows and columns after converting data types: (1000, 24)
Number of rows and columns after feature engineering: (1000, 27)
Number of rows and columns after removing duplicates: (1000, 27)
Number of rows and columns after handling outliers: (1000, 27)
Number of rows and columns after correcting misspellings: (1000, 27)
Summary Statistics:
                Age        Income  Total Time Spent  Number of Sessions  \
count  1000.000000   1000.000000       1000.000000         1000.000000   
mean     40.986000  59524.213000        151.406000           10.013000   
std      13.497852  23736.212925         83.952637            5.380314   
min      18.000000  20138.000000         10.000000            1.000000   
25%      29.000000  38675.250000         78.000000            6.000000   
50%      42.000000  58

In [19]:
import pandas as pd;

# Reload the cleaned CSV and count the missing entries in each column.
df = pd.read_csv("cleaned_time_wasters_data.csv")

df.isna().sum()

Age                         0
Gender                      0
Location                    0
Income                      0
Debt                        0
Owns Property               0
Profession                  0
Demographics                0
Platform                    0
Total Time Spent            0
Number of Sessions          0
Video Length                0
Engagement                  0
Importance Score            0
Time Spent On Video         0
Number of Videos Watched    0
Scroll Rate                 0
Frequency                   0
ProductivityLoss            0
Satisfaction                0
Self Control                0
Addiction Level             0
CurrentActivity             0
ConnectionType              0
Average Video Length        0
Engagement Rate             0
Time Spent Per Session      0
dtype: int64

In [10]:
import pandas as pd
import numpy as np
from scipy import stats
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [11]:
# Read the raw CSV into `data` and report its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer='Time-Wasters on Social Media.csv')
print(f"Number of rows and columns after loading: {data.shape}")

Number of rows and columns after loading: (1000, 31)


In [12]:
# Remove the columns this cleaning pass does not keep, then report the new shape.
unused_columns = [
    'UserID', 'Video ID', 'Video Category', 'Watch Reason',
    'DeviceType', 'OS', 'Watch Time',
]
data = data.drop(columns=unused_columns)
print(f"Number of rows and columns after dropping columns: {data.shape}")

Number of rows and columns after dropping columns: (1000, 24)


In [13]:
# Fill missing values in the float64/int64 columns with each column's median.
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='median')
data[numeric_cols] = num_imputer.fit_transform(data[numeric_cols])

In [14]:
# Fill missing values in the object-dtype columns with each column's most frequent value.
categorical_cols = data.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

In [15]:
# Cast the listed columns to int in a single astype call, then report the shape.
columns_to_convert = [
    'Total Time Spent', 'Number of Sessions', 'Video Length',
    'Engagement', 'Importance Score', 'Time Spent On Video',
    'Number of Videos Watched', 'Scroll Rate', 'ProductivityLoss',
    'Satisfaction', 'Self Control', 'Addiction Level',
]

data = data.astype({name: int for name in columns_to_convert})
print(f"Number of rows and columns after converting data types: {data.shape}")

Number of rows and columns after converting data types: (1000, 24)


In [16]:
# 5. Feature engineering
# Add three ratio columns: two divided by the number of videos watched and one
# divided by the number of sessions.
videos_watched = data['Number of Videos Watched']
data['Average Video Length'] = data['Time Spent On Video'].div(videos_watched)
data['Engagement Rate'] = data['Engagement'].div(videos_watched)
data['Time Spent Per Session'] = data['Total Time Spent'].div(data['Number of Sessions'])
print(f"Number of rows and columns after feature engineering: {data.shape}")

Number of rows and columns after feature engineering: (1000, 27)


In [17]:
# Relabel the 'Unknown' value of CurrentActivity as 'Missing'.
activity_relabel = {'Unknown': 'Missing'}
data['CurrentActivity'] = data['CurrentActivity'].replace(activity_relabel)
print(f"Number of rows and columns after replacing 'Unknown': {data.shape}")

Number of rows and columns after replacing 'Unknown': (1000, 27)


In [20]:
# Correct misspelled labels
# Every Profession value is replaced by its closest entry in
# `possible_professions`, scored with fuzz.token_sort_ratio.
possible_professions = [
    'Engineer', 'Artist', 'Waiting staff',
    'Students', 'Manager', 'Driver', 'Labor/Worker',
    'Cashier', 'Teacher'
]
# The best match depends only on the label text, so each distinct label is
# fuzzy-matched once and the result is mapped back onto the column instead of
# re-running the matcher for every row.
profession_lookup = {
    label: process.extractOne(label, possible_professions,
                              scorer=fuzz.token_sort_ratio)[0]
    for label in data['Profession'].unique()
}
data['Profession'] = data['Profession'].map(profession_lookup)

In [22]:
# Summary Statistics: compute the describe() table, print the heading, and
# leave the table as the cell's final expression so the notebook renders it.
summary_stats = data.describe()
print("Summary Statistics:\n")
summary_stats

Summary Statistics:



Unnamed: 0,Age,Income,Total Time Spent,Number of Sessions,Video Length,Engagement,Importance Score,Time Spent On Video,Number of Videos Watched,Scroll Rate,ProductivityLoss,Satisfaction,Self Control,Addiction Level,Average Video Length,Engagement Rate,Time Spent Per Session
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,40.986,59524.213,151.406,10.013,15.214,4997.159,5.129,14.973,25.248,49.774,5.136,4.864,7.094,2.906,1.36064,457.641507,28.0786
std,13.497852,23736.212925,83.952637,5.380314,8.224953,2910.053701,2.582834,8.200092,14.029159,29.197798,2.122265,2.122265,2.058495,2.058495,2.832226,1017.154807,39.796349
min,18.0,20138.0,10.0,1.0,1.0,15.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,0.0,0.020408,0.625,0.555556
25%,29.0,38675.25,78.0,6.0,8.0,2415.75,3.0,8.0,14.0,23.0,3.0,4.0,5.0,2.0,0.313839,91.371212,7.626462
50%,42.0,58805.0,152.0,10.0,15.0,5016.0,5.0,15.0,25.0,50.0,5.0,5.0,7.0,3.0,0.6,199.031609,15.594118
75%,52.0,79792.25,223.0,15.0,22.0,7540.25,7.0,22.0,37.0,74.0,6.0,7.0,8.0,5.0,1.146617,378.640152,27.875
max,64.0,99676.0,298.0,19.0,29.0,9982.0,9.0,29.0,49.0,99.0,9.0,9.0,10.0,7.0,29.0,9692.0,295.0


In [24]:
# Write the cleaned frame to CSV without the row index, then confirm on stdout.
data.to_csv(path_or_buf='cleaned_time_wasters_data.csv', index=False)
print("Cleaned dataset saved to 'cleaned_time_wasters_data.csv'")

Cleaned dataset saved to 'cleaned_time_wasters_data.csv'
