***PROJECT 2 - DATA CLEANING***

***DATASET 2***

In [1]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\Lilly Daniel\OneDrive\ドキュメント\INvideos.csv'
data = pd.read_csv(file_path)

# Display the first few rows and summary statistics to check for anomalies
print("First 5 Rows of the Dataset:")
print(data.head())

print("\nSummary Statistics:")
print(data.describe(include='all'))

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

First 5 Rows of the Dataset:
      video_id trending_date  \
0  kzwfHumJyYc      17.14.11   
1  zUZ1z7FwLc8      17.14.11   
2  10L1hZ9qa58      17.14.11   
3  N1vE8iiEg64      17.14.11   
4  kJzGH0PVQHQ      17.14.11   

                                               title    channel_title  \
0  Sharry Mann: Cute Munda ( Song Teaser) | Parmi...  Lokdhun Punjabi   
1  पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...          HJ NEWS   
2  Stylish Star Allu Arjun @ ChaySam Wedding Rece...             TFPC   
3                     Eruma Saani | Tamil vs English      Eruma Saani   
4  why Samantha became EMOTIONAL @ Samantha naga ...       Filmylooks   

   category_id              publish_time  \
0            1  2017-11-12T12:20:39.000Z   
1           25  2017-11-13T05:43:56.000Z   
2           24  2017-11-12T15:48:08.000Z   
3           23  2017-11-12T07:08:48.000Z   
4           24  2017-11-13T01:14:16.000Z   

                                                tags    views  likes  \
0 

In [2]:
# Option 1: Drop rows with missing values
data_cleaned = data.dropna()

# Option 2: Fill missing values (if appropriate)
# For example, filling missing descriptions with an empty string
data['description'] = data['description'].fillna('')

# Display the number of missing values after handling
print("\nMissing Values After Handling:")
print(data_cleaned.isnull().sum())


Missing Values After Handling:
video_id                  0
trending_date             0
title                     0
channel_title             0
category_id               0
publish_time              0
tags                      0
views                     0
likes                     0
dislikes                  0
comment_count             0
thumbnail_link            0
comments_disabled         0
ratings_disabled          0
video_error_or_removed    0
description               0
dtype: int64


In [3]:
# Check for duplicate rows
duplicate_rows = data.duplicated().sum()
print(f"\nDuplicate Rows: {duplicate_rows}")

# Remove duplicate rows
data_cleaned = data.drop_duplicates()

# Verify duplicates are removed
print(f"\nDuplicate Rows After Removal: {data_cleaned.duplicated().sum()}")


Duplicate Rows: 4263

Duplicate Rows After Removal: 0


In [4]:
# Standardize text columns (e.g., converting to lowercase)
data_cleaned.loc[:, 'title'] = data_cleaned['title'].str.lower()
data_cleaned.loc[:, 'channel_title'] = data_cleaned['channel_title'].str.lower()
data_cleaned.loc[:, 'tags'] = data_cleaned['tags'].str.lower()

# Display the first few rows of the cleaned data
print("\nCleaned Data (First 5 Rows):")
print(data_cleaned.head())


Cleaned Data (First 5 Rows):
      video_id trending_date  \
0  kzwfHumJyYc      17.14.11   
1  zUZ1z7FwLc8      17.14.11   
2  10L1hZ9qa58      17.14.11   
3  N1vE8iiEg64      17.14.11   
4  kJzGH0PVQHQ      17.14.11   

                                               title    channel_title  \
0  sharry mann: cute munda ( song teaser) | parmi...  lokdhun punjabi   
1  पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...          hj news   
2  stylish star allu arjun @ chaysam wedding rece...             tfpc   
3                     eruma saani | tamil vs english      eruma saani   
4  why samantha became emotional @ samantha naga ...       filmylooks   

   category_id              publish_time  \
0            1  2017-11-12T12:20:39.000Z   
1           25  2017-11-13T05:43:56.000Z   
2           24  2017-11-12T15:48:08.000Z   
3           23  2017-11-12T07:08:48.000Z   
4           24  2017-11-13T01:14:16.000Z   

                                                tags    views  likes  \
0

In [5]:
# Convert trending_date to datetime format
data_cleaned.loc[:, 'trending_date'] = pd.to_datetime(data_cleaned['trending_date'], format='%y.%d.%m')

# Convert publish_time to datetime format
data_cleaned.loc[:, 'publish_time'] = pd.to_datetime(data_cleaned['publish_time'])

# Display the first few rows to check date conversion
print("\nData with Converted Date Columns (First 5 Rows):")
print(data_cleaned.head())


Data with Converted Date Columns (First 5 Rows):
      video_id        trending_date  \
0  kzwfHumJyYc  2017-11-14 00:00:00   
1  zUZ1z7FwLc8  2017-11-14 00:00:00   
2  10L1hZ9qa58  2017-11-14 00:00:00   
3  N1vE8iiEg64  2017-11-14 00:00:00   
4  kJzGH0PVQHQ  2017-11-14 00:00:00   

                                               title    channel_title  \
0  sharry mann: cute munda ( song teaser) | parmi...  lokdhun punjabi   
1  पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...          hj news   
2  stylish star allu arjun @ chaysam wedding rece...             tfpc   
3                     eruma saani | tamil vs english      eruma saani   
4  why samantha became emotional @ samantha naga ...       filmylooks   

   category_id               publish_time  \
0            1  2017-11-12 12:20:39+00:00   
1           25  2017-11-13 05:43:56+00:00   
2           24  2017-11-12 15:48:08+00:00   
3           23  2017-11-12 07:08:48+00:00   
4           24  2017-11-13 01:14:16+00:00   

     

In [6]:
import numpy as np

# Define a function to detect outliers using the IQR method
def detect_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return column[(column < lower_bound) | (column > upper_bound)]

# Detect outliers in numerical columns
numerical_columns = ['views', 'likes', 'dislikes', 'comment_count']
for column in numerical_columns:
    outliers = detect_outliers(data_cleaned[column])
    print(f'Outliers in {column}: {outliers.count()}')

# Optionally, you can remove outliers
# For example, removing outliers in the views column
data_cleaned = data_cleaned[~data_cleaned['views'].isin(detect_outliers(data_cleaned['views']))]

Outliers in views: 3784
Outliers in likes: 5033
Outliers in dislikes: 3958
Outliers in comment_count: 4277


In [8]:
# Save the cleaned dataset to a new CSV file
cleaned_file_path = r'C:\Users\Lilly Daniel\OneDrive\ドキュメント\INvideos.csv'
data_cleaned.to_csv(cleaned_file_path, index=False)

print(f"\nCleaned dataset saved to: {cleaned_file_path}")


Cleaned dataset saved to: C:\Users\Lilly Daniel\OneDrive\ドキュメント\INvideos.csv


***THE END***