In [1]:
import pandas as pd
import pandasql as ps
import numpy as np

## Load Dataframe

In [2]:
# Read the source CSV into a dataframe

raw_csv_path = "one_piece_s2.csv"
one_piece_yt_df = pd.read_csv(raw_csv_path)

In [3]:
# Preview the first five rows to sanity-check the load
one_piece_yt_df.head(5)

Unnamed: 0,Author,Comment,Likes,Timestamp,ReplyCount
0,@EN-Fitz,Chopperâ€™s hiding skill has obviously been impr...,18073,2026-01-12T16:11:16Z,79
1,@invisiblefly2454,It's such a miracle that the show is allowed t...,16158,2026-01-12T19:58:29Z,150
2,@Parzival.UltimateGamer,We better have that one crocus gag with the dr...,14293,2026-01-12T15:09:10Z,131
3,@NhojLhiac27,Fully embracing and not shying away from the r...,8388,2026-01-12T15:27:06Z,42
4,@jacenwade,"They actually adapted the unluckies, lol.\n\nG...",7077,2026-01-12T15:24:33Z,57


## Data Cleaning

### I. Checking Data

In [4]:
# Look for fully duplicated rows; keep=False flags every copy, not only the later ones

is_duplicate = one_piece_yt_df.duplicated(keep=False)
duplicated_rows = one_piece_yt_df.loc[is_duplicate]

print(duplicated_rows, len(one_piece_yt_df))

                  Author   Comment  Likes             Timestamp  ReplyCount
6595  @saleemmanneer1245  YeeeeahðŸ˜‚      0  2026-01-12T15:05:44Z           0
6596  @saleemmanneer1245  YeeeeahðŸ˜‚      0  2026-01-12T15:05:44Z           0 6703


### II. Data Cleaning

In [5]:
# The two rows found above are copies of each other: keep the first, drop the repeat

one_piece_yt_df = one_piece_yt_df.drop_duplicates(keep='first')

In [6]:
# Row count after de-duplication
print(one_piece_yt_df.shape[0])

6702


In [7]:
# Before cleaning the data, check every column for empty values

def missing_values(df):
    """Print one line per column of ``df`` reporting its number of null entries.

    The number in each printed line is the count of missing (null) values in
    that column, not the count of values present. Returns None.
    """
    for col_name in df.columns:
        n_missing = df[col_name].isna().sum()
        print(f"The column, {col_name}, has {n_missing} values.")

In [8]:
# Report the per-column null counts before any rows are removed for missing data
missing_values(one_piece_yt_df)

The column, Author, has 0 values.
The column, Comment, has 4 values.
The column, Likes, has 0 values.
The column, Timestamp, has 0 values.
The column, ReplyCount, has 0 values.


In [9]:
# Four rows have no comment text; pull them out for a closer look

comment_is_missing = one_piece_yt_df['Comment'].isna()
empty_rating_df_check = one_piece_yt_df.loc[comment_is_missing]

empty_rating_df_check

Unnamed: 0,Author,Comment,Likes,Timestamp,ReplyCount
997,@DrunkFist_,,1,2026-01-12T16:28:57Z,0
1095,@Esprit_Shonen,,1,2026-01-12T18:08:07Z,0
5279,@VinayakPadha,,0,2026-01-12T16:45:47Z,0
5306,@VinayakPadha,,0,2026-01-12T16:46:21Z,0


In [10]:
# Sentiment analysis needs the comment text, so rows with an empty Comment are deleted.
# The check is limited to 'Comment' so a row missing only some other field is not
# thrown away by accident (a bare dropna() removes rows with a null in ANY column).
# Should have 6698 rows after deletion

one_piece_yt_df = one_piece_yt_df.dropna(subset=['Comment'])

In [11]:
# Confirm the cleanup: no column should report nulls now, and 6698 rows should remain
missing_values(one_piece_yt_df)

print(len(one_piece_yt_df))

The column, Author, has 0 values.
The column, Comment, has 0 values.
The column, Likes, has 0 values.
The column, Timestamp, has 0 values.
The column, ReplyCount, has 0 values.
6698


### III. Change Timestamps to Proper Format

In [12]:
# Standardize the timestamps by reducing them to a yyyy-mm-dd date string

import time

parsed_timestamps = pd.to_datetime(one_piece_yt_df['Timestamp'])
one_piece_yt_df['review_dates'] = parsed_timestamps.dt.strftime('%Y-%m-%d')

In [13]:
# List the distinct date strings to confirm they are all in yyyy-mm-dd form
one_piece_yt_df['review_dates'].unique()

array(['2026-01-12', '2026-01-14', '2026-01-13', '2026-01-15',
       '2026-01-16', '2026-01-18', '2026-01-17', '2026-01-19',
       '2026-01-20'], dtype=object)

### IV. VADER for Checking Sentiment

In [14]:
# Score every comment with VADER and keep the compound polarity value

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sentiment_vader = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of VADER's polarity scores for one comment."""
    return sentiment_vader.polarity_scores(text)['compound']


one_piece_yt_df['vader_score_review'] = one_piece_yt_df['Comment'].apply(_compound_score)

In [15]:
def sentiment_category_vader(sentiment, threshold=0.05):
    """Map a compound sentiment score to a "Positive"/"Negative"/"Neutral" label.

    Parameters
    ----------
    sentiment : float
        Compound polarity score to classify.
    threshold : float, default 0.05
        Non-negative cut-off. Scores at or above ``threshold`` are "Positive";
        scores at or below ``-threshold`` are "Negative".

    Returns
    -------
    str
        "Positive", "Negative" or "Neutral". Any score strictly between the two
        cut-offs is "Neutral"; so is NaN, because it fails both comparisons.
    """
    if sentiment >= threshold:
        return "Positive"
    if sentiment <= -threshold:
        return "Negative"
    return "Neutral"
        
# Label every compound score as Positive / Negative / Neutral
compound_scores = one_piece_yt_df['vader_score_review']
one_piece_yt_df['vader_sentiment_review'] = compound_scores.map(sentiment_category_vader)

In [16]:
# Preview the first four rows with the new date, score and sentiment columns
one_piece_yt_df.head(4)

Unnamed: 0,Author,Comment,Likes,Timestamp,ReplyCount,review_dates,vader_score_review,vader_sentiment_review
0,@EN-Fitz,Chopperâ€™s hiding skill has obviously been impr...,18073,2026-01-12T16:11:16Z,79,2026-01-12,0.2263,Positive
1,@invisiblefly2454,It's such a miracle that the show is allowed t...,16158,2026-01-12T19:58:29Z,150,2026-01-12,0.6908,Positive
2,@Parzival.UltimateGamer,We better have that one crocus gag with the dr...,14293,2026-01-12T15:09:10Z,131,2026-01-12,0.128,Positive
3,@NhojLhiac27,Fully embracing and not shying away from the r...,8388,2026-01-12T15:27:06Z,42,2026-01-12,0.1174,Positive


### V. Last Step for Data Cleaning

In [17]:
# Before organizing the data, tag each row with its platform and rename
# 'Comment' to 'reviews' (the other column names are left as they are)

one_piece_yt_df = (
    one_piece_yt_df
    .assign(source='YouTube')
    .rename(columns={'Comment': 'reviews'})
)

In [18]:
# Preview the first four rows to confirm the 'reviews' rename and the new 'source' column
one_piece_yt_df.head(4)

Unnamed: 0,Author,reviews,Likes,Timestamp,ReplyCount,review_dates,vader_score_review,vader_sentiment_review,source
0,@EN-Fitz,Chopperâ€™s hiding skill has obviously been impr...,18073,2026-01-12T16:11:16Z,79,2026-01-12,0.2263,Positive,YouTube
1,@invisiblefly2454,It's such a miracle that the show is allowed t...,16158,2026-01-12T19:58:29Z,150,2026-01-12,0.6908,Positive,YouTube
2,@Parzival.UltimateGamer,We better have that one crocus gag with the dr...,14293,2026-01-12T15:09:10Z,131,2026-01-12,0.128,Positive,YouTube
3,@NhojLhiac27,Fully embracing and not shying away from the r...,8388,2026-01-12T15:27:06Z,42,2026-01-12,0.1174,Positive,YouTube


In [19]:
# We have all the necessary data now, so select just the final columns using SQL

final_columns_query = "select [review_dates], [reviews], [Likes], [vader_score_review], [vader_sentiment_review], [source] from one_piece_yt_df"
one_piece_yt_df_final = ps.sqldf(final_columns_query)

In [20]:
# Preview the first rows of the final, column-reduced dataframe
one_piece_yt_df_final.head()

Unnamed: 0,review_dates,reviews,Likes,vader_score_review,vader_sentiment_review,source
0,2026-01-12,Chopperâ€™s hiding skill has obviously been impr...,18073,0.2263,Positive,YouTube
1,2026-01-12,It's such a miracle that the show is allowed t...,16158,0.6908,Positive,YouTube
2,2026-01-12,We better have that one crocus gag with the dr...,14293,0.128,Positive,YouTube
3,2026-01-12,Fully embracing and not shying away from the r...,8388,0.1174,Positive,YouTube
4,2026-01-12,"They actually adapted the unluckies, lol.\n\nG...",7077,-0.3818,Negative,YouTube


## Finalizing Data Cleaning Process & End

In [21]:
# Write the final dataframe to CSV; index=False leaves the row index out of the file
one_piece_yt_df_final.to_csv('one_piece_yt_df_cleaned.csv', index=False)