In [1]:
import pandas as pd
import numpy as np
import time
import re

In [2]:
# Load the raw scraped reviews from the CSV file into a DataFrame.
raw_reviews_csv = 'ballerina_raw_20251220.csv'
ballerina_raw_df = pd.read_csv(raw_reviews_csv)

In [3]:
# Preview the first four rows of the loaded frame.
ballerina_raw_df.head(4)

Unnamed: 0.1,Unnamed: 0,User_IDs,Timestamps,Reviews
0,0,Betty K.,16h,I loved it lots of action
1,1,Matthew W,6d,An interesting new look at the dark underworld...
2,2,Dan O.,Dec 10,Fun...just fun. Well put together certainly wo...
3,3,Adrian H.,Dec 10,8.5/10 One of the best movies this year i have...


In [7]:
def time_conversion(text, now=None):
    """Convert a scraped review timestamp label into an absolute timestamp.

    Recognised label forms:

    * relative hours, e.g. ``"16h"``  -> ``now`` minus 16 hours
    * relative days, e.g. ``"6d"``    -> ``now`` minus 6 days
    * month and day without a year, e.g. ``"Dec 10"`` -> that calendar date
      in ``now``'s year, or in the previous year when the current-year date
      would fall after ``now``

    Parameters
    ----------
    text : str or missing value
        The raw timestamp label. Missing values and non-string values
        yield ``pd.NaT``.
    now : timestamp-like, optional
        Reference instant that relative labels are measured from. Defaults
        to ``pd.Timestamp.now()`` at call time; pass the scrape time to get
        results that do not depend on when the notebook is run.

    Returns
    -------
    pd.Timestamp or pd.NaT
        ``pd.NaT`` when the label is missing, is not a string, or matches
        none of the recognised forms.
    """
    # NaN / None / any other non-text value cannot be parsed by the regexes.
    if not isinstance(text, str):
        return pd.NaT

    # Resolve the reference instant once so every branch uses the same value.
    now = pd.Timestamp.now() if now is None else pd.Timestamp(now)
    text = text.strip()

    hr_match = re.search(r'(\d+)\s*h', text)
    if hr_match:
        hours = int(hr_match.group(1))
        return now - pd.to_timedelta(hours, unit='h')

    day_match = re.search(r'(\d+)\s*d', text)
    if day_match:
        days = int(day_match.group(1))
        return now - pd.to_timedelta(days, unit='d')

    partial_date = re.match(r'^([A-Za-z]{3})\s+(\d{1,2})$', text)
    if partial_date:
        month, day = partial_date.groups()
        parsed = pd.to_datetime(
            f"{month} {day} {now.year}", format="%b %d %Y", errors="coerce"
        )
        # A year-less date cannot lie in the future: e.g. "Dec 10" seen in
        # January belongs to the previous year.
        if not pd.isna(parsed) and parsed.date() > now.date():
            parsed = pd.to_datetime(
                f"{month} {day} {now.year - 1}", format="%b %d %Y", errors="coerce"
            )
        return parsed

    # Unrecognised label: return an explicit missing timestamp, not None.
    return pd.NaT
    

In [8]:
# Resolve every raw timestamp label to a datetime, then store it as a
# 'YYYY-MM-DD' string in the 'Dates' column.
parsed_dates = ballerina_raw_df['Timestamps'].apply(time_conversion)
ballerina_raw_df['Dates'] = parsed_dates.dt.strftime('%Y-%m-%d')

## Sentiment Analysis Using VADER

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [14]:
# A single analyzer instance is created once and reused for every review.
sentiment_vader = SentimentIntensityAnalyzer()


def _vader_compound(review_text):
    """Return the 'compound' entry of the analyzer's polarity scores for one review."""
    return sentiment_vader.polarity_scores(review_text)['compound']


ballerina_raw_df['Vader_Score'] = ballerina_raw_df['Reviews'].apply(_vader_compound)

In [15]:
def sentiment_category_vader(sentiment):
    """Map a numeric sentiment score to a label.

    Returns "Positive" for scores >= 0.05, "Negative" for scores <= -0.05,
    and "Neutral" for everything else (including NaN, for which both
    comparisons are false).
    """
    if sentiment <= -0.05:
        return "Negative"
    if sentiment >= 0.05:
        return "Positive"
    return "Neutral"
        
# Label every review from its numeric score.
sentiment_labels = ballerina_raw_df['Vader_Score'].apply(sentiment_category_vader)
ballerina_raw_df['Vader_Sentiment'] = sentiment_labels

In [16]:
# Preview the first three rows, now including the derived date and sentiment columns.
ballerina_raw_df.head(3)

Unnamed: 0.1,Unnamed: 0,User_IDs,Timestamps,Reviews,Dates,Vader_Score,Vader_Sentiment
0,0,Betty K.,16h,I loved it lots of action,2025-12-19,0.5994,Positive
1,1,Matthew W,6d,An interesting new look at the dark underworld...,2025-12-14,0.5423,Positive
2,2,Dan O.,Dec 10,Fun...just fun. Well put together certainly wo...,2025-12-10,0.8271,Positive


## Final Check: Missing Values Before Export

In [17]:
def missing_values(df):
    """Print, for each column of ``df``, its name, row count and null count.

    Output goes to stdout only; the function returns None.
    """
    for column_name in df.columns:
        column = df[column_name]
        print("Column Name:", column_name)
        print("Total Rows:", len(column))
        print("Total Missing Values:", column.isnull().sum())

In [18]:
# Print the per-column row and missing-value counts for the processed frame.
missing_values(ballerina_raw_df)

Column Name: Unnamed: 0
Total Rows: 970
Total Missing Values: 0
Column Name: User_IDs
Total Rows: 970
Total Missing Values: 33
Column Name: Timestamps
Total Rows: 970
Total Missing Values: 0
Column Name: Reviews
Total Rows: 970
Total Missing Values: 0
Column Name: Dates
Total Rows: 970
Total Missing Values: 0
Column Name: Vader_Score
Total Rows: 970
Total Missing Values: 0
Column Name: Vader_Sentiment
Total Rows: 970
Total Missing Values: 0


In [19]:
# Save the scored reviews. index=False keeps the DataFrame index out of the
# file; writing it is what produces the stray "Unnamed: 0" columns when the
# CSV is read back in (as seen in the raw input loaded above).
ballerina_raw_df.to_csv('ballerina_final_df_20251220.csv', index=False)