In [1]:
import pandas as pd
from sqlalchemy import create_engine,text
import plotly.express as px
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

In [2]:
db_string = 'sqlite:///../db/youtube.db'
# Create a engine
engine = create_engine(db_string)
# Create connection
conn = engine.connect()

In [3]:
# Test database with simple query
query = text('SELECT * FROM video')
video_df = pd.read_sql_query(query, conn)

In [4]:
video_df.head()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
0,UC8butISFwT-Wl7EV0hUK0BQ,YdWkUdMxMvM,Career Change to Code - The Complete Guide,This course is for those considering transitio...,,2024-02-07 15:49:07.000000,3252.0,374.0,0,27.0,12191,hd,False,27
1,UC8butISFwT-Wl7EV0hUK0BQ,5rNk7m_zlAg,Spring Boot & Spring Data JPA â€“Â Complete Course,Learn how to use Spring Boot and Spring Data J...,,2024-02-06 15:25:40.000000,24118.0,1434.0,0,223.0,45737,hd,False,27
2,UC8butISFwT-Wl7EV0hUK0BQ,5ZdHfJVAY-s,Build 25 React Projects â€“ Tutorial,Master React by building 25 different projects...,,2024-02-05 15:30:28.000000,50388.0,2988.0,0,103.0,34614,hd,False,27
3,UC8butISFwT-Wl7EV0hUK0BQ,OwjKN9_NqPI,Oh My Zsh Creator Robby Russell â€“ freeCodeCamp...,"In this week's episode of the podcast, freeCod...",,2024-02-02 15:26:29.000000,14435.0,284.0,0,22.0,7673,hd,False,27
4,UC8butISFwT-Wl7EV0hUK0BQ,e2nkq3h1P68,Learn Accessibility - Full a11y Tutorial,Learn how to write accessible HTML by solving ...,,2024-02-01 15:38:37.000000,23669.0,968.0,0,17.0,5586,hd,False,27


In [30]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HarryAllum\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HarryAllum\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HarryAllum\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Function to preprocess text
def preprocess_text(text):
    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Join tokens back into a string
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

In [14]:
# Apply preprocessing to the 'video_title' column
video_df['preprocessed_title'] = video_df['video_title'].apply(preprocess_text)

In [15]:
# Display the preprocessed titles
print(video_df[['video_title', 'preprocessed_title']].head())

                                         video_title  \
0         Career Change to Code - The Complete Guide   
1    Spring Boot & Spring Data JPA â€“Â Complete Course   
2                 Build 25 React Projects â€“ Tutorial   
3  Oh My Zsh Creator Robby Russell â€“ freeCodeCamp...   
4           Learn Accessibility - Full a11y Tutorial   

                                  preprocessed_title  
0                  career change code complete guide  
1      spring boot spring data jpa â€“ complete course  
2                  build 25 react project â€“ tutorial  
3  oh zsh creator robby russell â€“ freecodecamporg...  
4             learn accessibility full a11y tutorial  


In [5]:
# Define lists of keywords, exaggerated claims, and clickbait phrases
keywords = ['shocking', 'mind-blowing', 'mind blowing' 'unbelievable', 'epic', 'amazing', "you won't believe", "life changing"]
exaggerated_claims = ['the best', 'the most', 'the ultimate']
clickbait_phrases = ["you won't believe", "this will change your life"]

In [6]:
def count_emojis(title):
    # Define the regex pattern for emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    
    # Find all emojis in the title
    emojis = emoji_pattern.findall(title)
    
    # Return the count of emojis
    return len(emojis)

### Check capital letter frequency before converting title to lower case for other features

In [7]:
def extract_features(title):

    features={}
    
    # Title Length
    features['title_length'] = len(title.split())
    
    # Presence of Numbers
    numbers = sum(c.isdigit() for c in title)
    features['has_large_numbers'] = numbers > 100
    
    # Presence of Question Marks
    features['has_question_mark'] = '?' in title
    
    # Presence of exclamation Marks
    features['has_exclamation_mark'] = '!' in title
    
    # Presence of Emojis
    emojis = count_emojis(title)
    features['emoji_count'] = emojis
    features['has_many_emojis'] = emojis > 5
    
    # Relative Frequency of Capital Letters
    letters = sum(c.isalpha() for c in title)
    capital_letters = sum(c.isupper() for c in title)
    features['capital_letter_ratio'] = capital_letters / letters if letters > 0 else 0.0

    # Count of Punctuation Marks
    punctuation_count = sum(1 for c in title if c in string.punctuation)
    features['punctuation_count'] = punctuation_count

    for keyword in keywords:
        features['has_keywords'] = keyword in title
    for claim in exaggerated_claims:
        features['has_claim'] = claim in title
    for phrase in clickbait_phrases:
        features['has_phrase'] = phrase in title
    
    return features

In [8]:
# Get other features now title is lowercase
video_df['title_features'] = video_df['video_title'].apply(extract_features)

# Convert the dictionary of features into separate columns
video_df = pd.concat([video_df.drop(['title_features'], axis=1), video_df['title_features'].apply(pd.Series)], axis=1)

In [9]:
video_df.tail()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,...,has_large_numbers,has_question_mark,has_exclamation_mark,emoji_count,has_many_emojis,capital_letter_ratio,punctuation_count,has_keywords,has_claim,has_phrase
44787,UCwBhBDsqiQflTMLy2epbQVw,RO9rfa8-vwo,Life Engine Update (now with graphs! ğŸ“ˆ),Create your own life in the Life Engine: https...,,2021-02-19 22:00:16.000000,16563.0,588.0,0,25.0,...,False,False,True,1,False,0.103448,3,False,False,False
44788,UCwBhBDsqiQflTMLy2epbQVw,HpgXTphPCP0,Bugs are Features in Evolution [The Life Engine],Play the Life Engine: https://thelifeengine.ne...,,2021-02-05 21:55:05.000000,54992.0,1268.0,0,60.0,...,False,False,False,0,False,0.153846,2,False,False,False
44789,UCwBhBDsqiQflTMLy2epbQVw,uGkkm023BSs,Building a Zoo with Evolution [The Life Engine],Here I demonstrate a very different path of ev...,,2021-01-29 21:40:38.000000,83000.0,2303.0,0,74.0,...,False,False,False,0,False,0.157895,2,False,False,False
44790,UCwBhBDsqiQflTMLy2epbQVw,WJyHaPFwFSQ,Evolution of Eyes and Brains [The Life Engine],Play the Life Engine here: https://thelifeengi...,,2020-08-28 15:04:14.000000,103449.0,3886.0,0,141.0,...,False,False,False,0,False,0.162162,2,False,False,False
44791,UCwBhBDsqiQflTMLy2epbQVw,4XEklaH9k6k,Evolution Simulator [The Life Engine],"In this video I introduce the Life Engine, an ...","Evolution simulator, evolution, simulation, na...",2020-08-07 22:21:22.000000,194728.0,6481.0,0,299.0,...,False,False,False,0,False,0.16129,2,False,False,False


In [10]:
def calculate_clickbait_score(title_features):
    score = 0
    
    if title_features['has_keywords']:
        score += 2
    
    if title_features['has_claim']:
        score += 3

    if title_features['has_phrase']:
        score += 5
        
    if title_features['has_large_numbers']:
        score += 1
    
    if title_features['has_question_mark']:
        score += 1
    
    if title_features['has_exclamation_mark']:
        score += 1
    
    if title_features['has_many_emojis']:
        score += 1
    
    if title_features['capital_letter_ratio'] > 0.3:
        score += 1

    if title_features['punctuation_count']:
        score += 1

    return score

def classify_clickbait(title_features, threshold=4):
    score = calculate_clickbait_score(title_features)
    return score >= threshold

In [11]:
# Example usage
video_df['is_clickbait'] = video_df.apply(lambda row: classify_clickbait(row), axis=1)

In [18]:
# Calculate quartiles for view_count column
q1 = video_df['view_count'].quantile(0.25)
q3 = video_df['view_count'].quantile(0.75)

# Calculate interquartile range (IQR)
iqr = q3 - q1

# Define upper and lower bounds for outlier detection
lower_threshold = q1 - 1.5 * iqr
upper_threshold = q3 + 1.5 * iqr

# Filter the DataFrame to remove outliers
filtered_df = video_df[(video_df['view_count'] >= lower_threshold) & (video_df['view_count'] <= upper_threshold)]

# Create the box plot with filtered data
fig = px.box(filtered_df, x='is_clickbait', y='view_count', color='is_clickbait',
             title='Distribution of View Count for Non-Clickbait vs. Clickbait Videos (Outliers Removed)',
             labels={'is_clickbait': 'Clickbait', 'view_count': 'View Count'})
fig.update_layout(xaxis={'categoryorder': 'total ascending'})
fig.show()

In [21]:
# Calculate average views for clickbait and non-clickbait videos
avg_views = video_df.groupby('is_clickbait')['view_count'].mean().reset_index()

# Create bar plot
bar_fig = px.bar(avg_views, x='is_clickbait', y='view_count', color='is_clickbait',
                 labels={'view_count': 'Average View Count', 'is_clickbait': 'Clickbait'})

# Customize plot layout
bar_fig.update_layout(title='Average Views for Clickbait vs Non-Clickbait Videos',
                      xaxis_title='Clickbait', yaxis_title='Average View Count')

# Show or save the plot
bar_fig.show()

In [66]:
# Close the connection
conn.close()
engine.dispose()