## ***1- Importing Libraries***

In [1]:
import numpy as np 
import pandas as pd 

## ***2- Data Preprocessing & Feature Engineering***

In [2]:
# Read the YouTube dataset CSV into a DataFrame; column dtypes are inferred by pandas.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
youtube_data_path = '/Users/godzilla/Desktop/Selected Topics-2/Project/youtube_data.csv'
youtube_data = pd.read_csv(youtube_data_path, low_memory=False)

### ***2.1- Channel Data Preprocessing***

In [3]:
# Old column name -> new Title_Case column name.
# Columns that are not listed here (e.g. 'thumbnails') keep their current name.
new_column_names = {
    # identification and text fields
    'video_id': 'Video_ID',
    'channelTitle': 'Channel_Name',
    'title': 'Title',
    'description': 'Description',
    'tags': 'Tags',
    # publication metadata
    'publishedAt': 'Published_At',
    'categoryId': 'Category_ID',
    'defaultAudioLanguage': 'Audio_Language',
    # counters
    'viewCount': 'View_Count',
    'likeCount': 'Like_Count',
    'dislikeCount': 'Dislike_Count',
    'commentCount': 'Comment_Count',
    # technical attributes
    'duration': 'Duration',
    'definition': 'Definition',
    'caption': 'Caption',
}

# Apply the mapping and rebind the name to the renamed frame.
youtube_data = youtube_data.rename(columns=new_column_names)

In [4]:
# Summarise the frame after the rename: column names, non-null counts and dtypes.
youtube_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740 entries, 0 to 2739
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Video_ID        2740 non-null   object
 1   Channel_Name    2740 non-null   object
 2   Title           2740 non-null   object
 3   Description     2599 non-null   object
 4   Tags            2740 non-null   object
 5   Published_At    2740 non-null   object
 6   Category_ID     2740 non-null   int64 
 7   Audio_Language  2740 non-null   object
 8   thumbnails      2740 non-null   object
 9   View_Count      2740 non-null   int64 
 10  Like_Count      2740 non-null   int64 
 11  Dislike_Count   2740 non-null   int64 
 12  Comment_Count   2740 non-null   int64 
 13  Duration        2740 non-null   object
 14  Definition      2740 non-null   object
 15  Caption         2740 non-null   bool  
dtypes: bool(1), int64(5), object(10)
memory usage: 323.9+ KB


In [5]:
# Coerce the count and ID columns to numeric dtypes; values that cannot be
# parsed become NaN (errors='coerce').
# The conversion runs once per column (the default axis) rather than once per
# row, so every column keeps its own dtype and the work stays vectorized.
numeric_columns = ['View_Count', 'Like_Count', 'Comment_Count', 'Category_ID', 'Dislike_Count']
youtube_data[numeric_columns] = youtube_data[numeric_columns].apply(pd.to_numeric, errors='coerce')


In [6]:
# Convert the duration values to a float number of seconds.
# The guard makes this cell safe to re-run: once the column is numeric it is
# left alone, because pd.to_timedelta would read plain numbers as nanoseconds.
if not pd.api.types.is_numeric_dtype(youtube_data['Duration']):
    # Vectorized parse of the whole column instead of one Python call per row.
    youtube_data['Duration'] = pd.to_timedelta(youtube_data['Duration']).dt.total_seconds()


In [7]:
# Collapse language/locale codes into plain language names.
# Values that are not listed here are kept unchanged.
language_code_names = {
    'en': 'English',
    'en-US': 'English',
    'ar': 'Arabic',
    'en-GB': 'English',
    'en-IN': 'English',
    'hi': 'Hindi'
}
youtube_data['Audio_Language'] = youtube_data['Audio_Language'].replace(language_code_names)

# Audio language to assume for a channel when a video's own value is missing.
channel_audio_mapping = {
    "Ken Jee": "English",
    "Ranesh Guruparan": "English",
    "Youssef Hosni": "Arabic",
    "Sundas Khalid": "English",
    "edrea": "English",
    "Mohamed Al Assaal - اتعلم مع العسال": "Arabic",
    "Deena Gergis": "Arabic",
    "Lore So What": "English",
    "Thu Vu data analytics": "English",
    "Data With Mo": "English",
    "techTFQ": "English",
    "Learn with Lukas": "English",
    "Alex The Analyst": "English",
    "codebasics": "English",
    "Mustafa Othman": "Arabic",
    "Justin Shin": "English",
}

# Fill only the missing Audio_Language values with the channel's fallback.
# Channels absent from the mapping map to NaN, so their gaps stay missing.
channel_language = youtube_data["Channel_Name"].map(channel_audio_mapping)
youtube_data["Audio_Language"] = youtube_data["Audio_Language"].fillna(channel_language)

In [8]:
# Fill missing values: absent counts become 0, absent text fields get a placeholder.
# One DataFrame-level fillna replaces the `youtube_data[col].fillna(..., inplace=True)`
# calls, which act on a temporary Series and do not update the frame under
# pandas Copy-on-Write (they raise a chained-assignment warning in pandas 2.x).
missing_value_defaults = {
    "Comment_Count": 0,
    "Like_Count": 0,
    "Tags": "Not Specified",
    "Description": "Not Specified",
}
youtube_data = youtube_data.fillna(value=missing_value_defaults)


### ***2.2- Feature Engineering***

In [9]:
# Numeric category ID -> category label, listed in ascending ID order.
category_id_to_name = {
    1: "Film & Animation",
    17: "Sports",
    19: "Travel & Events",
    20: "Gaming",
    22: "People & Blogs",
    23: "Comedy",
    24: "Entertainment",
    25: "News & Politics",
    26: "Howto & Style",
    27: "Education",
    28: "Science & Technology",
}

# Derive Category_Name from Category_ID.
# replace() leaves any ID that is missing from the mapping as it is.
category_ids = youtube_data["Category_ID"]
youtube_data["Category_Name"] = category_ids.replace(category_id_to_name)

In [10]:
# Number of tags: pieces of the Tags string when split on ','.
# Rows holding the 'Not Specified' placeholder count as 0 tags.
tag_piece_counts = youtube_data['Tags'].str.split(",").str.len()
youtube_data['Tags_Count'] = tag_piece_counts.where(youtube_data['Tags'] != 'Not Specified', 0)

# Title length measured in words: pieces of the title when split on single spaces.
youtube_data['Title_Length'] = youtube_data['Title'].str.split(" ").str.len()

In [11]:
# Engagement rate: likes plus comments as a percentage of views, to 2 decimals.
interactions = youtube_data['Like_Count'] + youtube_data['Comment_Count']
youtube_data['Engagement_rate'] = (interactions / youtube_data['View_Count'] * 100).round(2)


In [12]:
# Bucket a video by its duration
def categorize_videos(row):
    """Return the length bucket for one video.

    `row` is anything indexable with "Duration" (a DataFrame row or a dict).
    The notebook converts Duration to seconds before this is applied.

    Returns "Short" for durations up to 60, "Regular Content" for durations
    above 60 and up to 3600, and "Podcast/Tutorials" for everything else.
    A NaN duration fails both comparisons and so lands in the last bucket.
    """
    duration = row["Duration"]
    if duration <= 60:
        return "Short"
    if duration <= 3600:
        return "Regular Content"
    return "Podcast/Tutorials"

# Label every row with its length bucket and store the labels in Video_Category.
video_categories = youtube_data.apply(categorize_videos, axis=1)
youtube_data["Video_Category"] = video_categories

In [13]:

# Reference "now" as a timezone-aware UTC timestamp.
# NOTE: this depends on the run date, so Days_Since_Published (and every score
# derived from it) changes each time the notebook is executed.
current_date = pd.Timestamp.now(tz='UTC')

# Parse 'Published_At' as timezone-aware UTC datetimes. utc=True guarantees an
# aware datetime column, so the subtraction below cannot fail on a
# naive/aware mismatch or on mixed UTC offsets.
youtube_data['Published_At'] = pd.to_datetime(youtube_data['Published_At'], utc=True)

# Whole days elapsed between publication and the current date.
youtube_data['Days_Since_Published'] = (current_date - youtube_data['Published_At']).dt.days




In [14]:
# Popularity score = 2 * views + likes - dislikes + (comments / days since published),
# rounded to 2 decimal places. Only the comment count is divided by the video's age.
comments_per_day = youtube_data['Comment_Count'] / youtube_data['Days_Since_Published']
youtube_data['Popularity_Score'] = (
    youtube_data['View_Count'] * 2
    + youtube_data['Like_Count']
    - youtube_data['Dislike_Count']
    + comments_per_day
).round(2)


In [15]:
# Shearability score (presumably "shareability"; the column name is kept as is) =
# 2 * views + likes - dislikes + (comments / days since published) * engagement rate,
# rounded to 2 decimal places. Only the comments-per-day term is scaled by the
# engagement rate.
engagement_weighted_comments = (
    youtube_data['Comment_Count'] / youtube_data['Days_Since_Published']
) * youtube_data['Engagement_rate']
youtube_data['Shearability_Score'] = (
    youtube_data['View_Count'] * 2
    + youtube_data['Like_Count']
    - youtube_data['Dislike_Count']
    + engagement_weighted_comments
).round(2)


In [16]:

# Preview the first 200 rows, including the engineered feature columns.
youtube_data.head(200)

Unnamed: 0,Video_ID,Channel_Name,Title,Description,Tags,Published_At,Category_ID,Audio_Language,thumbnails,View_Count,...,Definition,Caption,Category_Name,Tags_Count,Title_Length,Engagement_rate,Video_Category,Days_Since_Published,Popularity_Score,Shearability_Score
0,l14K2EnD548,Sundas Khalid,AI Will Replace Tech Jobs: From ex-FAANG Softw...,50 job interview questions & answers 👉🏼 https:...,"['data science', 'self-taugh data scientist', ...",2023-08-07 14:10:00+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/l1...,3736,...,hd,True,Science & Technology,28,9,4.26,Regular Content,1,7631.00,7719.02
1,7ssLi7Ll0I0,Sundas Khalid,How Much Money I Made as Data Engineer? (3 yea...,Resume & Cover Letter template (free) 👉🏼 https...,"['data science', 'data scientist', 'self-taugh...",2023-07-28 14:10:00+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/7s...,16280,...,hd,True,Science & Technology,27,14,1.97,Regular Content,11,32853.73,32856.37
2,xr68cbOxvBs,Sundas Khalid,How to learn Python FAST with ChatGPT and Bard?,Try Quadratic for FREE 👉🏼 https://QuadraticHQ....,"['data science', 'data scientist', 'self-taugh...",2023-07-10 14:10:00+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/xr...,325057,...,hd,True,Science & Technology,28,9,1.18,Regular Content,29,653845.41,653846.03
3,mLP4kdk3DoI,Sundas Khalid,Will AI Replace Data Scientists?,Excel graphs template (free) 👉🏼 https://clickh...,"['data science', 'data scientist', 'self-taugh...",2023-06-29 14:12:00+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/mL...,17758,...,hd,False,Science & Technology,27,5,3.42,Regular Content,40,36054.75,36058.98
4,znouY2A61WI,Sundas Khalid,How to code Python FAST for Data Analysis: Bar...,Click to read full AI Trend Report (FREE) 👉🏼 h...,"['data science', 'data scientist', 'self-taugh...",2023-06-16 14:10:01+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/zn...,22949,...,hd,True,Science & Technology,27,11,2.22,Regular Content,53,46369.74,46370.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,YrubB5aXStY,Luke Barousse,Data Science with NO coding 😳👨🏼‍💻,Full Video Here 👉🏼 youtu.be/VrdnBxx8BBI\n\nCou...,"['data viz by luke', 'business intelligence', ...",2023-01-18 16:00:14+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/Yr...,28704,...,hd,False,Science & Technology,18,6,5.55,Short,202,58968.16,58968.88
196,LDds33bJy6g,Luke Barousse,Degrees vs. Experience in Data Science 📜 🆚 📊,Full Video Here 👉🏼 youtu.be/VrdnBxx8BBI\n\nRob...,"['data viz by luke', 'business intelligence', ...",2023-01-16 16:00:18+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/LD...,29202,...,hd,False,Science & Technology,18,9,6.59,Short,204,60297.16,60298.03
197,cPbWjaLPHkc,Luke Barousse,Data Analyst's first course 🧑‍💻,📜 Google Data Analytics Certificate 👉🏼 lukeb....,"['data viz by luke', 'business intelligence', ...",2023-01-13 16:00:20+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/cP...,13693,...,hd,False,Science & Technology,18,5,11.06,Short,207,28875.12,28876.34
198,0qcsqdeDbc0,Luke Barousse,Data Analysts are lowest paid in Data Science 💸🥵,Full Video Here 👉🏼 https://youtu.be/NAuuqdzC_r...,"['data viz by luke', 'business intelligence', ...",2023-01-11 16:00:00+00:00,28,English,{'default': {'url': 'https://i.ytimg.com/vi/0q...,57276,...,hd,False,Science & Technology,18,9,5.67,Short,209,117741.29,117742.65


## ***3- Comments Data Preprocessing***

In [21]:
# Load the comments dataset into a DataFrame.
# NOTE(review): the recorded run of this cell failed with
# "ParserError: Error tokenizing data. C error: Buffer overflow caught", so
# comments_data is undefined for the cells below until this load succeeds.
# Possibly the file needs different parser options (e.g. engine='python') — TODO confirm.
comments_data = pd.read_csv('/Users/godzilla/Desktop/Selected Topics-2/Project/youtube_comments_data.csv', low_memory=False)

ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [None]:
def separate_comments(comments_data):
    """Split the per-video comment strings into one row per comment.

    Parameters
    ----------
    comments_data : pandas.DataFrame
        Must have a 'video_id' column and a 'comments' column whose values are
        comma-separated strings.

    Returns
    -------
    pandas.DataFrame
        Columns 'vidid' and 'comment': one row per comma-separated piece, with
        surrounding whitespace stripped. Rows whose 'comments' value is not a
        string (e.g. a missing value) contribute no rows.

    Notes
    -----
    The split is a plain split on ',', so a comment that itself contains a
    comma is broken into several rows.
    """
    all_comments = [
        {'vidid': vidid, 'comment': comment.strip()}
        for vidid, comments in zip(comments_data['video_id'], comments_data['comments'])
        if isinstance(comments, str)  # skip missing values instead of raising AttributeError
        for comment in comments.split(',')
    ]

    # Explicit columns keep the output schema stable even when no comment was found.
    return pd.DataFrame(all_comments, columns=['vidid', 'comment'])

# Build the one-row-per-comment frame from comments_data.
separated_comments = separate_comments(comments_data)

# Preview the first rows with the notebook's rich display instead of printing
# the whole frame as plain text.
separated_comments.head()
    

In [None]:
# Remove every literal '[' and ']' character from the comments strings.
# The pattern is a raw string so the regex escapes \[ and \] are not read as
# (invalid) Python string escapes, which emit a SyntaxWarning on recent Python.
comments_data["comments"] = comments_data["comments"].str.replace(r'[\[\]]', '', regex=True)


In [None]:
# Display comments_data to inspect the result of the bracket clean-up.
comments_data