In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the labelled video dataset. index_col=False tells pandas not to use the
# first CSV column as the row index, so that column is loaded as regular data.
labelled_video_path = '../training_dataset/labelled_video_dataset.csv'
labelled_video_df = pd.read_csv(labelled_video_path, index_col=False)

In [4]:
# Remove the leftover 'Unnamed: 0' column (presumably a row index written by an
# earlier export -- confirm against the producer of the CSV).
# errors='ignore' keeps this cell safe to re-run: without it, a second run
# raises KeyError because the column has already been removed.
labelled_video_df = labelled_video_df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first five rows to sanity-check the load.
labelled_video_df.head(n=5)

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id,video_type,video_topic,technologies
0,UC4xKdmAXFh4ACyhpiQ_3qBw,a3GQBdLoUOo,Why Crypto isn't coming back. I'm getting out ...,"Ex-Google TechLead gets out of crypto. ""Tech o...",,2022-09-23 21:26:40.000000,152212.0,4411.0,0,1330.0,756,hd,False,27,News,Business Acumen,
1,UCsvqVGtbbyHaMoevxPAq9Fg,C3ilG2-tIn0,HBase Tutorial For Beginners | HBase In Hadoop...,🔥Post Graduate Program In Data Engineering: ht...,"hbase tutorial for beginners, hbase, hbase tut...",2015-09-11 07:03:59.000000,24552.0,112.0,0,8.0,1184,hd,False,27,Tutorial,Big Data,"HBase, Hadoop"
2,UCW5YeuERMmlnqo4oq8vwUpg,UT9zKFYr18U,PHP Tutorial (& MySQL) #40 - File System (part 2),Extra reading - https://www.w3schools.com/php/...,"php, tutorial, php tutorial, php tutorial for ...",2019-03-12 09:11:51.000000,26930.0,565.0,0,17.0,504,hd,False,27,Tutorial,Software Engineering,"PHP, MySQL"
3,UCsvqVGtbbyHaMoevxPAq9Fg,eaTC5aReb3w,🔥 Top 10 Highest Paying Cybersecurity Jobs For...,🔥Post Graduate Program In Cyber Security: http...,"simplilearn, highest paying cybersecurity jobs...",2023-09-07 14:30:22.000000,2423.0,70.0,0,3.0,850,hd,False,27,Career Advice,Software Engineering,
4,UCsvqVGtbbyHaMoevxPAq9Fg,UTOebGpWfno,Reasons Java Is Still Great #Simplilearn,🔥 IITM Pravartak Professional Certificate Prog...,"simplilearn, java, java programming, java codi...",2023-06-08 09:30:11.000000,658.0,27.0,0,3.0,60,hd,False,27,News,Software Engineering,Java


### Manually reassign video type values

In [6]:
# List every distinct raw video_type label (NaN included) before remapping.
labelled_video_df.loc[:, 'video_type'].unique()

array(['News', 'Tutorial', 'Career Advice', 'Project',
       'Interview Techniques', 'Tips', nan, 'Challenge',
       'Podcast/Interview', 'Unknown', 'Discussion', 'Interview',
       'Career Development Paths', 'None', 'Resume Building', 'Review',
       'Product Demo', 'Job Search Strategies', 'Testimonial',
       'Comparison', 'Data Science', 'Other', 'Lecture', 'Advertisement',
       'Question and Answer', "Beginner's Guide/101", 'Inspiration'],
      dtype=object)

In [7]:
# Hand-written mapping that collapses the raw video_type labels into eight
# canonical types: News, Tutorial, Career Advice, Project, Tips, Challenge,
# Podcast/Interview and Other.
mapping_dict = {
    'News': 'News',
    'Tutorial': 'Tutorial',
    'Career Advice': 'Career Advice',
    'Project': 'Project',
    'Interview Techniques': 'Career Advice',
    'Tips': 'Tips',
    np.nan: 'Other',
    'Challenge': 'Challenge',
    'Podcast/Interview': 'Podcast/Interview',
    'Unknown': 'Other',
    'Discussion': 'Podcast/Interview',
    'Interview': 'Podcast/Interview',
    'Career Development Paths': 'Career Advice',
    'None': 'Other',
    'Resume Building': 'Career Advice',
    'Review': 'Tips',
    'Product Demo': 'Tutorial',
    'Job Search Strategies': 'Career Advice',
    'Testimonial': 'Career Advice',
    'Comparison': 'Tips',
    'Data Science': 'Career Advice',
    'Other': 'Other',
    'Lecture': 'Tutorial',
    'Advertisement': 'Tutorial',
    'Question and Answer': 'News',
    "Beginner's Guide/101": 'Tutorial',
    'Inspiration': 'Podcast/Interview'
}

# Series.map() returns NaN for any label that is not a key of the dictionary,
# so send those (and missing values) to 'Other' instead of letting NaN leak
# into the cleaned dataset. Every canonical type maps to itself, so re-running
# this cell leaves the column unchanged.
labelled_video_df['video_type'] = (
    labelled_video_df['video_type']
    .map(mapping_dict)
    .fillna('Other')
)

In [8]:
# Check which video_type labels remain after the remapping.
labelled_video_df.loc[:, 'video_type'].unique()

array(['News', 'Tutorial', 'Career Advice', 'Project', 'Tips', 'Other',
       'Challenge', 'Podcast/Interview'], dtype=object)

In [9]:
# List every distinct raw video_topic label before the clean-up.
labelled_video_df.loc[:, 'video_topic'].unique()

array(['Business Acumen', 'Big Data', 'Software Engineering',
       'Machine Learning / AI', 'Statistics and Probability',
       'Data Structures', 'Career Development Paths', 'Data Wrangling',
       'Ethics and Privacy', nan, 'Cloud Computing', 'Data Visualization',
       'JavaScript', 'Data Science', 'Freelance',
       'Data Structures & Algorithms', 'Balancing Work and Life',
       'Mathematics', 'Resume Building', 'Marketing',
       'Security and Privacy', 'Data Structures and Algorithms',
       'Data Mining', 'Data Analysis', 'Job Search Strategies',
       'Data Security', 'Interview Techniques', 'Project Management',
       'Design', 'Data Analytics', 'Security', 'Business Intelligence',
       'Education', 'Career Advice', 'Data Science Basics', 'Geography',
       'Data Engineering', 'Careers', 'Data Management', 'Finance',
       'Other', 'Computer Science', 'Computer Vision',
       'Natural Language Processing', 'Research', 'Algorithm',
       'Game Design', 'Algori

### Assign topics outside the target category list to 'Other'

In [10]:
# Topic labels that are kept as-is; anything else is counted below.
target_values = [
    "Statistics and Probability", "Machine Learning / AI",
    "Data Wrangling", "Data Visualization",
    "Data Mining", "Software Engineering",
    "Ethics and Privacy", "Cloud Computing",
    "Resume Building", "Job Search Strategies",
    "Interview Techniques", "Career Development Paths",
    "Balancing Work and Life", "Business Acumen",
]

# Boolean mask of rows whose topic is one of the target values. Missing topics
# are False here, so they are counted as outside the list.
topic_in_target = labelled_video_df['video_topic'].isin(target_values)
count_not_in_target = len(labelled_video_df.loc[~topic_in_target])

print(f'Values assigned to non-existent categories: {count_not_in_target}')

Values assigned to non-existent categories: 2108


In [11]:
# Overwrite every video_topic that is not one of target_values with "Other".
outside_target = ~labelled_video_df['video_topic'].isin(target_values)
labelled_video_df.loc[outside_target, 'video_topic'] = "Other"

In [12]:
# Check which video_topic labels remain after the clean-up.
labelled_video_df.loc[:, 'video_topic'].unique()

array(['Business Acumen', 'Other', 'Software Engineering',
       'Machine Learning / AI', 'Statistics and Probability',
       'Career Development Paths', 'Data Wrangling', 'Ethics and Privacy',
       'Cloud Computing', 'Data Visualization', 'Balancing Work and Life',
       'Resume Building', 'Data Mining', 'Job Search Strategies',
       'Interview Techniques'], dtype=object)

In [13]:
# Inspect the distinct values of the technologies column.
labelled_video_df.loc[:, 'technologies'].unique()

array([nan, 'HBase, Hadoop', 'PHP, MySQL', ..., 'TakeShape',
       'Mac OS X Terminal', 'Bokeh, Python'], dtype=object)

In [14]:
# Preview the first five rows after the label clean-up.
labelled_video_df.head(n=5)

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id,video_type,video_topic,technologies
0,UC4xKdmAXFh4ACyhpiQ_3qBw,a3GQBdLoUOo,Why Crypto isn't coming back. I'm getting out ...,"Ex-Google TechLead gets out of crypto. ""Tech o...",,2022-09-23 21:26:40.000000,152212.0,4411.0,0,1330.0,756,hd,False,27,News,Business Acumen,
1,UCsvqVGtbbyHaMoevxPAq9Fg,C3ilG2-tIn0,HBase Tutorial For Beginners | HBase In Hadoop...,🔥Post Graduate Program In Data Engineering: ht...,"hbase tutorial for beginners, hbase, hbase tut...",2015-09-11 07:03:59.000000,24552.0,112.0,0,8.0,1184,hd,False,27,Tutorial,Other,"HBase, Hadoop"
2,UCW5YeuERMmlnqo4oq8vwUpg,UT9zKFYr18U,PHP Tutorial (& MySQL) #40 - File System (part 2),Extra reading - https://www.w3schools.com/php/...,"php, tutorial, php tutorial, php tutorial for ...",2019-03-12 09:11:51.000000,26930.0,565.0,0,17.0,504,hd,False,27,Tutorial,Software Engineering,"PHP, MySQL"
3,UCsvqVGtbbyHaMoevxPAq9Fg,eaTC5aReb3w,🔥 Top 10 Highest Paying Cybersecurity Jobs For...,🔥Post Graduate Program In Cyber Security: http...,"simplilearn, highest paying cybersecurity jobs...",2023-09-07 14:30:22.000000,2423.0,70.0,0,3.0,850,hd,False,27,Career Advice,Software Engineering,
4,UCsvqVGtbbyHaMoevxPAq9Fg,UTOebGpWfno,Reasons Java Is Still Great #Simplilearn,🔥 IITM Pravartak Professional Certificate Prog...,"simplilearn, java, java programming, java codi...",2023-06-08 09:30:11.000000,658.0,27.0,0,3.0,60,hd,False,27,News,Software Engineering,Java


In [15]:
# Write the cleaned frame to disk; index=False leaves the row index out of the
# file.
cleaned_dataset_path = '../training_dataset/cleaned_labelled_video_dataset.csv'
labelled_video_df.to_csv(cleaned_dataset_path, index=False)