# Code to Categorize Youtube Videos

Initial videod data only categroized roughly 20% of videos. This notebook uses those categorized videos to categorize the additional 80%. 

First, transcripts are pulling in from YouTube and it is added to data already avaliable such as tags, short descriptions, etc. Then text is cleaned - first it is tokenized, then stopwords were removed using nltk, and then words were stemmed. This was done after the initial models predictions scores were too low.

After the data was cleaned we used GridSearchCV along with CountVectorizer, tfidfTranformer, and different ML models to determine which model worked the best with which paramters. For here, the final model was produced and video feature space was created.

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import date
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [6]:
from urllib.parse import urlparse, parse_qs

def get_youtube_video_id(url):
    # Examples:
    # - http://youtu.be/SA2iWivDJiE
    # - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    # - http://www.youtube.com/embed/SA2iWivDJiE
    # - http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US
    query = urlparse(url)
    if query.hostname == 'youtu.be': return query.path[1:]
    if query.hostname in {'www.youtube.com', 'youtube.com'}:
        if query.path == '/watch': return parse_qs(query.query)['v'][0]
        if query.path[:7] == '/embed/': return query.path.split('/')[2]
        if query.path[:3] == '/v/': return query.path.split('/')[2]
    # fail?
    return None

In [7]:
from youtube_transcript_api import YouTubeTranscriptApi

def get_youtube_video_transcript(video_id):
    
    transcript_text = ""

    try:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
    except:
        return ""
    
    for dic in transcript_list:
        print(dic.get('text'))
        transcript_text = transcript_text + dic.get('text')
    
    return transcript_text

In [8]:
from nltk.corpus import stopwords

def remove_stopwords(word_list):
    
    #remove stopwords
    stop_words = set(stopwords.words('english'))
    word_list = [w for w in word_list if not w in stop_words]
    
    new_list = []
    
    for w in word_list:
        if w not in stop_words:
            new_list.append(w)
    
    #remove additional words
    word_list = [i for i in word_list if i != "www"]
    word_list = [i for i in word_list if i != "http"]
    word_list = [i for i in word_list if i != "com"]
    word_list = [i for i in word_list if i != "https"]
    word_list = [i for i in word_list if i != "goo"]
    word_list = [i for i in word_list if i != "gl"]
    word_list = [i for i in word_list if i != "The"]
    word_list = [i for i in word_list if i != "nan"]
    word_list = [i for i in word_list if i != "youtube"]
    word_list = [i for i in word_list if i != "ly"]
    word_list = [i for i in word_list if i != "â"]
    
    return word_list

In [9]:
def list_to_string(list):
    
    str = " "
    
    for element in list:
        str = str + " " + element
            
    return str
#end

In [10]:
import nltk
import spacy

# pip install spacy nltk
# python -m spacy download en

def stem_words(word_list):
    
    new_list = []
    
    if(word_list is None):
        return new_list
    
    en_nlp = spacy.load('en_core_web_sm')
    stemmer = nltk.stem.PorterStemmer()
    #doc_spacy = en_nlp(word_list)   
    
    for word in word_list:
        #print("word before stemming is ": )
        
        stemmed = stemmer.stem(word)
        new_list.append(stemmed)
    
    
    return new_list
    

In [11]:
patient_df = pd.read_csv("patient_info.csv", error_bad_lines = False, engine = "python")
video_df = pd.read_csv(r"video_watched.csv", error_bad_lines=False, engine = "python")

## Clean Video Data & Feature Engineering for Video Categorization

This section of code cleans video data to ues the bag of words method to categroize each video. This data is primarily missing in the original data but should help in recommending movies

Code directly below was used to pull transcripts from youtube videos but was then stored in excel file for easier access

In [7]:
#video_data = video_df[[ 'video_id', 'url', 'primary_category', 'secondary_category', 'notes', 'description', 'tags', 'length']]
#video_data = video_data.drop_duplicates()

In [8]:
#video_data['youtube_id'] = video_data.apply(lambda x : get_youtube_video_id(x['url']), axis = 1)

In [21]:
#video_data['transcript'] = video_data.apply(lambda x : get_youtube_video_transcript(x['youtube_id']), axis = 1)

In [22]:
#video_data.to_excel("video_data_with_transcripts.xlsx")

Begin cleaning data to train video classifier

In [16]:
video_cls_data = pd.read_excel(r"video_data_with_transcripts.xlsx")

Code below is to do a sanity check on the methods used to process the data

In [1]:
#test = video_cls_data[:5]
#test

In [2]:
#test['tok'] = test.apply(lambda x : tokenizer.tokenize(x['all text']), axis = 1)
#test

In [116]:
#test['nonstop'] = test.apply(lambda x : remove_stopwords(x['tok']), axis = 1)
#test

In [117]:
#test['stem'] = test.apply(lambda x : stem_words(x['nonstop']), axis = 1)
#test

Code below is to tokenize sentences, remove stopwords, and stemming - this is was store in excel file and commented out

In [17]:
### SLOW SO DON'T RUN UNLESS ERROR ###
# read from excel instead

# tokenize text
tokenizer = nltk.RegexpTokenizer(r"\w+")
video_cls_data['words'] = video_cls_data.apply(lambda x : tokenizer.tokenize(x['all text']), axis = 1)
# remove stop words
video_cls_data['words'] = video_cls_data.apply(lambda x : remove_stopwords(x['words']), axis = 1)
# stem words
video_cls_data['words'] = video_cls_data.apply(lambda x : stem_words(x['words']), axis = 1)
# remove original data
del video_cls_data['all text']

In [23]:
video_cls_data

Unnamed: 0.1,Unnamed: 0,video_id,primary_category,words
0,0,624,,"['get', 'first', '30', 'day', 'curios', 'strea..."
1,1,620,Anxiety,"['sleep', 'cloudlessmind', 'whi', 'worri', 'mu..."
2,2,787,,"['To', 'support', 'channel', 'level', 'health'..."
3,3,771,Cognitive Behavioral Therapy,"['depress', 'anxieti', 'stress', 'manag', 'pai..."
4,4,2587,,"['welcom', 'neuroflow', 'tool', 'allow', 'work..."
...,...,...,...,...
1018,1018,6464,,"['subscrib', 'ðŸ', 'offici', 'bbc', 'youtub', ..."
1019,1019,6463,,"['understand', 'cope', 'skill', 'hello', 'welc..."
1020,1020,6449,,"['neuroflow', 'amc', 'partnership', 'video', '..."
1021,1021,6452,,"['subscrib', 'ðŸ', 'offici', 'bbc', 'youtub', ..."


In [122]:
# don't update this file unless data is changed for the good
#video_cls_data.to_excel("processed_video_data_for_bag_of_words.xlsx")
#this was done else where and an 'as string' column was added

Begin to process to train prediction model

In [2]:
video_cls_data = pd.read_excel("processed_video_data_for_bag_of_words.xlsx")

# remove rows with no category because they cant help train supervised model
train_rows = video_cls_data.dropna()
# split data from category
X_words = train_rows['as string']
y = train_rows['primary_category']

In [3]:
X_words_train, X_words_test, y_train, y_test = train_test_split(X_words, y)

In [4]:
test = CountVectorizer(list(X_words_train))

In [5]:
X_train = test.fit_transform(list(X_words_train))

In [6]:
X_test = test.transform(list(X_words_test))

In [7]:
X_words

1       sleep whi We worri So much cloudlessmind whi ...
3       depress anxieti stress manag pain substanc us...
5       stress 4 7 8 breath exercis gozen gozen thi g...
12      stop neg self talk now guid exercis thi guid ...
14      how Be more confid you alway get want tri lea...
                             ...                        
572     amaz effect gratitud thank you watch braincra...
640     7 pregnanc warn sign are awar sign pregnanc w...
642     substanc use disord visit us khanacademi org ...
861     spotlight peer specialist who are they A cert...
862     substanc use disord present treatment underst...
Name: as string, Length: 203, dtype: object

## Testing Models and Parameters

Used GridSearchCV to test different models and parameters in each model. Overall, Linear SVC was the best with a C value of roughly 0.03, not using term frequency-inverse document frequency, and n-gram of 1 and only 1.

Linear SVC

In [8]:
model = Pipeline(steps = [('vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()), ('lin_svc', LinearSVC(class_weight="balanced"))])

parameters = {'vectorizer__ngram_range': [(1,1), (1,2), (2,2)], 'tfidf__use_idf': (True, False), 'lin_svc__C': np.logspace(-4,4,20)}

gs_clf_svm = GridSearchCV(model, parameters, n_jobs = -1)
gs_clf_svm = gs_clf_svm.fit(list(X_words), np.array(y))

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)


0.7681707317073171
{'lin_svc__C': 0.03359818286283781, 'tfidf__use_idf': False, 'vectorizer__ngram_range': (1, 2)}


SVM with RBF kernal

In [72]:
model = Pipeline(steps = [('vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()), ('svc', SVC(kernel = 'rbf', class_weight="balanced"))])

parameters = {'vectorizer__ngram_range': [(1,1), (1,2), (2,2)], 'tfidf__use_idf': (True, False), 'svc__C': np.logspace(-4,4,20), 'svc__gamma': np.logspace(-4,4,20)}

gs_clf_svm = GridSearchCV(model, parameters, n_jobs = -1)
gs_clf_svm = gs_clf_svm.fit(X_words, np.array(y))

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)


0.7240243902439024
{'svc__C': 545.5594781168514, 'svc__gamma': 0.0018329807108324356, 'tfidf__use_idf': False, 'vectorizer__ngram_range': (1, 1)}


Logistic Regression

In [75]:
model = Pipeline(steps = [('vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()), ('log_reg', LogisticRegression(class_weight="balanced"))])

parameters = {'vectorizer__ngram_range': [(1,1), (1,2), (2,2)], 'tfidf__use_idf': (True, False), 'log_reg__C': np.logspace(-4,4,4), 'log_reg__solver': ('newton-cg', 'sag')}

gs_clf_svm = GridSearchCV(model, parameters, n_jobs = -1)
gs_clf_svm = gs_clf_svm.fit(X_words, np.array(y))

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)


0.7482926829268293
{'log_reg__C': 21.54434690031882, 'log_reg__solver': 'newton-cg', 'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 1)}


Naive Bayes

In [76]:
model = Pipeline(steps = [('vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()), ('multiNB',  MultinomialNB())])

parameters = {'vectorizer__ngram_range': [(1,1), (1,2), (2,2)], 'tfidf__use_idf': (True, False), 'multiNB__alpha': np.logspace(-4,4,40)}

gs_clf_svm = GridSearchCV(model, parameters, n_jobs = -1)
gs_clf_svm = gs_clf_svm.fit(X_words, np.array(y))

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)


0.6456097560975609
{'multiNB__alpha': 0.0006614740641230146, 'tfidf__use_idf': False, 'vectorizer__ngram_range': (1, 2)}


# Final Video Classifier

In [11]:
# take vidoes that have categories and train model based on them
model = Pipeline(steps = [('vectorizer', CountVectorizer(ngram_range = (1,2))),
                          ('tfidf', TfidfTransformer(use_idf = False)),
                          ('lin_svc', LinearSVC(class_weight="balanced", C = 0.03359818286283781))])

model.fit(X_words_train, np.array(y_train))


Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('lin_svc',
                 LinearSVC(C=0.03359818286283781, class_weight='balanced'))])

In [14]:
y_test_pred = model.predict(X_words_test)

In [19]:
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_test_pred)))
print('Precision: {:.2f}'.format(precision_score(y_test, y_test_pred, average = 'macro')))
print('Recall: {:.2f}'.format(recall_score(y_test, y_test_pred, average = 'macro')))
print('F1: {:.2f}'.format(f1_score(y_test, y_test_pred, average = 'macro')))


Accuracy: 0.76
Precision: 0.78
Recall: 0.83
F1: 0.78


In [33]:
# then get all the videos (categorized and not) and predict them
X_all = video_cls_data['words']
y = video_cls_data['primary_category']

prediction = model.predict(x_all)

In [34]:
video_feature_space_ = video_cls_data_[['video_id', 'length']]
video_feature_space_['category'] = prediction

In [35]:
video_feature_space_

Unnamed: 0,video_id,length,category
0,624,466,Depression
1,620,137,Anxiety
2,787,285,Stress
3,771,305,Cognitive Behavioral Therapy
4,2587,95,Depression
...,...,...,...
1018,6464,144,Stress
1019,6463,248,Depression
1020,6449,182,Depression
1021,6452,144,Stress


In [36]:
video_feature_space_.to_excel("video_id_to_cat.xlsx")

In [142]:
video_feature_space = pd.get_dummies(video_feature_space_)
video_feature_space = video_feature_space.set_index('video_id')


In [149]:
mean = video_feature_space['length'].mean()

In [150]:
video_feature_space['length'] = video_feature_space['length'].apply(lambda x : x / mean)

In [152]:
video_feature_space.to_csv("video_features.csv")