In [5]:
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

In [1]:
import pymongo as mongo
import json
import shlex

In [2]:
class youtube:
    def __init__(self, db):
        self.yt=self.start_api(api_key=mongo_srv)
        self.db=db
    def start_api(self, api_key=yt_key):
        api_service_name = "youtube"
        api_version = "v3" 
        yt = googleapiclient.discovery.build(api_service_name, api_version, developerKey=api_key)
        return yt
    def channel_info(self, ids):
        infos = []
        info = self.yt.channels().list(part=["brandingSettings", "id", "statistics", "topicDetails", "snippet"], fields="items", id=ids)
        info = info.execute()
        infos += info["items"]
        response = []
        for json in infos:
            response.append(self.clean_channel_json(json))
        return response
    def clean_channel_json(self, json):
        new_json={}
        new_json["id"]=json["id"]
        new_json["collab"]=False
        new_json["title"]=json["brandingSettings"]["channel"]["title"]
        new_json["description"]=json["brandingSettings"]["channel"].get("description")
        new_json["keywords"]=json["brandingSettings"]["channel"].get("keywords")
        new_json["country"]=json["brandingSettings"]["channel"].get("country")
        new_json["viewCount"]=int(json["statistics"].get("viewCount", "0"))
        new_json["subscriberCount"]=int(json["statistics"].get("subscriberCount","0"))
        new_json["videoCount"]=int(json["statistics"].get("videoCount","0"))
        new_json["image"]=json["snippet"]["thumbnails"]["high"]["url"]
        new_json["topics"]=[]
        if json.get("topicDetails") != None:
            json_topics=json["topicDetails"].get("topicCategories",[])
            for i in range(len(json_topics)):
                new_json["topics"].append(json_topics[i].replace("https://en.wikipedia.org/wiki/",""))
        return new_json
    def channel_to_database(self, json):
        for channel in json:
            self.db.channels.update_one({"title":channel["title"]}, {"$set": channel}, upsert=True)
        return len(json)

In [3]:
client = mongo.MongoClient(mongo_srv)
db = client.matchmaking

In [9]:
db.channels.update_many({}, {"$set":{"collab":False}})

<pymongo.results.UpdateResult at 0x2510795df08>

In [6]:
yt = youtube(db)

## Input information into the database

In [None]:
yt.channel_to_database(yt.channel_info("Fighting Games", results=500))

## Updating information

In [None]:
#this is an example were wu updated the channel's profile image

for channel in yt.db.channels.find({}):

    info = yt.yt.channels().list(part="snippet",id=channel["id"])
    info = info.execute()
    if info.get("items")==None:
        continue
    info = info["items"][0]
    yt.db.channels.update_one({"id":info["id"]},{"$set": {"image": info["snippet"]["thumbnails"]["high"]["url"]}})

## Setting up keywords that will be used.

In [7]:
#Create our dictionary of keywords

keywords={}
for channel in yt.db.channels.find({}):
    if channel["keywords"]!= None:
        for keyword in shlex.split(channel["keywords"].replace("'", "")):
            keyword=keyword.lower()
            keywords[keyword]=keywords.get(keyword, 0)+1
    if channel["topics"]!=None:
        for keyword in channel["topics"]:
            keyword=keyword.replace("_"," ")
            keyword=keyword.lower()
            keywords[keyword]=keywords.get(keyword, 0)+1

In [8]:
for channel in yt.db.channels.find({}):
    if channel["description"]!= None:
        for keyword in shlex.split(channel["description"].replace("'", "").replace("/n", "").replace('"', "")):
            keyword=keyword.lower()
            if keyword in keywords.keys():
                keywords[keyword]+=1
    if channel["title"]!= None:
        for keyword in shlex.split(channel["title"].replace("'", "").replace("/n", "").replace('"', "")):
            keyword=keyword.lower()
            if keyword in keywords.keys():
                keywords[keyword]+=1

In [9]:
distribution={}
for keyword in keywords:
    distribution[keywords[keyword]]=distribution.get(keywords[keyword],0)+1

In [10]:
top_keywords=[]
for keyword in keywords.keys():
    if keywords[keyword]>=4:   #Numbers chosen so that words appear in reasonable amount of channels not too few
        top_keywords.append(keyword)
len(top_keywords)

1115

In [11]:
with open("top_keywords.json", "w") as json_file:
    json.dump(top_keywords, json_file)

In [69]:
keywords={}
for channel in yt.db.channels.find({}):
    channel_top_keywords=set()
    if channel["keywords"]!= None:
        for keyword in shlex.split(channel["keywords"].replace("'", "")):
            keyword=keyword.lower()
            if keyword in top_keywords:
                channel_top_keywords.add(keyword)
    if channel["description"]!= None:
        for keyword in shlex.split(channel["description"].replace("'", "").replace("/n", "").replace('"', "")):
            keyword=keyword.lower()
            if keyword in top_keywords:
                channel_top_keywords.add(keyword)
    if channel["topics"]!=None:
        for keyword in channel["topics"]:
            keyword=keyword.replace("_"," ")
            keyword=keyword.lower()
            if keyword in top_keywords:
                channel_top_keywords.add(keyword)
    if channel["title"]!= None:
        for keyword in shlex.split(channel["title"].replace("'", "").replace("/n", "").replace('"', "")):
            keyword=keyword.lower()
            if keyword in top_keywords:
                channel_top_keywords.add(keyword)
    channel_top_keywords=list(channel_top_keywords)
    yt.db.channels.update_one({"title":channel["title"]},{"$set":{"top_keywords":channel_top_keywords}})

# Have vector representing each channel.

In [11]:
import gensim
import numpy as np
for channel in db.channels.find({}):
    word_vectors = gensim.models.KeyedVectors.load('word_vectors.kv')
    keywords=channel["top_keywords"]
    vector=np.array([0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])
    for keyword in keywords:
        try:
            vector+=word_vectors.get_vector(keyword)
        except:
            pass
    vector=vector/len(keywords)
    vector=list(vector)
    db.channels.update_one({"title":channel["title"]},{"$set":{"vector":vector}})

  vector=vector/len(keywords)


## Getting approximate timezone data based on country

In [3]:
with open("raw_timezones.json", "r") as json_file:
    raw_timezones=json.load(json_file)

In [13]:
timezones={}
for timezone in raw_timezones:
    timezones[timezone["countryCode"]]=int(timezone["locales"][0]["name"][4:7])

In [14]:
with open("timezones.json", "w") as json_file:
    json.dump(timezones, json_file)

In [20]:
for channel in db.channels.find({}):
    db.channels.update_one({"title":channel["title"]},{"$set":{"timezone":timezones.get(channel.get("country"),None)}})