In [1]:
#Try model test

#Read files for trending and non trending data set
#import tensorflow to build neural network
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import re #import regex
from sklearn.preprocessing import StandardScaler
#For YouTube API Request
import urllib
import json
from tabulate import tabulate
#import regularizers to reduce overfitting

def getApiKey(filename):
    api_key_file = open(filename, 'r')
    return api_key_file.read().rstrip()

def standardizeTuple(t, d):
    t -= np.mean(d, axis=0)
    t /= np.std(d, axis=0)
    return t

def standardize(a):
    d = np.array(a)
    d -= np.mean(d, axis=0)
    d /= np.std(d, axis=0)
    return np.array(d)

def getOneVideoStats(video_id, api_key):
    searchUrl="https://www.googleapis.com/youtube/v3/videos?id="+video_id+"&key="+api_key+"&part=statistics,snippet,content_details"
    response = urllib.request.urlopen(searchUrl).read()
    data = json.loads(response.decode('utf-8'))
    try:
        viewCount = data['items'][0]['statistics']['viewCount']
        likeCount = data['items'][0]['statistics']['likeCount']
        dislikeCount = data['items'][0]['statistics']['dislikeCount']
        commentCount = data['items'][0]['statistics']['commentCount']
        tagCount = len(data['items'][0]['snippet']['tags'])
        categoryId = data['items'][0]['snippet']['categoryId']
        return [[viewCount,likeCount,dislikeCount,commentCount, tagCount, categoryId]]
    except (KeyError, IndexError):
        return

def getFullTrainingSet(trendDf, nontrendDf):
    fullSubset = pd.concat([trendDf, nontrendDf])
    #Drop duplicates between datasets
    print(len(fullSubset))
    fullSubset = fullSubset.drop_duplicates(subset='video_id', keep='first')
    print(len(fullSubset))
    trendingLabels = np.array(fullSubset['trending'].iloc[:])
    fullSubset = fullSubset.drop(columns=['tags','trending','duration','video_id','video_title'], axis=1)
    return (fullSubset, trendingLabels)

def getNonTrendingSample(nontrendDf):
    return nontrendDf.sample(n=6000)
    
def standardizeData(fullSubset):
    #Standardize the data
    matrixData = np.array(fullSubset)
    matrixData = matrixData.astype('float32')
    matrixData -= np.mean(matrixData, axis=0)
    matrixData /= np.std(matrixData, axis=0)

    # Add input scaling
    scaler = StandardScaler()
    scaleMatrixData = scaler.fit_transform(matrixData)
    return scaleMatrixData
    
trendingSet = pd.read_csv('../data/new-datasets/trending_dataset.csv')
nontrendingSet = pd.read_csv('../data/new-datasets/nontrending_dataset.csv')
trendingDf = pd.DataFrame(trendingSet)
nontrendingDf = pd.DataFrame(nontrendingSet)

subsetTrending = trendingDf.head(6000)
subsetNontrending = nontrendingDf.head(len(subsetTrending))    

#Test dataset
trendingTestSet = trendingDf.iloc[-400:].drop(columns=['tags','trending','duration','video_id','video_title'], axis=1)
nontrendingTestSet = nontrendingDf.iloc[-400:].drop(columns=['tags','trending','duration','video_id','video_title'], axis=1)
totaldf = pd.concat([trendingTestSet, nontrendingTestSet])
#convert to numpy array
smallTestSet = np.array(totaldf)

#test set labels
trendingTestLabels = trendingDf.iloc[-400:]['trending']
nontrendingTestLabels = nontrendingDf.iloc[-400:]['trending']
totalTestlabels = np.concatenate([trendingTestLabels,nontrendingTestLabels])


# Implement early stopping
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')

# Build the feed forward Neural Network
model = keras.Sequential()
model.add(keras.layers.Dense(300, kernel_regularizer=keras.regularizers.l2(0.01), activation=tf.nn.relu))
model.add(keras.layers.Dense(300, kernel_regularizer=keras.regularizers.l2(0.01)
                             , activity_regularizer=keras.regularizers.l1(0.01), activation=tf.nn.relu))
model.add(keras.layers.Dropout(0.50))
model.add(keras.layers.Dense(300, kernel_regularizer=keras.regularizers.l2(0.01)
                             , activity_regularizer=keras.regularizers.l1(0.01), activation=tf.nn.relu))
model.add(keras.layers.Dropout(0.50))
#Use softmax for activation function for output layer
model.add(keras.layers.Dense(2, activation=tf.nn.softmax))

#Compile model
model.compile(optimizer=tf.train.AdamOptimizer(), loss='sparse_categorical_crossentropy', metrics=['accuracy'], callbacks=[early_stop]) #compile model

#Train/fit model 
nonTrendingTrainSet = nontrendingDf.head(14000)
all_data=[]
for i in range(5):
    nonTrendingDataSet = getNonTrendingSample(nonTrendingTrainSet)
    setTuple = getFullTrainingSet(subsetTrending, nonTrendingDataSet)
    fullSubset = setTuple[0]
    all_data = setTuple[0]
    trendingLabels = setTuple[1]
    scaleMatrixData = standardizeData(fullSubset)
    model.fit(scaleMatrixData, trendingLabels, epochs=10)

#model test accuracy
_, testacc = model.evaluate(smallTestSet, totalTestlabels)
print("test accuracy is " + str(testacc))

#model predict 
predictions = model.predict(smallTestSet)
print(predictions)

#Drop trending column for prediction
# print(all_data)

#print current shape 
all_data = all_data.loc[:, ~all_data.columns.str.contains('^Unnamed')]
# print(all_data)
apiKey = getApiKey('apikey.txt')
videoId = ''
while (videoId != 'quit'):
    print("Predict a video by entering a video id (or type 'quit' to exit): ")
    videoId = input()
    video = np.array(getOneVideoStats(videoId, apiKey), dtype='|S10').astype(float)
    # print table of video data
    print("\n" + tabulate(video, headers=['ViewCount', 'LikeCount', 'DislikeCount', 'CommentCount', 'Tags']))
    all_data= np.array(all_data)
    video = standardizeTuple(video, all_data)
    video = video[0].T
    print(fullSubset.shape)
    print(video.shape)
    # if video data was empty dont do a prediction
    if (len(video) != 0):
        prediction = model.predict(video)
        # print table of prediction probabilities
        print("\n" + tabulate(prediction, headers=['P(Non-Trending)', 'P(Trending)']))
        # print prediction for video
        if (np.argmax(prediction) == 1):
            print("\nPredicted Trending Video\n")
        else:
            print("\nPredicted Non-Trending Video\n")
    else:
        print("Video missing views, likes, dislikes, or commentCount...")

12000
11892
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
12000
11881
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
12000
11869
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
12000
11875
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
12000
11890
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test accuracy is 0.8025
[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [0. 1.]]
Predict a video by entering a video id (or type 'quit' to exit): 
ccHhj_E245s


HTTPError: HTTP Error 403: Forbidden