## Jonathan Mathai 110320715 
## Richu Jacob 110257792
## Omar Syed 110484590

#                                                <center>Youtube Trending Data</center>


### Problem: Predicting whether a video will be trending or not 
### Libraries used: pandas, numpy, tensorflow, urllib, json, tabulate, re





In [None]:
import tensorflow as tf
from tensorflow import keras
from tabulate import tabulate
import pandas as pd
import numpy as np
import urllib
import json
import matplotlib.pyplot as plt

## <center> Data gathering </center>
### <ol> <li>Scraped the internet for lists of trending and nontrending videos using Selenium WebDriver.</li><li>For each video, extracted the video title and video id and added them to a dataframe.</li><li>Used the YouTube API to add views, likes, dislikes, comments, category id, and tags length features to each video.</li><li>Compiled around 6500 trending videos and 15900 nontrending videos.</li> </ol>

In [None]:
def getOneVideoStats(video_id, api_key):
    """Fetch the feature row for one video from the YouTube Data API v3.

    Args:
        video_id: YouTube video id to look up.
        api_key: API key sent with the request.

    Returns:
        A single-row nested list
        ``[[viewCount, likeCount, dislikeCount, commentCount, categoryId,
        tagLength]]``.  The first five entries are the values exactly as they
        appear in the JSON response and ``tagLength`` is the number of tags.
        Returns ``None`` when the response contains no item or any of those
        fields is missing.

    Raises:
        urllib.error.URLError: if the HTTP request itself fails.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``request`` and ``parse`` submodules are loaded.
    import urllib.parse
    import urllib.request

    # Only the parts that are read below are requested.  The query string is
    # URL-encoded because the video id comes straight from user input.
    query = urllib.parse.urlencode(
        {'id': video_id, 'key': api_key, 'part': 'statistics,snippet'},
        safe=',',
    )
    searchUrl = "https://www.googleapis.com/youtube/v3/videos?" + query
    with urllib.request.urlopen(searchUrl) as response:
        data = json.loads(response.read())

    try:
        item = data['items'][0]
        statistics = item['statistics']
        snippet = item['snippet']
        # No ratios are derived here, so a video with zero views or zero
        # votes still yields a row instead of a division error.
        return [[
            statistics['viewCount'],
            statistics['likeCount'],
            statistics['dislikeCount'],
            statistics['commentCount'],
            snippet['categoryId'],
            len(snippet['tags']),
        ]]
    except (KeyError, IndexError):
        # Unknown id (empty ``items``) or a hidden/missing statistic.
        return None
    
def standardize(a):
    """Return a standardized float32 copy of ``a``.

    Each column (axis 0) is shifted to zero mean and divided by its standard
    deviation.  The input is never modified.

    Args:
        a: Array-like of numeric values.

    Returns:
        A new ``float32`` ndarray with the same shape as ``a``.  Columns whose
        values are all identical come back as zeros.
    """
    # np.array always copies, so the in-place operations below cannot touch
    # the caller's data.
    d = np.array(a, dtype='float32')
    d -= np.mean(d, axis=0)
    spread = np.std(d, axis=0)
    # A constant column has zero spread; divide it by 1 so it stays at 0
    # instead of turning into NaN.
    spread = np.where(spread == 0, np.float32(1), spread)
    d /= spread
    return d

def getApiKey(filename):
    """Read an API key from a text file.

    Args:
        filename: Path of the file holding the key.

    Returns:
        The file's contents with trailing whitespace (including the final
        newline) removed.
    """
    # The context manager closes the file even if the read fails.
    with open(filename, 'r') as api_key_file:
        return api_key_file.read().rstrip()

## <center> Preparing data </center> 
### <ol><li>Created a new dataframe out of 6000 trending and 6000 nontrending data</li><li>Standardized the dataframe </li> </ol>

## <center> Preprocessing Neural Net </center>
## Classifier Architecture
![dff](deep_feed_forward.jpg)
### <ol> <li>Created input, hidden, and output layers sequentially using keras</li><li>Dropped any duplicates from dataframe </li><li>Passed the converted dataframe into deep feed forward model</li><li>We used a deep feed forward neural network. Composed of 1 input layer and 2 hidden layers and 1 output.</li></ol> 

## <center> Optimizing Neural Net </center>


# Repeated training: five rounds of 10 epochs each on the module-level `model`.
# NOTE(review): getNonTrendingSample, getFullTrainingSet, standardizeData,
# nonTrendingTrainSet and subsetTrending are not defined in the visible part
# of this file, and `model` is only assigned further down -- confirm where
# this snippet is meant to run.
all_data=[]
for i in range(5):
    # presumably draws a new nontrending sample each round -- verify against the helper
    nonTrendingDataSet = getNonTrendingSample(nonTrendingTrainSet)
    # setTuple[0] is used as the feature data and setTuple[1] as the labels below
    setTuple = getFullTrainingSet(subsetTrending, nonTrendingDataSet)
    fullSubset = setTuple[0]
    # all_data is overwritten every round, so only the last round's data remains
    all_data = setTuple[0]
    trendingLabels = setTuple[1]
    scaleMatrixData = standardizeData(fullSubset)
    model.fit(scaleMatrixData, trendingLabels, epochs=10)



### <ol> <li>Our input layer and hidden layers each contain 512 nodes, and our output layer has 2 nodes because the two possible results are trending and nontrending.</li> </ol>

# Deep feed-forward classifier: 6 input features feed three 512-unit ReLU
# layers (50% dropout after the second and third), followed by a 2-unit
# softmax output.
model = keras.Sequential([
    keras.layers.Dense(512, input_dim=6, activation=tf.nn.relu),
    keras.layers.Dense(512, activation=tf.nn.relu),
    keras.layers.Dropout(0.50),
    keras.layers.Dense(512, activation=tf.nn.relu),
    keras.layers.Dropout(0.50),
    keras.layers.Dense(2, activation=tf.nn.softmax),
])
    

## <ol><li>Testing accuracy was 50% prior to standardizing the data.</li>
## <li>Used standard-scaled input data to reduce overfitting of the data.</li>
## <li>Used a dropout layer after each hidden layer to reduce overfitting.</li> </ol>

In [None]:
def main():
    """Train and evaluate the trending classifier, then predict videos interactively.

    Reads ``trending_dataset.csv`` and ``nontrending_dataset.csv``, keeps the
    six feature columns, and standardizes up to 6000 rows of each class
    together.  The module-level ``model`` is fitted on the first 2900 rows of
    each class and evaluated on the next 2900, and the test accuracy is
    printed.  Afterwards the user is prompted for video ids until ``quit`` is
    entered; for each id the fetched statistics and the predicted class are
    printed.  Label 1 means trending, label 0 means nontrending.
    """
    featureColumns = ['views', 'likes', 'dislikes', 'comments', 'category_id', 'tags_length']
    rowsPerClass = 6000
    splitSize = 2900

    trendingRows = pd.read_csv('trending_dataset.csv')[featureColumns].values[:rowsPerClass]
    nontrendingRows = pd.read_csv('nontrending_dataset.csv')[featureColumns].values[:rowsPerClass]

    # combine trending and nontrending data, then standardize them together
    all_data = np.concatenate((trendingRows, nontrendingRows), axis=0)
    standardized_data = standardize(all_data)
    print(all_data.shape)

    # Split the classes apart again at the exact boundary so that no row is
    # skipped and no row lands in the wrong class.
    trendingCount = len(trendingRows)
    trending_stats = standardized_data[:trendingCount]
    nontrending_stats = standardized_data[trendingCount:]

    # First `splitSize` rows of each class train the net; the following
    # `splitSize` rows (contiguous, nothing skipped) test it.
    trainTrending = trending_stats[:splitSize]
    trainNontrending = nontrending_stats[:splitSize]
    testTrending = trending_stats[splitSize:2 * splitSize]
    testNontrending = nontrending_stats[splitSize:2 * splitSize]

    train_data = np.concatenate((trainTrending, trainNontrending), axis=0)
    test_data = np.concatenate((testTrending, testNontrending), axis=0)
    # Label counts follow the actual slice sizes so they always line up with
    # the data, even if a CSV holds fewer rows than expected.
    train_labels = np.concatenate(
        (np.ones(len(trainTrending)), np.zeros(len(trainNontrending))), axis=0)
    test_labels = np.concatenate(
        (np.ones(len(testTrending)), np.zeros(len(testNontrending))), axis=0)

    # NOTE(review): tf.train.AdamOptimizer is the TensorFlow 1.x spelling --
    # confirm the TensorFlow version this notebook targets.
    model.compile(optimizer=tf.train.AdamOptimizer(),
                    loss ='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

    model.fit(train_data, train_labels, epochs=100)

    test_loss, test_acc = model.evaluate(test_data, test_labels)

    print("\n\nTest Set Accuracy:" + str(test_acc))

    # extra code so you can test individual video ids and see if trending or nontrending
    apiKey = getApiKey('api_key.txt')
    statHeaders = ['ViewCount', 'LikeCount', 'DislikeCount', 'CommentCount', 'CategoryId', 'Tags']
    while True:
        print("Predict a video by entering a video id (or type 'quit' to exit): ")
        videoId = input().strip()
        if videoId == 'quit':
            # Leave before any lookup is attempted for the sentinel itself.
            break

        stats = getOneVideoStats(videoId, apiKey)
        # getOneVideoStats returns None when the video or a statistic is missing
        if stats is None:
            print("Video missing views, likes, dislikes, or commentCount...")
            continue

        # Convert each value with float() directly; a fixed-width byte-string
        # dtype would silently truncate long numbers.
        video = np.array([[float(value) for value in row] for row in stats])
        # print table of video data
        print("\n" + tabulate(video, headers=statHeaders))

        # NOTE(review): standardizeTuple is not defined in the visible part of
        # this file -- confirm it exists before running.
        video = standardizeTuple(video, all_data)
        # if video data was empty dont do a prediction
        if len(video) == 0:
            print("Video missing views, likes, dislikes, or commentCount...")
            continue

        prediction = model.predict(video)
        # print table of prediction probabilities
        print("\n" + tabulate(prediction, headers=['P(Non-Trending)', 'P(Trending)']))
        # print prediction for video
        if np.argmax(prediction) == 1:
            print("\nPredicted Trending Video\n")
        else:
            print("\nPredicted Non-Trending Video\n")

# Start training and the interactive prompt only when this file is executed
# directly (or run as a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

## <center> End product </center>
### <ol> <li>The user enters a video id to get a prediction of whether the video is trending.</li></ol>

