# **Introduction**
This Jupyter notebook serves as a proof of concept for a view count predictor. The first sections focus on setup and data processing. At the end of the notebook, you can test the predictive model. 

The first step is to introduce several libraries that help with model training.

# **Import Libraries**

In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so the installed releases depend on when this cell is run.
!pip install plotly
!pip install wordcloud
!pip install pillow
!pip install  matplotlib
# Upgrade pip itself before the TensorFlow installs below.
!pip install --upgrade pip
!pip install tensorflow
!pip install tensorflow_addons

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
import tensorflow as tf
import tensorflow_addons as tfa
from wordcloud import WordCloud
from PIL import Image
import matplotlib.pyplot as plt
import re
import ipywidgets as widgets

In [None]:
# Rebuild the JupyterLab application assets.
# NOTE(review): presumably needed so the plotly / ipywidgets front-end extensions render — confirm for your JupyterLab version.
!jupyter lab build



# **Import YouTube trending video dataset**
##  "Collected or available datasets"
Retrieves records from the CSV file and loads them into a DataFrame.
Only the `categoryId`, `view_count`, and `tags` columns are read.

In [None]:
# Read only the three columns the model needs; rows the parser cannot handle are skipped.
csv_columns = ['categoryId', 'view_count', 'tags']
video_df = pd.read_csv(
    'US_youtube_trending_data_lite.csv',
    usecols=csv_columns,
    engine='python',
    on_bad_lines='skip',
    header=0,
)
# train_df starts out as a second name for the same frame (no copy is made here).
train_df = video_df

# **Featurizing, Parsing, Cleaning, and Wrangling Datasets**

In [None]:
# Remove empty and/or incomplete rows.
# Placeholder strings that stand in for "this video has no tags".
values = ['[None]', 'None']
train_df = train_df[~train_df['tags'].isin(values)]
# dropna() returns a new frame, so the result must be assigned; otherwise rows
# with missing values survive and the string processing of `tags` in the next
# cell fails on them.
train_df = train_df.dropna()
# Discard rows whose tag field is the empty-list literal, then renumber the rows
# so that label-based and positional indexing agree from here on.
train_df = train_df[train_df['tags'] != '[]'].reset_index(drop=True)
train_df

In [None]:
def _split_tag_string(raw_tags):
    """Turn one '|'-separated tag string into a list of normalised tags.

    The string is lower-cased, spaces are removed, every character that is
    neither alphanumeric nor '|' is stripped, and the rest is split on '|'.
    """
    compact = raw_tags.lower().replace(" ", "")
    cleaned = re.sub(r"[^a-zA-Z0-9|]", "", compact)
    return cleaned.split('|')


# Replace each tag string with its list-of-strings form.
train_df['tags'] = train_df['tags'].apply(_split_tag_string)
train_df

# **Building Variables for Model**

In [None]:
# Regression target (view counts) and raw features (per-video tag lists).
y = train_df['view_count']
X_raw = train_df['tags']

In [None]:
# Preview the per-video tag lists selected as raw features.
X_raw

In [None]:
# Count every occurrence of each tag across all videos, ignoring placeholders.
stopwords = ['[None]', '', ' ', '[none]']
word_counts = {}
for tag_list in X_raw:
    for tag in tag_list:
        if tag in stopwords:
            continue
        # dict.get supplies 0 the first time a tag is seen.
        word_counts[tag] = word_counts.get(tag, 0) + 1

In [None]:
# Same counts re-inserted from most to least frequent; sorted() is stable, so
# tags with equal counts keep the order in which they were first counted.
pairs_by_count = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)
word_counts_sorted = dict(pairs_by_count)

In [None]:
# The 500 most frequent tags form the model vocabulary (fewer if the data has fewer).
vocabulary = list(word_counts_sorted)[:500]
# Membership is tested once per tag per video; a set makes each test O(1)
# instead of scanning the 500-element list every time.
vocabulary_lookup = set(vocabulary)
for word_list in X_raw:
    # Slice assignment filters the existing list object in place, so every
    # reference to the same list (e.g. via train_df['tags']) sees the filtered tags.
    word_list[:] = [word for word in word_list if word in vocabulary_lookup]
X_raw

In [None]:
# Drop videos whose tag list is empty after the vocabulary filter.
# A positional boolean mask is used rather than passing enumerate() positions
# to drop() as index labels, so the right rows are removed even when the index
# is not a clean 0..n-1 range.
has_tags = np.fromiter((bool(words) for words in X_raw), dtype=bool, count=len(X_raw))

# Row positions being removed (kept under the original name for inspection).
null_indices = set(np.flatnonzero(~has_tags).tolist())

# Apply the same mask to features and target so they stay aligned, then renumber.
X_raw = X_raw[has_tags].reset_index(drop=True)
y = y[has_tags].reset_index(drop=True)

# Remove null rows and reset index

In [None]:
# Positions of rows that still have no tags (one pass, no progress bar needed).
null_indices = {i for i, words in enumerate(X_raw) if not words}

# Distinct tags in order of first appearance. dict.fromkeys de-duplicates in a
# single pass while preserving insertion order, replacing the quadratic
# "if word not in list" scan.
# NOTE: this is first-appearance order, which is NOT the sorted order in which
# MultiLabelBinarizer emits its output columns (see its `classes_` attribute).
word_column_names = list(dict.fromkeys(word for word_list in X_raw for word in word_list))

In [None]:
# One-hot encode the tag lists: one row per video, one 0/1 column per tag.
mlb = MultiLabelBinarizer()
tag_matrix = mlb.fit_transform(X_raw)

# The output columns follow mlb.classes_ (sorted), not the order in which tags
# first appear in the data, so the labels must come from the fitted binarizer;
# any other ordering attaches the wrong tag name to each column.
X = pd.DataFrame(tag_matrix, columns=mlb.classes_)

In [None]:
# Preview the binary tag-indicator matrix.
X

In [None]:
# Hold out 20% of the videos for testing and train on the remaining 80%.
# (The previous `train_size=0.2` did the opposite: it trained on 20% and
# evaluated on 80%.) random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)
X.shape

# **Create and Train Data Model**

In [None]:
# Size the input layer from the training data instead of hard-coding 500:
# the vocabulary holds *at most* 500 tags, so a smaller dataset would otherwise
# fail with a shape mismatch at fit time.
n_features = X_train.shape[1]

# Small fully connected regressor: two 64-unit ReLU layers and a linear output.
inputs = tf.keras.Input(shape=(n_features,))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='mean_squared_logarithmic_error'
)

batch_size = 20
epochs = 70

# 20% of the training split is set aside for validation; ReduceLROnPlateau is
# used with its default settings.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.ReduceLROnPlateau()]
)

In [None]:
# Plot training and validation loss per epoch.
# The history dict is passed in wide form (several y columns, implicit x), so
# Plotly Express names the axes 'index' and 'value' and the legend 'variable';
# label overrides must use those keys ('x' / 'y' are silently ignored).
hist_fig = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels={'index': "Epoch", 'value': "Loss", 'variable': "Metric"},
    title="Loss Over Time"
)

hist_fig.show()

# **Outcome Accuracy**

In [None]:
# Zero-based position of the smallest recorded validation loss.
np.argmin(history.history['val_loss'])

In [None]:
# Evaluate the trained model on the held-out test split.
model.evaluate(X_test, y_test)

In [None]:
# Model predictions for the test split, stored for later inspection.
y_preds = model.predict(X_test)
# Display the true view counts for comparison.
y_test

In [None]:
# Determine accuracy using the R^2 score (coefficient of determination).
# 1 is excellent (a perfect fit).
r2_score(y_test, model.predict(X_test))

In [None]:
# Project the binary tag matrix onto its first two principal components.
pca = PCA(n_components=2)
X_reduced = pd.DataFrame(pca.fit_transform(X), columns=["PC1", "PC2"])

# For every video, collect the names of the tags it carries. One vectorised
# comparison over the whole matrix replaces a scalar .loc lookup per
# (row, column) pair and yields the same lists in the same column order.
tag_names = np.asarray(X.columns, dtype=object)
tag_present = X.to_numpy() == 1
word_lists = [tag_names[row_mask].tolist() for row_mask in tag_present]

X_reduced['keywords'] = word_lists
# Stored as text so the list can be shown in plot hover labels.
X_reduced['keywords'] = X_reduced['keywords'].astype(str)

# **Non-Descriptive / Analytics**
Tags are organized by the highest average view count of the videos associated with each tag.

In [None]:
# convert dictionary to dataframe
word_count_df = pd.DataFrame(list(word_counts_sorted.items()), columns = ['tag','frequency'])
word_count_df = word_count_df.head(500)
word_count_df['tags'] = word_count_df.tag
word_count_df = word_count_df.merge(train_df.explode('tags'),how='left')
word_count_df['view_avg'] = word_count_df.groupby('tag')['view_count'].transform('mean')
del word_count_df['tags']
word_count_df.sort_values('view_avg')

In [None]:
# Correlate only the numeric columns. word_count_df also holds the string
# column 'tag', and recent pandas versions raise on non-numeric columns in
# corr() instead of silently skipping them.
numeric_columns = word_count_df.select_dtypes(include='number')
corrMap = numeric_columns.corr()

# Heatmap of the correlation matrix with each coefficient printed in its cell.
category_fig  = go.Figure()
category_fig.add_trace(
    go.Heatmap(
        x = corrMap.columns,
        y = corrMap.index,
        z = np.array(corrMap),
        text=corrMap.values,
        texttemplate='%{text:.2f}'
    )
)
category_fig.show()

In [None]:
# Hover labels show only the tag list, not the raw component values.
hover_columns = {'PC1': False, 'PC2': False, 'keywords': True}

# Scatter of the two principal components with an OLS trend line.
scatter_fig = px.scatter(X_reduced, x='PC1', y='PC2', trendline="ols", hover_data=hover_columns)
scatter_fig.show("notebook")

# **Descriptive Methods / Data Visualization**

In [None]:
# word_count_df has one row per (tag, video) and view_avg is constant within a
# tag, so plotting it directly stacks many identical markers on top of each
# other. Reduce to one row per tag first; the picture is the same but far lighter.
tag_view_avg = word_count_df[['tag', 'view_avg']].drop_duplicates()
word_count_fig  = px.scatter(tag_view_avg, x="view_avg",y="tag")
word_count_fig.show()

In [None]:
# Word cloud in which each tag is sized by its frequency.
wc = WordCloud().generate_from_frequencies(word_counts_sorted)

# Draw the cloud in the left slot of a 1x2 grid on a 100x20 inch figure.
cloud_fig = plt.figure(figsize=(100, 20))
cloud_ax = cloud_fig.add_subplot(1, 2, 1)
cloud_ax.imshow(wc)

# **Interactive Widget**

In [None]:
# importing the module
import json

# Read the category definitions from the JSON file.
with open('US_category_id.json') as json_file:
    categories = json.load(json_file)["items"]

# Map numeric category id -> human-readable title.
cat_dict = {int(entry["id"]): entry["snippet"]["title"] for entry in categories}

# Attach the category title to every (tag, video) row.
word_count_df['category_name'] = word_count_df['categoryId'].map(cat_dict)
word_count_df

In [None]:
categors = list(cat_dict.values())


def _tags_for_categories(selected_categories):
    """Return the distinct tags, in first-seen order, of videos in the given categories.

    selected_categories: iterable of category titles (e.g. ``cat_drop.value``,
    which is a tuple for a SelectMultiple widget).
    Returns an empty list when nothing is selected.
    """
    in_selection = word_count_df['category_name'].isin(list(selected_categories))
    return word_count_df.loc[in_selection, 'tag'].drop_duplicates().tolist()


def _without(options, excluded):
    """Return a new list holding ``options`` minus every entry in ``excluded``."""
    excluded = set(excluded)
    return [option for option in options if option not in excluded]


# category dropdown
cat_drop = widgets.SelectMultiple(
    options=categors,
    description='Video Categories',
    disabled=False
)

# tag 1: tags that occur in the selected categories
tag1_filter = _tags_for_categories(cat_drop.value)
tag1 = widgets.SelectMultiple(
    options=tag1_filter,
    description='Tag 1',
    disabled=False
)

# tag 2: the same tags minus whatever is already picked in tag 1
# (built as a new list so tag1_filter is not modified through an alias)
tag2_filter = _without(tag1_filter, tag1.value)
tag2 = widgets.SelectMultiple(
    options=tag2_filter,
    description='Tag 2',
    disabled=False
)


def _refresh_tag2(change=None):
    """Offer in tag 2 every tag-1 option that is not currently selected in tag 1."""
    tag2.options = _without(tag1.options, tag1.value)


def _refresh_tag1(change=None):
    """Rebuild the tag-1 options for the current category selection."""
    tag1.options = _tags_for_categories(cat_drop.value)
    # Replacing the options does not necessarily fire a 'value' change on tag1,
    # so tag 2 is refreshed explicitly.
    _refresh_tag2()


# Keep the three selectors in sync as the user makes selections.
cat_drop.observe(_refresh_tag1, names='value')
tag1.observe(_refresh_tag2, names='value')

# Last expression of the cell: renders the selectors stacked vertically.
widgets.VBox([cat_drop, tag1, tag2])