In [None]:
# popular_videos
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns


def get_liked_videos(max_results=200):
    """Collect videos rated "like" through the module-level ``youtube`` client.

    Pages of up to 50 items are requested with ``myRating="like"`` and
    accumulated until at least ``max_results`` items have been gathered or
    the response carries no ``nextPageToken``.

    Args:
        max_results: Upper bound on the number of video resources returned.

    Returns:
        A list of at most ``max_results`` video resource dicts, in the order
        the API returned them.
    """
    collected = []
    page_token = None

    while True:
        # Checked before every request, so a non-positive limit issues no call.
        if len(collected) >= max_results:
            break

        response = youtube.videos().list(
            part="snippet,contentDetails,statistics,topicDetails",
            myRating="like",
            maxResults=50,
            pageToken=page_token
        ).execute()

        collected += response.get("items", [])

        # A missing (or empty) token means this was the last page.
        page_token = response.get("nextPageToken")
        if not page_token:
            break

    # The final page may overshoot the limit; trim the surplus.
    return collected[:max_results]

def get_random_videos_details(video_ids):
    """Look up full video resources for the given IDs.

    The IDs are sent to the module-level ``youtube`` client in batches of 50,
    joined with commas, and the returned items are concatenated in request
    order.

    Args:
        video_ids: Sequence of video ID strings.

    Returns:
        A list with the ``items`` of every batch response.
    """
    detailed = []
    for start in range(0, len(video_ids), 50):
        batch = video_ids[start:start + 50]
        response = youtube.videos().list(
            part="snippet,contentDetails,statistics,topicDetails",
            id=','.join(batch)
        ).execute()
        detailed += response.get("items", [])
    return detailed

def get_random_videos(max_results=200, regions=('US', 'KR')):
    """Sample videos from the "mostPopular" chart of several regions.

    Each region is paged through with ``pageToken`` and contributes at most
    ``ceil(max_results / len(regions))`` distinct video IDs, so every listed
    region is represented. IDs already collected (from an earlier page or
    another region) are skipped. Paging for a region stops as soon as the
    response has no ``nextPageToken``, so the function may return fewer than
    ``max_results`` videos when the charts do not hold enough distinct ones.

    Args:
        max_results: Upper bound on the number of videos returned.
        regions: Iterable of region codes passed as ``regionCode``.

    Returns:
        The list produced by ``get_random_videos_details`` for the collected
        IDs, or an empty list when ``max_results`` is not positive or no
        region is given.
    """
    regions = list(regions)
    if max_results <= 0 or not regions:
        return []

    # Ceiling division: spread the budget evenly over the regions.
    per_region = -(-max_results // len(regions))

    video_ids = []
    seen_ids = set()

    for region in regions:
        region_count = 0
        next_page_token = None

        while region_count < per_region and len(video_ids) < max_results:
            request = youtube.videos().list(
                part="snippet",
                chart="mostPopular",
                regionCode=region,
                maxResults=50,
                pageToken=next_page_token
            )
            response = request.execute()

            for item in response.get("items", []):
                video_id = item['id']
                # The same video can trend in more than one region.
                if video_id in seen_ids:
                    continue
                seen_ids.add(video_id)
                video_ids.append(video_id)
                region_count += 1
                if region_count >= per_region or len(video_ids) >= max_results:
                    break

            # Without a next page the chart is exhausted; requesting again
            # would only return the same items (or loop forever on an empty
            # response).
            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break

    return get_random_videos_details(video_ids[:max_results])

# Fetch up to 200 videos returned for myRating="like"
liked_videos = get_liked_videos(200)

# Fetch up to 200 comparison videos from the "mostPopular" charts of the
# US and Korea (the notebook calls these the "random" group)
random_videos = get_random_videos(200, ['US', 'KR'])

# Create DataFrame to store video information
def create_dataframe(video_items, category):
    """Flatten video resources into one row per video.

    Args:
        video_items: Iterable of video resource dicts; the ``snippet``,
            ``contentDetails``, ``statistics`` and ``topicDetails`` parts are
            read and any of them may be absent.
        category: Value stored in the ``like`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``like``, ``title``,
        ``description``, ``tags``, ``category_id``, ``duration``,
        ``view_count``, ``like_count``, ``comment_count``,
        ``topic_categories`` and ``language``. Missing text-like fields are
        filled with ``"N/A"`` and missing counters with ``0``.
    """
    def _to_row(item):
        snippet = item.get("snippet", {})
        details = item.get("contentDetails", {})
        stats = item.get("statistics", {})
        topics = item.get("topicDetails", {})

        # Prefer the audio language; fall back to the metadata language.
        fallback_language = snippet.get("defaultLanguage", "N/A")

        return {
            "like": category,
            "title": snippet.get("title", "N/A"),
            "description": snippet.get("description", "N/A"),
            "tags": snippet.get("tags", "N/A"),
            "category_id": snippet.get("categoryId", "N/A"),
            "duration": details.get("duration", "N/A"),
            # Counters are converted with int(); absent ones default to 0.
            "view_count": int(stats.get("viewCount", 0)),
            "like_count": int(stats.get("likeCount", 0)),
            "comment_count": int(stats.get("commentCount", 0)),
            "topic_categories": topics.get("topicCategories", "N/A"),
            "language": snippet.get("defaultAudioLanguage", fallback_language)
        }

    return pd.DataFrame([_to_row(item) for item in video_items])

# Build one frame per group; the group label lands in the "like" column
liked_videos_df = create_dataframe(liked_videos, "Liked")
random_videos_df = create_dataframe(random_videos, "Random")

# Stack both groups into a single frame with a fresh 0..n-1 index
combined_df = pd.concat(
    [liked_videos_df, random_videos_df],
    ignore_index=True,
)

# Persist the combined data without the index column
combined_df.to_csv("combined_videos.csv", index=False)

# Preview the first rows
print(combined_df.head())

# Summary statistics of the numeric columns
print("\nDataset Summary:")
print(combined_df.describe())

# Bar graph: how many videos each group contributes
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x="like", data=combined_df, ax=ax)
ax.set_title("Number of Videos: Liked vs Random")
ax.set_xlabel("Like")
ax.set_ylabel("Count")
plt.show()

# Box plot: view-count distribution per group
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="like", y="view_count", data=combined_df, ax=ax)
ax.set_title("View Count Comparison: Liked vs Random Videos")
ax.set_xlabel("Like")
ax.set_ylabel("View Count")
ax.set_yscale("log")  # Log scale copes with the wide range of view counts
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
import pandas as pd

# Assuming 'df' is your DataFrame after processing the upload date
# If not already done, process the upload date features:

# Convert 'upload_date' to datetime; unparseable values become NaT
df['upload_date'] = pd.to_datetime(df['upload_date'], errors='coerce')

# Extract year, month, day, weekday, and hour
df['upload_year'] = df['upload_date'].dt.year
df['upload_month'] = df['upload_date'].dt.month
df['upload_day'] = df['upload_date'].dt.day
df['upload_weekday'] = df['upload_date'].dt.weekday  # Monday=0, Sunday=6
df['upload_hour'] = df['upload_date'].dt.hour

# Optionally, drop the original 'upload_date' column if not needed
# df = df.drop(columns=['upload_date'])

# Define the features and target variable
features = [
    'view_count', 'like_count', 'comment_count', 'duration_seconds',
    'category_id', 'upload_year', 'upload_month', 'upload_day', 'upload_weekday', 'upload_hour'
]
target = 'like'

# LogisticRegression cannot handle NaN. Rows whose upload date failed to parse
# (NaT -> NaN date parts) or that miss any other feature are left out of the
# model data; 'df' itself is not modified by this step.
model_df = df.dropna(subset=features + [target])

# Split the data into features (X) and target (y)
X = model_df[features]
y = model_df[target]

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: Feature scaling for numerical features and one-hot encoding for categorical features

# Every column in `features` must appear in one of the two lists below:
# ColumnTransformer drops unlisted columns by default (remainder='drop'),
# which previously discarded all upload-date features without any warning.
# Year and day-of-month are scaled as numbers; month, weekday and hour are
# cyclic, so they are one-hot encoded instead.
numerical_features = [
    'view_count', 'like_count', 'comment_count', 'duration_seconds',
    'upload_year', 'upload_day'
]
categorical_features = ['category_id', 'upload_month', 'upload_weekday', 'upload_hour']

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Create a pipeline with preprocessing and logistic regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train the logistic regression model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# ROC-AUC ranks samples by score, so it needs the predicted probability of the
# positive class (second column, matching model.classes_[1]) rather than the
# thresholded labels, which collapse the curve to a single operating point.
y_score = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"ROC-AUC Score: {roc_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


### Model 4: Neural Network

The fourth model we will use is a **Neural Network**. Neural Networks are powerful machine learning models capable of capturing complex, non-linear relationships in the data. Given the diverse set of features we have, including numerical metadata and textual features, a Neural Network can help discover intricate patterns that may not be easily captured by simpler models like Logistic Regression or Naive Bayes.

Neural Networks consist of layers of interconnected nodes (neurons). The basic structure involves an **input layer**, one or more **hidden layers**, and an **output layer**. Each neuron in a layer takes inputs, applies a weight to each input, sums them, and then applies an **activation function** to introduce non-linearity. Mathematically, for a neuron $j$ in a hidden layer, the output can be represented as:

$$
Z_j = f\left( \sum_{i=1}^n w_{ij}X_i + b_j \right)
$$

where $X_i$ represents the inputs, $w_{ij}$ are the weights associated with each input, $b_j$ is the bias term, and $f$ is the activation function. Common activation functions include **ReLU** (Rectified Linear Unit), **sigmoid**, and **tanh**. In our model, we will use the ReLU activation function for hidden layers to help the network learn non-linear relationships efficiently.

The **output layer** uses a sigmoid activation function to output a probability score for the "liked" class, as this is a binary classification problem. The output can be represented as:

$$
P(y = 1 | X) = \sigma(Z) = \frac{1}{1 + e^{-Z}}
$$

where $\sigma(Z)$ is the sigmoid function. The network will be trained using **backpropagation**, which involves calculating the error at the output, propagating it back through the network, and adjusting the weights using **gradient descent** to minimize the error.

We are using a Neural Network model because it allows us to capture complex relationships within the data, especially when combining both numerical and textual features. The flexibility of Neural Networks makes them suitable for this task, as they can adapt to the various input types and find underlying interactions between features that simpler models might miss.

#### Pseudocode for Neural Network model

1. **Define the structure of the neural network** (input layer, hidden layers, output layer).

2. **Initialize weights and biases** for all neurons to small random values.

3. **For each epoch** (iteration over training data):

   - **Forward Propagation**:

     - Compute the linear combination of inputs and weights for each neuron in the hidden layer:
       $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$, where $A^{[0]}$ is the input data.
     - Apply activation functions (e.g., ReLU for hidden layers):
       $A^{[l]} = f(Z^{[l]})$.
     - Compute the output using a sigmoid activation function in the output layer.

   - **Calculate the Loss**:

     - Use binary cross-entropy loss to measure the error between predictions and actual values.

   - **Backward Propagation**:

     - Compute the gradients of the loss with respect to weights and biases for each layer.
     - Apply the chain rule to propagate errors back through the network and adjust weights accordingly.

   - **Gradient Descent Update**:

     - Update weights and biases using a learning rate to minimize the loss.

4. **Train the model** using the above steps until convergence or maximum epochs are reached.

5. **Make predictions** on the test set by performing forward propagation with the learned weights.


The neural network model is built using a sequential structure consisting of two hidden layers and an output layer. The first hidden layer has 128 neurons and is followed by a dropout layer (rate 0.5) to reduce overfitting; the second hidden layer has 64 neurons. Both hidden layers use the ReLU activation function (Leaky ReLU is a common alternative when the dead-neuron problem of standard ReLU becomes an issue). The output layer is a single neuron with a sigmoid activation function for binary classification, providing output values between 0 and 1. The model is compiled using the Adam optimizer, which helps in efficient training, and binary cross-entropy loss to evaluate the error during the classification task.

In [None]:
# Section 6: Train Neural Network Model

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Seed TensorFlow so weight initialisation and dropout are repeatable
tf.random.set_seed(42)

# Combine Logistic Regression numeric data and Naive Bayes text data for Neural Network
# (scipy.sparse.hstack always returns a sparse matrix)
X_combined_nn_train = hstack([X_train_lr, X_train_nb])
X_combined_nn_test = hstack([X_test_lr, X_test_nb])

# Scale the combined features for neural network.
# with_mean=False keeps the data sparse while scaling; the result is then
# converted to a dense array because Keras' `validation_split` only accepts
# NumPy arrays or tensors and raises on scipy sparse input.
scaler = StandardScaler(with_mean=False)
X_train_nn_scaled = scaler.fit_transform(X_combined_nn_train).toarray()
X_test_nn_scaled = scaler.transform(X_combined_nn_test).toarray()

# Initialize neural network model: 128 -> dropout -> 64 -> 1 (sigmoid)
neural_network_model = Sequential()
neural_network_model.add(Dense(128, input_dim=X_train_nn_scaled.shape[1], activation='relu'))
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(64, activation='relu'))
neural_network_model.add(Dense(1, activation='sigmoid'))

# Compile the neural network
neural_network_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the neural network; the last 20% of the training rows are held out for validation
neural_network_model.fit(X_train_nn_scaled, y_train_lr, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the neural network on the test set
test_loss, test_accuracy = neural_network_model.evaluate(X_test_nn_scaled, y_test_lr)

print(f"Neural Network Test Accuracy: {test_accuracy:.2f}")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Define the target variable
target = 'like'

# Combine title and description into a single text feature
df['combined_text'] = df['title'].astype(str) + ' ' + df['description'].astype(str)

# Convert combined text to a dense bag-of-words matrix using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
text_features = vectorizer.fit_transform(df['combined_text']).toarray()

# Define numerical and categorical features
numerical_features = ['view_count', 'like_ratio', 'duration_seconds']
categorical_features = pd.get_dummies(df['category_id']).values
numerical_data = df[numerical_features].fillna(0).values

# Combine numerical, categorical, and text features.
# The numerical columns come first, which the scaling step below relies on.
# Force a float matrix so the in-place assignment of standardized values can
# never be truncated by an integer dtype.
X = np.hstack((numerical_data, categorical_features, text_features)).astype(float)
y = df[target].values

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling for numerical features (the leading columns only);
# the scaler is fitted on the training split and reused for the test split
scaler = StandardScaler()
X_train[:, :len(numerical_features)] = scaler.fit_transform(X_train[:, :len(numerical_features)])
X_test[:, :len(numerical_features)] = scaler.transform(X_test[:, :len(numerical_features)])

# Convert target variable to one-hot vectors for the 2-unit softmax output
y_train_cat = to_categorical(y_train, num_classes=2)
y_test_cat = to_categorical(y_test, num_classes=2)

# Create Neural Network model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test, y_test_cat))

# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# ROC-AUC is a ranking metric: score it with the softmax probability of
# class 1 instead of the argmax labels, which reduce the curve to one point.
roc_auc = roc_auc_score(y_test, y_pred_prob[:, 1])

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"ROC-AUC Score: {roc_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# NLP with title and description
# English stop words are excluded from the word-frequency counts below
stop_words = {*stopwords.words('english')}

# Make sure the combined title/description text is stored as strings before tokenizing
df['combined_text'] = df['combined_text'].astype(str)

def tokenize_and_filter(text):
    """Split ``text`` into lowercase tokens and keep the informative ones.

    A token is kept when it is purely alphanumeric and is not contained in
    the module-level ``stop_words`` set.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the remaining tokens in their original order.
    """
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

# Apply tokenization
df['tokenized_combined'] = df['combined_text'].apply(tokenize_and_filter)

# Count word frequencies for 'like' and 'not like' groups.
# The token lists are flattened with a comprehension instead of Series.sum():
# summing lists re-copies the growing result for every row (quadratic), and
# for an empty group it returns 0, which makes Counter(0) raise a TypeError.
liked_words = [
    word
    for tokens in df.loc[df['like'] == 1, 'tokenized_combined']
    for word in tokens
]
unliked_words = [
    word
    for tokens in df.loc[df['like'] == 0, 'tokenized_combined']
    for word in tokens
]

liked_word_freq = Counter(liked_words)
unliked_word_freq = Counter(unliked_words)

# Get the most common words in each group
most_common_liked = liked_word_freq.most_common(10)
most_common_unliked = unliked_word_freq.most_common(10)

print("\nMost Common Words in Liked Videos:")
print(most_common_liked)
print("\nMost Common Words in Not Liked Videos:")
print(most_common_unliked)