In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix

# Part 1: Prepare dataset:
# a.Uplode dataset:
url = "https://github.com/kristenauriemma/595Project/raw/main/USvideos.csv"
data = pd.read_csv(url)


# b.Replace all "0" values:
median_likes = data['likes'].median() 
data['likes'] = data['likes'].replace(0, median_likes) 

median_dislikes = data['dislikes'].median() 
data['dislikes'] = data['dislikes'].replace(0, median_dislikes) 

median_comment = data['comment_count'].median() 
data['comment_count'] = data['comment_count'].replace(0, median_comment)

data['publish_time'] = pd.to_datetime(data['publish_time'])
data['publish_year'] = data['publish_time'].dt.year
data['publish_month'] = data['publish_time'].dt.month
data['publish_day'] = data['publish_time'].dt.day

data['trending_date'] = pd.to_datetime(data['trending_date'], format='%y.%d.%m')
data['trending_year'] = data['trending_date'].dt.year
data['trending_month'] = data['trending_date'].dt.month
data['trending_day'] = data['trending_date'].dt.day


# c.Split training and testing dataset:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
#print('train set is ', train_data.shape[0])   #train set is  32759
#print('test set is ', test_data.shape[0])    #test set is  8190


# d.Using 3 feature as training dataset:
x_train_1 = train_data[['likes', 'dislikes', 'comment_count']].values
y_train = train_data['views'].values
x_test_1 = test_data[['likes', 'dislikes', 'comment_count']].values
y_test = test_data['views'].values

# e.Using 6 columns (10 features) as training dataset:
x_train_2 = train_data[['likes', 
                        'dislikes', 
                        'comment_count', 
                        'publish_year', 'publish_month', 'publish_day', 
                        'trending_year', 'trending_month', 'trending_day', 
                        'category_id']]
column_names = x_train_2.columns
print(column_names)
x_train_2 = x_train_2.values

x_test_2 = test_data[['likes', 
                      'dislikes', 
                      'comment_count', 
                      'publish_year', 'publish_month', 'publish_day', 
                      'trending_year', 'trending_month', 'trending_day',
                      'category_id']].values

# f.Using 5 columns (7 features) as training dataset:
x_train_3 = train_data[['likes', 
                        'dislikes', 
                        'comment_count', 
                        'publish_year', 'publish_month', 'publish_day', 
                        'category_id']].values
x_test_3 = test_data[['likes', 
                      'dislikes', 
                      'comment_count', 
                      'publish_year', 'publish_month', 'publish_day', 
                      'category_id']].values

For the above section, I processed the data by selecting relevant columns, filling missing values with the median, and deriving the year, month, and day values from the time column. I prepared three different test datasets to train the model:
1. x_train_1 and x_test_1: These datasets include the most impactful columns: 'likes', 'dislikes', and 'comment_count'. This train set is simple and effective; however, some data points lack values for all three columns, which significantly impacts the results.
2. x_train_2 and x_test_2: In addition to the previous three columns, this train set includes 'category_id', 'publish_time', and 'trending_date'. This dataset contains all relevant columns but requires substantial memory to train the model.
3. x_train_3 and x_test_3: This is the final train set I used for the initial part of my model training. After running the model several times with the previous datasets, I realized that the 'trending_date' column introduced noise, reducing the prediction accuracy instead of providing useful information. Therefore, I removed the 'trending_date' column from this dataset.



In [122]:

# Part 2: Forest and error checking functions:
# a. First forest(base forest):
def forest_model_1 (x,y):
    forest = RandomForestClassifier(n_estimators=50, max_leaf_nodes=100, max_depth = 10, random_state=42)
    forest.fit(x, y)
    print("finish training")
    return forest

# b. MSE and r2:
def mse_r2(a, b):
    mse = mean_squared_error(a, b)
    r2 = r2_score(a, b)
    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R²):", r2)
    return mse, r2

# c. plot the result:
def result_show(a_pred, a_test):
    a_pred = np.array(a_pred)
    a_test = np.array(a_test)
    sample_size = 100
    
    indices = np.linspace(0, len(a_pred) - 1, sample_size, dtype=int)
    a_pred_sampled = a_pred[indices]
    a_test_sampled = a_test[indices]
    
    plt.plot(indices, a_pred_sampled, 'r-', label='Predicted', linewidth=1)
    plt.plot(indices, a_test_sampled, 'b.', label='Actual', markersize=2)
    
    plt.legend() 
    plt.xlabel('Index')  
    plt.ylabel('Values') 
    plt.title('Prediction vs Actual')  
    
    plt.savefig('plot.png')
    
    plt.show()
    
# Part 3.Test functions:
# a. test with 3 feature trainset:
forest_test1 = forest_model_1(x_train_1, y_train)
y_pred1 = forest_test1.predict(x_test_1)
result_show(y_pred1, y_test)
mse_1, r2_1 = mse_r2(y_test, y_pred1)

# b. test with 10 feature trainset:
forest_test2 = forest_model_1(x_train_2, y_train)
y_pred2 = forest_test2.predict(x_test_2)
mse_2, r2_2 = mse_r2(y_test, y_pred2)
result_show(y_pred2, y_test)

# c. test with 7 feature trainset:
forest_test3 = forest_model_1(x_train_3, y_train)
y_pred3 = forest_test3.predict(x_test_3)
mse_2, r2_2 = mse_r2(y_test, y_pred3)
result_show(y_pred3, y_test)


In this section, I build the first version of the random forest model: forest_model_1. In this model, I randomly selected the hyperparameters and ran three datasets to obtain the initial results. Based on the results from trainset 1 and 2:
1. trainset 1 Result: Mean Squared Error (MSE): 11159728343252.725; R-squared (R²): 0.7682357226586797
2. trainset 2 Result: Mean Squared Error (MSE): 17127661628281.486; R-squared (R²): 0.6442941980549763

I began to notice that trainset 2 introduced too much noisy data, which may not be necessary. This is when I decided to add dataset 3 (trainset 3 Result: Mean Squared Error (MSE): 15638183450545.225; R-squared (R²): 0.675227552600961). So far, the dataset with 3 features (x_train_1) seems to work the best.

In [None]:

#Part 4. Based on forest_model_1, adjust the hyperparmeter:
# a. Tune the model:
def forest_model_2 (x,y):
    forest = RandomForestClassifier(n_estimators=100, max_leaf_nodes=100, min_samples_split = 5, min_samples_leaf = 2, max_depth = 10, random_state=42)
    forest.fit(x, y)
    print("finish training")
    return forest

# b, Test result:
# trainset 1:
forest_test1 = forest_model_2(x_train_1, y_train)
y_pred1 = forest_test1.predict(x_test_1)
mse_1, r2_1 = mse_r2(y_test, y_pred1)
result_show(y_pred1, y_test)

# trainset 2:
forest_test2 = forest_model_2(x_train_2, y_train)
y_pred2 = forest_test2.predict(x_test_2)
mse_2, r2_2 = mse_r2(y_test, y_pred2)
result_show(y_pred2, y_test)

# trainset 3:
forest_test3 = forest_model_2(x_train_3, y_train)
y_pred3 = forest_test3.predict(x_test_3)
mse_2, r2_2 = mse_r2(y_test, y_pred3)
result_show(y_pred3, y_test)

I tuned the previous model in 'forest_model_2'. In this section, I tried adding hyperparameters such as 'max_leaf_nodes', 'min_samples_split', and 'min_samples_leaf', as well as adjusting the number of trees in the random forest. Here are some of the hyperparameters that I chose to tune:
1. Result-1: n_estimators=100, max_leaf_nodes=100, min_samples_split = 5, min_samples_leaf = 2, max_depth = 10, random_state=42
2. Result-2: n_estimators=100, max_leaf_nodes=100, max_depth = 20, random_state=42
3. Result-3: n_estimators=150, max_leaf_nodes=150, max_depth = 20, random_state=42 (Too much memory needed to train a model with over 3 features)
4. Result-4: n_estimators=50, max_leaf_nodes=50, max_depth = 15, random_state=42
5. Result-5: n_estimators=100, max_leaf_nodes=100, min_samples_split = 5, min_samples_leaf = 2, max_depth = 12, random_state=42
6. Result-6: n_estimators=100, max_leaf_nodes=100, min_samples_split = 2, min_samples_leaf = 1, max_depth = 10, random_state=42
7. Result-7: n_estimators=100, max_leaf_nodes=100, min_samples_split = 10, min_samples_leaf = 5, max_depth = 10, random_state=42
8. Result-test3_1: n_estimators=150, max_leaf_nodes=100, min_samples_split = 5, min_samples_leaf = 2, max_depth = 10, random_state=42
9. Result-test3-2: n_estimators=80, max_leaf_nodes=100, min_samples_split = 5, min_samples_leaf = 2, max_depth = 10, random_state=42
10. Result-test3-3: n_estimators=100, max_leaf_nodes=100, min_samples_split = 7, min_samples_leaf = 2, max_depth = 10, random_state=42
11. Result-test3-4: n_estimators=100, max_leaf_nodes=100, min_samples_split = 5, min_samples_leaf = 3, max_depth = 10, random_state=42
12. Result-test3-5: n_estimators=100, max_leaf_nodes=90, min_samples_split = 5, min_samples_leaf = 2, max_depth = 10, random_state=42

Part of the result from trainset 1 (x_train_1) is listed below
1. Result-1: Mean Squared Error (MSE): 14101218767308.586; R-squared (R²): 0.7071471028044247
2. Result-2: Mean Squared Error (MSE): 11280566205582.316; R-squared (R²): 0.7657261723383781
3. Result-3: Mean Squared Error (MSE): 11677284747260.941; R-squared (R²): 0.7574871558236401
4. Result-4: Mean Squared Error (MSE): 21704152830206.836; R-squared (R²): 0.5492500228251708
5. Result-5: Mean Squared Error (MSE): 9129241914461.613; R-squared (R²): 0.8104046899798829
6. Result-6: Mean Squared Error (MSE): 11363043631200.883; R-squared (R²): 0.7640132882647244
7. Result-7: Mean Squared Error (MSE): 9780747834352.56; R-squared (R²): 0.7968742711325092

Part of the result from trainset 2 (x_train_2) is listed below:
1. Result-1: Mean Squared Error (MSE): 14268376239509.443; R-squared (R²): 0.703675590814596
2. Result-2: Mean Squared Error (MSE): 20341760804703.42; R-squared (R²): 0.5775440631041426
As I expected before, the 10 features bring too much noisy data to the model, which requires a huge amount of memory and training time, resulting in an unsatisfactory model

Part of the result from trainset 3 (x_train_3) is listed below:
1. Result-1: Mean Squared Error (MSE): 8570610683605.1875: R-squared (R²): 0.8220062952822209
2. Result-2: Mean Squared Error (MSE): 11139439161662.44; R-squared (R²): 0.7686570866349777
3. Result-4: Mean Squared Error (MSE): 16460499335906.984; R-squared (R²): 0.6581497670980205
4. Result-5: Mean Squared Error (MSE): 11852180042836.658; R-squared (R²): 0.7538549453842157
5. Result-6: Mean Squared Error (MSE): 15419108474788.736; R-squared (R²): 0.6797772828343558
6. Result-7: Mean Squared Error (MSE): 10948417476177.715; R-squared (R²): 0.7726242085514942
7. Result-test3-1: Mean Squared Error (MSE): 9114062405514.55; R-squared (R²): 0.8107199367146871
8. Result-test3-2: Mean Squared Error (MSE): 9284269449688.404; R-squared (R²): 0.8071850914767019
9. Result-test3-3: Mean Squared Error (MSE): 13786198584874.736; R-squared (R²): 0.7136894148288795
10. Result-test3-4: Mean Squared Error (MSE): 10924210810998.6; R-squared (R²): 0.7731269304896562
11. Result-test3-5: Mean Squared Error (MSE): 10159916237541.361; R-squared (R²): 0.7889997343828

From the result, I can see that my model starts to overfit several times when the value of the hyperparameter goes too high. With different datasets, different hyperparameters work differently. Based on the result, the trainset 3, which contains 7 features, works best with the current model.

In [None]:

# Part 5: Get deep learning of the dataset: 
# a. Get to know the importance of each feature:
# a-1: function of feature important calculation:
def feature_import(model, x_train):
    importances = model.feature_importances_
    
    feature_importances = pd.DataFrame({
        'Feature': x_train.columns,
        'Importance': importances
    })
    feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

    print(feature_importances)

# a-2: Calculate the importance of each features:
forest_test2 = forest_model_2(x_train_2, y_train)
x_train_2_df = pd.DataFrame(x_train_2, columns=column_names)
feature_import(forest_test2, x_train_2_df)
y_pred2 = forest_test2.predict(x_test_2)
mse_2, r2_2 = mse_r2(y_test, y_pred2)
result_show(y_pred2, y_test)


By calculating the feature_importances_, the result shows as follows:

0. |          Feature    | Importance  |
1. | 0           likes   |   0.197535  |
2. | 2   comment_count   |   0.182441  |
3. | 1        dislikes   |   0.156043  |
4. | 8    trending_day   |   0.131839  |
5. | 5     publish_day   |   0.092326  |
6. | 9     category_id   |   0.086993  |
7. | 4   publish_month   |   0.053832  |
8. | 7  trending_month   |   0.044207  |
9. | 3    publish_year   |   0.043077  |
10. | 6   trending_year   |   0.011708  |

I learned which features affect the model's prediction the most. For this particular dataset, if the columns 'likes', 'dislikes', and 'comment_count' contain 0, it specifically means that no data is available. Since only a small amount of data falls into this category, I removed these meaningless entries and retrained the model to evaluate whether it improves the performance of my best model. Based on these factors, I modified the training set accordingly below:


In [128]:

# b. Modified dataset again:
no_zero_data = pd.read_csv(url)

num_rows = len(no_zero_data)
print(f"Number of rows before: {num_rows}") # Number of rows before: 40949

no_zero_data = no_zero_data[(no_zero_data[['likes', 'dislikes', 'comment_count']] != 0).all(axis=1)]
num_rows = len(no_zero_data)
print(f"Number of rows after: {num_rows}") # Number of rows after: 40003

no_zero_data['publish_time'] = pd.to_datetime(no_zero_data['publish_time'])
no_zero_data['publish_day'] = no_zero_data['publish_time'].dt.day

no_zero_data['trending_date'] = pd.to_datetime(no_zero_data['trending_date'], format='%y.%d.%m')
no_zero_data['trending_day'] = no_zero_data['trending_date'].dt.day

no_zero_train, no_zero_test= train_test_split(no_zero_data, test_size=0.2, random_state=42)
print('train set is ', no_zero_train.shape[0]) # train set is  32002
print('test set is ', no_zero_test.shape[0]) # test set is  8001

no_zero_trainX= no_zero_train[['likes', 'dislikes', 'comment_count', 'publish_day', 'trending_day', 'category_id']].values
no_zero_trainY = no_zero_train['views'].values
no_zero_testX = no_zero_test[['likes', 'dislikes', 'comment_count', 'publish_day', 'trending_day', 'category_id']].values
no_zero_testY = no_zero_test['views'].values


Number of rows before: 40949
Number of rows after: 40003
train set is  32002
test set is  8001


In [None]:

# b-1: test new dataset with best model:
print('start test:')
no_zero_test = forest_model_2(no_zero_trainX, no_zero_trainY)
no_zero_y = no_zero_test.predict(no_zero_testX)
mse_1, r2_1 = mse_r2(no_zero_testY, no_zero_y)
result_show(no_zero_y, no_zero_testY)


Here is the result of no_zero dataset on the best model:
1. Mean Squared Error (MSE): 13283701336064.434
2. R-squared (R²): 0.8140916033865432

From the result, the non zero dataset isn't really improve the performance of the model.

In [None]:

# b-2: test with boostrap:
print('start test:')
no_zero_trainX_df = pd.DataFrame(no_zero_trainX)
bootstrapped_x_train = no_zero_trainX_df.sample(n=len(no_zero_trainX_df), replace=True)
bootstrap_test = forest_model_2(bootstrapped_x_train, no_zero_trainY)
bootstrap_y = bootstrap_test.predict(no_zero_testX)
mse_1, r2_1 = mse_r2(no_zero_testY, bootstrap_y )
result_show(bootstrap_y , no_zero_testY)


Here is the result of the bootstrapped dataset on the best model:
1. Mean Squared Error (MSE): 85638016138213.78;
2. R-squared (R²): -0.19852335329074267

![Plot Result](../plot.png)



Oh, I got the first negative R-squared result. From the result, the bootstrapped dataset actually decreased the performance of the model.

In [None]:
# c. refill the missing values with the mean: 

mean_data = pd.read_csv(url)

columns_to_replace = ['likes', 'dislikes', 'comment_count']
for column in columns_to_replace:
    mean_data[column] = mean_data[column].replace(0, mean_data[column].mean())

mean_data['publish_time'] = pd.to_datetime(mean_data['publish_time'])
mean_data['publish_day'] = mean_data['publish_time'].dt.day

mean_data['trending_date'] = pd.to_datetime(mean_data['trending_date'], format='%y.%d.%m')
mean_data['trending_day'] = mean_data['trending_date'].dt.day

mean_train, mean_test= train_test_split(mean_data, test_size=0.2, random_state=42)
print('train set is ', mean_train.shape[0])
print('test set is ', mean_test.shape[0])

mean_trainX= mean_train[['likes', 'dislikes', 'comment_count', 'publish_day', 'trending_day', 'category_id']].values
mean_trainY = mean_train['views'].values
mean_testX = mean_test[['likes', 'dislikes', 'comment_count', 'publish_day', 'trending_day', 'category_id']].values
mean_testY = mean_test['views'].values


# c-1: test with this new mean dataset:
print('test start:')
mean_test = forest_model_2(mean_trainX, mean_trainY)
mean_y = mean_test.predict(mean_testX)
mse_1, r2_1 = mse_r2(mean_testY, mean_y)
result_show(mean_y, mean_testY)


In the last part, I used the mean instead of the median to fill all 0 values and trained the model with the best configuration. By observing the results as follows:
1. Mean Squared Error (MSE): 17244960096747.096
2. R-squared (R²): 0.6418581535616878


![Plot Result](../plot.png)

The modification to the dataset didn't significantly impact the prediction accuracy of the model. Tuning the hyperparameters seemed to be more helpful for improving the performance of the model.