In [None]:
# Step 1: Import the datasets and libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense

# Loading the dataset
data = pd.read_csv('youtube_adview.csv')

# Quick look at the size and the column dtypes of the freshly loaded data
print(data.shape)
print(data.dtypes)

# Step 2: Visualise the dataset using plotting using heatmaps and plots.

# Correlate the numeric columns only: text columns cannot be correlated and
# make DataFrame.corr() raise on pandas >= 2.0.
f, ax = plt.subplots(figsize=(10, 8))
corr = data.select_dtypes(include='number').corr()
sns.heatmap(
    corr,
    # All-False mask, i.e. no cell is hidden. The built-in `bool` replaces the
    # `np.bool` alias, which was removed in NumPy 1.24.
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
    square=True,
    ax=ax,
    annot=True,
)
plt.show()

# Assigning each category letter a number (letters missing from the map become NaN)
category = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8}
data['category'] = data['category'].map(category)

# Drop every row in which one of the count columns holds the string 'F'
for count_column in ('views', 'likes', 'dislikes', 'comment'):
    data = data[data[count_column] != 'F']

In [None]:
# Step 3: Clean the dataset by removing missing values and other things.

# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values, then exact duplicate rows. The explicit
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the previously filtered DataFrame.
data = data.dropna().drop_duplicates().copy()

# Convert the count columns and the target to numeric dtype
# (pd.to_numeric raises if a value cannot be parsed).
for numeric_column in ('views', 'likes', 'dislikes', 'comment', 'adview'):
    data[numeric_column] = pd.to_numeric(data[numeric_column])

# Step 4: Transform attributes into numerical values and other necessary transformations

# Each column gets its own LabelEncoder, mapping its distinct values to integer codes
for encoded_column in ('duration', 'vidid', 'published'):
    data[encoded_column] = LabelEncoder().fit_transform(data[encoded_column])

# plt.show() must actually be *called*: a bare `plt.show` is a no-op, which
# leaves all three plots drawn on top of each other in one figure.
plt.hist(data['category'])
plt.title('Distribution of category')
plt.show()

plt.plot(data['adview'])
plt.title('adview before outlier removal')
plt.show()

# Keep only rows whose adview is below the cap
ADVIEW_CAP = 2_000_000
data = data[data['adview'] < ADVIEW_CAP]

plt.plot(data['adview'])
plt.title('adview after outlier removal')
plt.show()

In [None]:
# Step 5: Normalise your data and split the data into training, validation and test set in the appropriate ratio.

# Select features and target
X = data[['views', 'likes', 'dislikes', 'comment',
          'published', 'duration', 'category']]
y = data['adview']

# Split the raw data into training (60%), validation (20%) and test (20%) sets:
# first hold out 40% of the rows, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the feature data. The scaler is fitted on the training split only
# and the same fitted transform is applied to every split, so no validation or
# test statistics leak into training and all splits share one scale.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Full feature matrix on the training-fitted scale (kept so the name stays defined)
X_scaled = scaler.transform(X)

In [None]:
# Define function to print Error
from sklearn import metrics


def print_Error(X_test, y_test, model_name):
    """Print the MAE, MSE and RMSE of a fitted model on the given data.

    Args:
        X_test: Feature matrix passed to the model's ``predict`` method.
        y_test: True target values the predictions are compared against.
        model_name: The fitted model object itself (despite the parameter
            name); it must expose ``predict``. Its repr is printed on each line.

    Returns:
        None. The three error values are written to stdout.
    """
    y_hat = model_name.predict(X_test)

    # (label, scoring function) pairs, evaluated and printed in this order
    reports = (
        ("Mean Absolute error of ", metrics.mean_absolute_error),
        ("Mean Squared error of ", metrics.mean_squared_error),
        ("Root Mean Squared error of ",
         lambda actual, predicted: np.sqrt(
             metrics.mean_squared_error(actual, predicted))),
    )
    for label, score_fn in reports:
        print(label, model_name, score_fn(y_test, y_hat))

In [None]:
# Step 6: Use linear regression, Support Vector Regressor for training and get errors.

# --- Linear Regression ---
# Fit on the training split, score on the validation split, then report test errors.
linear_reg = LinearRegression().fit(X_train, y_train)
y_pred_val_linear = linear_reg.predict(X_val)
error_linear = mean_squared_error(y_val, y_pred_val_linear)
print(f'Linear Regression MSE: {error_linear}')
print_Error(X_test, y_test, linear_reg)
print("\n")

# --- Support Vector Regressor (default hyperparameters) ---
# Same procedure: fit on train, validation MSE, then test-set error report.
svr_reg = SVR().fit(X_train, y_train)
y_pred_val_svr = svr_reg.predict(X_val)
error_svr = mean_squared_error(y_val, y_pred_val_svr)
print(f'SVR MSE: {error_svr}')
print_Error(X_test, y_test, svr_reg)

In [None]:
# Step 7: Use Decision Tree Regressor and Random Forest Regressors.

# Both estimators are randomised; seeding them with the same random_state used
# for the data splits makes the reported errors reproducible across runs.

# Initialize the Decision Tree Regressor model
dt_reg = DecisionTreeRegressor(random_state=42)
# Train the model
dt_reg.fit(X_train, y_train)
# Predict on validation set
y_pred_val_dt = dt_reg.predict(X_val)
# Calculate error
error_dt = mean_squared_error(y_val, y_pred_val_dt)
print(f'Decision Tree MSE: {error_dt}')
print_Error(X_test, y_test, dt_reg)
print("\n")

# Initialize the Random Forest Regressor model
rf_reg = RandomForestRegressor(
    n_estimators=200, max_depth=25, min_samples_split=15, min_samples_leaf=5,
    random_state=42)
# Train the model
rf_reg.fit(X_train, y_train)
# Predict on validation set
y_pred_val_rf = rf_reg.predict(X_val)
# Calculate error
error_rf = mean_squared_error(y_val, y_pred_val_rf)
print(f'Random Forest MSE: {error_rf}')
print_Error(X_test, y_test, rf_reg)

In [None]:
# Step 8: Build an artificial neural network and train it with different layers and hyperparameters.

# Fully connected network: 128 -> 64 ReLU units, then a single linear output unit
n_features = X_train.shape[1]
ann = Sequential([
    Dense(units=128, activation='relu', input_dim=n_features),
    Dense(units=64, activation='relu'),
    Dense(units=1, activation='linear'),
])

# Adam optimizer minimising mean squared error
ann.compile(loss='mean_squared_error', optimizer='adam')

# Train for 100 epochs in batches of 32, monitoring loss on the validation split
ann.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
)

# Validation-set predictions and their MSE
y_pred_val_ann = ann.predict(X_val)
error_ann = mean_squared_error(y_val, y_pred_val_ann)
print(f'ANN MSE: {error_ann}')

In [None]:
# Step 9: Pick the best model based on error as well as generalisation.

# Validation MSE of every trained model, keyed by a display name
model_errors = dict([
    ('Linear Regression', error_linear),
    ('SVR', error_svr),
    ('Decision Tree', error_dt),
    ('Random Forest', error_rf),
    ('ANN', error_ann),
])

# The winner is the name with the smallest validation MSE
# (on a tie, the earliest entry above wins).
best_model, _ = min(model_errors.items(), key=lambda entry: entry[1])
print(f'The best model is: {best_model}')


# The test set predictions can now be used for further analysis or creating a submission file, etc.

In [None]:
# Step 10: Save your model and predict on the test set.

# Rebind `best_model` from the winning model's name (the string chosen in
# Step 9) to an actual estimator object. The Random Forest is hard-coded here
# rather than looked up from the Step 9 result.
# NOTE(review): if Step 9 reports a different winner, this line still selects
# rf_reg - confirm that is intended.
best_model = rf_reg

In [None]:
# Persist the selected model with joblib
import joblib

MODEL_PATH = 'best_youtube_adview_predictor.pkl'
joblib.dump(best_model, MODEL_PATH)

# Round-trip check: reload the saved model and predict on the test set (for demonstration)
loaded_model = joblib.load(MODEL_PATH)
test_predictions = loaded_model.predict(X_test)
print(test_predictions)

In [None]:
# Save the trained ANN to its own file, separate from the joblib-pickled model.
ann.save("ann_youtube_adview.h5")