In [56]:


import numpy as np # linear algebra
import pandas as pd # data processing, 
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import os
# List every file found under the Kaggle input directory, one full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)


In [57]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-conversion
# warnings raised by the libraries used below; consider narrowing it.
warnings.filterwarnings('ignore')

In [58]:
# Load the training data into the working frame `df`.
df = pd.read_csv("../input/youtube-adview-dataset/train.csv")

In [59]:
# Preview the first rows of the training data.
df.head()

In [60]:
# (rows, columns) of the training frame.
df.shape

In [61]:
# Visualisation: one figure per column of interest.
# Histogram of the category column.
fig, ax = plt.subplots()
ax.hist(df["category"])
plt.show()
# Line plot of the adview column in row order.
fig, ax = plt.subplots()
ax.plot(df["adview"])
plt.show()

In [62]:
# Keep only the videos whose adview is below 2,000,000.
df = df.loc[df["adview"] < 2000000]

In [63]:
# Heatmap
import seaborn as sns
f, ax = plt.subplots(figsize=(10, 8))
# Correlate only the numeric/bool columns. Selecting them explicitly keeps the
# cell working on pandas >= 2.0, where DataFrame.corr() raises on object
# columns instead of silently dropping them.
corr = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(
    corr,
    # All-False mask, i.e. no cell is hidden. The builtin `bool` replaces
    # `np.bool`, which was removed in NumPy 1.24.
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
    annot=True,
)
plt.show()

In [64]:
# Show the dtype of every column.
df.dtypes

In [65]:
# Encode the category letters A-H as the integers 1-8.
category = dict(zip("ABCDEFGH", range(1, 9)))
df["category"] = df["category"].map(category)
df.head()

In [66]:
# Drop every row in which views, likes, dislikes or comment holds the
# placeholder string 'F'.
has_placeholder = (df[["views", "likes", "dislikes", "comment"]] == 'F').any(axis=1)
df = df[~has_placeholder]

In [67]:
# Convert the count columns and the target to numeric dtypes.
for numeric_col in ("views", "comment", "likes", "dislikes", "adview"):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

In [68]:
# Summarise columns, non-null counts and dtypes after the numeric conversion.
df.info()

In [69]:
# Keep a reference to the vidid column as it is before label encoding.
column_vidid=df['vidid']

In [70]:
from sklearn.preprocessing import LabelEncoder

In [71]:
# Replace the raw duration, vidid and published values with integer labels,
# using an independent encoder for each column.
for text_col in ('duration', 'vidid', 'published'):
    df[text_col] = LabelEncoder().fit_transform(df[text_col])

In [72]:
# Preview the frame after label encoding.
df.head()

In [73]:
import datetime
import time

In [74]:
def checki(x):
  """Turn a duration string such as ``PT1H2M3S`` into ``h:m:s`` text.

  The first two characters of ``x`` are skipped. The remaining characters are
  read left to right: characters accumulate until an ``H``, ``M`` or ``S``
  marker is met, which stores the accumulated text as hours, minutes or
  seconds respectively. Any field that is absent defaults to ``'00'``.
  Present fields are kept as written (no zero padding).

  Parameters
  ----------
  x : str
      Duration text, e.g. ``'PT5M'`` or ``'PT1H2M3S'``.

  Returns
  -------
  str
      ``'<hours>:<minutes>:<seconds>'``, e.g. ``'00:5:00'``.
  """
  hours = ''
  minutes = ''
  seconds = ''
  digits = ''
  for ch in x[2:]:
    if ch == 'H':
      hours, digits = digits, ''
    elif ch == 'M':
      minutes, digits = digits, ''
    elif ch == 'S':
      seconds, digits = digits, ''
    else:
      digits += ch
  if hours == '':
    hours = '00'
  if minutes == '':
    minutes = '00'
  # A missing seconds field must default the *seconds*; the previous code reset
  # the minutes here, which lost the minutes and left the seconds empty.
  if seconds == '':
    seconds = '00'
  return hours + ':' + minutes + ':' + seconds
# Re-read the raw training file to get the original duration strings (the
# `duration` column of `df` has been label-encoded). The file is parsed once
# and the column taken from that frame instead of reading the CSV twice.
train = pd.read_csv("../input/youtube-adview-dataset/train.csv")
mp = train["duration"]
# NOTE(review): this rebinds the name `time`, shadowing the `time` module
# imported above; the name is kept because later cells read it.
time = mp.apply(checki)

In [75]:
# Column summary of the working frame.
df.info()

In [76]:
def func_sec(time_string):
  """Convert ``'h:m:s'`` text into a total number of seconds.

  Parameters
  ----------
  time_string : str
      Three colon-separated integer fields (hours, minutes, seconds).
      An empty field is treated as 0.

  Returns
  -------
  int
      ``hours * 3600 + minutes * 60 + seconds``.

  Raises
  ------
  ValueError
      If there are not exactly three fields or a non-empty field is not an
      integer.
  """
  h, m, s = (int(part) if part else 0 for part in time_string.split(':'))
  return h * 3600 + m * 60 + s

# These statements used to be indented below the `return` above, so they never
# ran and `duration` kept its label-encoded values. They now run at cell level.
time1 = time.apply(func_sec)
# Assignment aligns on the row index, so rows already filtered out of `df`
# are simply ignored.
df["duration"] = time1
df.head()

In [77]:
# Split Data
# Select the target by name rather than by position (`iloc[:, 1]`), so the
# split does not silently pick another column if the column order changes.
Y_train = pd.DataFrame(data = df["adview"].values, columns = ['target'])
# Remove the target and the video id from the feature frame.
df = df.drop(columns=["adview", "vidid"])

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df, Y_train, test_size=0.2, random_state=42)

X_train.shape



In [78]:
# Normalise Data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn the per-feature min/max from the training split only ...
X_train = scaler.fit_transform(X_train)
# ... and apply that same mapping to the hold-out split. Re-fitting here (the
# previous `fit_transform`) scaled the hold-out rows by their own min/max, so
# the models were evaluated on inputs scaled differently from the training data.
X_test = scaler.transform(X_test)

X_train.mean()

In [79]:
# Evaluation Metrics
from sklearn import metrics
def print_error(X_test, y_test, model_name):
  """Print MAE, MSE and RMSE of a fitted model on the given evaluation data.

  ``model_name`` is the fitted model object itself (anything exposing
  ``predict``); ``X_test`` and ``y_test`` are the features and true targets.
  """
  predicted = model_name.predict(X_test)
  mae = metrics.mean_absolute_error(y_test, predicted)
  mse = metrics.mean_squared_error(y_test, predicted)
  print('Mean Absolute Error:', mae)
  print('Mean Squared Error:', mse)
  print('Root Mean Squared Error:', np.sqrt(mse))


In [80]:
# Linear Regression
from sklearn import linear_model
# Fit a linear regression on the training split and report its hold-out errors.
linear_regression = linear_model.LinearRegression().fit(X_train, y_train)
print_error(X_test, y_test, linear_regression)

In [81]:
# Support Vector Regressor
from sklearn.svm import SVR
# Fit a support vector regressor with default hyper-parameters and report its
# hold-out errors.
supportvector_regressor = SVR()
supportvector_regressor.fit(X=X_train, y=y_train)
print_error(X_test, y_test, supportvector_regressor)

In [82]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so the reported errors are reproducible across re-runs
# (same seed as the train/test split).
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(X_train, y_train)
print_error(X_test,y_test, decision_tree)

In [83]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# Forest hyper-parameters.
n_estimators = 200
max_depth = 25
min_samples_split=15
min_samples_leaf=2
random_forest = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    # Previously defined above but never passed, so the default was used.
    min_samples_leaf=min_samples_leaf,
    # Seeded so the reported errors are reproducible (same seed as the split).
    random_state=42,
)
random_forest.fit(X_train,y_train)
print_error(X_test,y_test, random_forest)

In [84]:
# Artificial Neural Network
from tensorflow import keras
from keras.layers import Dense

In [85]:
# Small fully connected regressor: two hidden ReLU layers of 6 units each and
# a single linear output unit.
# The layers are taken from the same `keras` namespace (tensorflow.keras) as
# the model, optimizer and loss. Mixing them with layers from the standalone
# `keras` package fails on installations where the two packages differ.
ann = keras.models.Sequential([
    keras.layers.Dense(6, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(6, activation="relu"),
    keras.layers.Dense(1),
])

optimizer = keras.optimizers.Adam()
loss = keras.losses.mean_squared_error
ann.compile(optimizer=optimizer, loss=loss, metrics=["mean_squared_error"])

# Train for 100 epochs on the training split.
history = ann.fit(X_train, y_train, epochs=100)

print_error(X_test, y_test, ann)

In [86]:
# Persist the fitted support vector regressor with joblib.
import joblib
svr_model_path = "SVR_youtubeadview.pkl"
joblib.dump(supportvector_regressor, svr_model_path)
# Persist the Keras network in HDF5 format.
ann_model_path = "ann_youtubeadview.h5"
ann.save(ann_model_path)

In [87]:
# Load the test data into `dft`.
dft = pd.read_csv("../input/youtube-adview-dataset/test.csv")

In [88]:
# Preview the first rows of the test data.
dft.head()

In [89]:
from keras.models import load_model
# Reload the network from the file written when the models were saved.
model = load_model("ann_youtubeadview.h5")

In [90]:
# Drop every test row in which a count column holds the placeholder "F".
for count_col in ("views", "likes", "dislikes", "comment"):
    dft = dft[dft[count_col] != 'F']

In [91]:
# Preview the test data after removing the placeholder rows.
dft.head()

In [92]:
# Map each category letter (A-H) to its number (1-8) for the category feature.
category = {letter: code for code, letter in enumerate("ABCDEFGH", start=1)}
dft["category"] = dft["category"].map(category)
dft.head()

In [93]:
# Convert views, comment, likes and dislikes to numeric dtypes.
for numeric_col in ("views", "comment", "likes", "dislikes"):
    dft[numeric_col] = pd.to_numeric(dft[numeric_col])
# Keep a reference to the vidid column as it is before label encoding.
column_vidid = dft['vidid']

# Encoding the duration, vidid and published features as integer labels,
# with an independent encoder for each column.
from sklearn.preprocessing import LabelEncoder
for text_col in ('duration', 'vidid', 'published'):
    dft[text_col] = LabelEncoder().fit_transform(dft[text_col])
dft.head()

In [94]:
def checki(x):
  """Turn a duration string such as ``PT1H2M3S`` into ``h:m:s`` text.

  The first two characters are skipped. Characters accumulate until an
  ``H``, ``M`` or ``S`` marker stores them as the matching field. A field
  that is absent (or empty) becomes ``'00'``; present fields are not padded.
  """
  fields = {'H': '', 'M': '', 'S': ''}
  pending = ''
  for ch in x[2:]:
    if ch in fields:
      # A unit marker closes the run of characters collected so far.
      fields[ch] = pending
      pending = ''
    else:
      pending += ch
  return ':'.join(fields[unit] or '00' for unit in ('H', 'M', 'S'))

# Re-read the raw test file to get the original duration strings (the
# `duration` column of `dft` has been label-encoded). The file is parsed once
# and the column taken from that frame instead of reading the CSV twice.
# NOTE(review): the name `train` holds the *test* file here.
train = pd.read_csv("../input/youtube-adview-dataset/test.csv")
mp = train["duration"]
time = mp.apply(checki)

def func_sec(time_string):
  """Return the total seconds for ``'h:m:s'`` text (three integer fields)."""
  hours, minutes, seconds = (int(field) for field in time_string.split(':'))
  return (hours * 60 + minutes) * 60 + seconds

# Duration of every raw test row in seconds.
time1 = time.apply(func_sec)

# Replace the label-encoded duration with seconds; values are matched on the
# row index, so rows already filtered out of `dft` are ignored.
dft = dft.assign(duration=time1)
dft.head()

In [95]:
# Remove the video id column from the test features.
dft = dft.drop(columns="vidid")
dft.head()

In [96]:
from sklearn.preprocessing import MinMaxScaler
# Scale the test features with the `scaler` already fitted earlier on the
# train.csv features. Fitting a fresh MinMaxScaler on the test set would map it
# to its own min/max and feed the model inputs on a different scale from the
# one it was trained on.
X_test = scaler.transform(dft)

In [97]:
# Predict with the reloaded network on the scaled test features.
prediction = model.predict(X_test)

In [98]:
# Wrap the raw predictions in a DataFrame and summarise it.
prediction=pd.DataFrame(prediction)
prediction.info()

In [99]:
# Give the prediction column (labelled 0) a descriptive name.
prediction = prediction.rename(columns={0: "Adview"})

In [100]:
# Preview the first predictions.
prediction.head()

In [101]:
# Write the predictions to disk; with pandas' defaults the DataFrame index is
# written as the first CSV column.
prediction.to_csv('predictions.csv')