In [1]:
def Process(link):
    """Run the view-count regression experiment on one trending-videos CSV.

    Steps, in order:

    1. Read the CSV at ``link`` and parse ``trending_date`` / ``publish_time``.
    2. Derive ``NoTags``, ``NoWord``, ``NoChar``, ``TMonths``, ``PMonths`` and
       ``Timediff`` and drop the raw text/id columns.
    3. Binary-encode ``category``, ``TMonths`` and ``PMonths``.
    4. Score every feature against ``views`` with ``f_regression``, print the
       20 best scores and keep the features whose score is > 0.
    5. Drop rows that fall outside 1.5 * IQR on any remaining column.
    6. Fit a random forest, a linear regression and an RBF SVR on
       ``log(views)`` and print, for each, the hold-out RMSE and the mean
       (+/- 2 std) of a 5-fold ``cross_val_score`` on the training split.

    Args:
        link: Path or URL accepted by ``pandas.read_csv``. The file must
            contain the columns ``trending_date`` (``yy.dd.mm``),
            ``publish_time``, ``tags``, ``title``, ``views``, ``category``,
            ``video_id``, ``category_id``, ``thumbnail_link``,
            ``description`` and ``channel_title``.

    Returns:
        None. All results are printed.
    """
    import numpy as np
    import pandas as pd

    import category_encoders as ce
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectKBest, f_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error as mse
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR

    country = pd.read_csv(link)

    # trending_date arrives as 'yy.dd.mm': prefix the century, normalise the
    # separator, then parse as year/day/month.
    trending_text = ('20' + country['trending_date'].astype(str)).str.replace(
        '.', '/', regex=False
    )
    country['trending_date'] = pd.to_datetime(trending_text, format='%Y/%d/%m')
    # tz_localize(None) strips the timezone (not the time of day) so the value
    # can be subtracted from the timezone-naive trending_date below.
    country['publish_time'] = pd.to_datetime(country['publish_time']).dt.tz_localize(None)

    # Hand-crafted features.
    country['NoTags'] = [len(tag.split('|')) for tag in country['tags']]
    country['NoWord'] = [len(title.split(' ')) for title in country['title']]
    country['NoChar'] = [len(title.strip(' ')) for title in country['title']]
    # Months are cast to object so the encoder treats them as categories.
    country['TMonths'] = country['trending_date'].dt.month.astype(object)
    country['PMonths'] = country['publish_time'].dt.month.astype(object)
    country['Timediff'] = (country['trending_date'] - country['publish_time']).dt.days

    # Raw identifiers and free text are not used as model inputs.
    country = country.drop(
        columns=['video_id', 'category_id', 'thumbnail_link', 'description',
                 'title', 'channel_title', 'tags']
    )

    # Categorical encoding.
    encoder = ce.BinaryEncoder(cols=['category', 'TMonths', 'PMonths'])
    encoded = encoder.fit_transform(country)

    # Feature selection: univariate F-score of each feature against views.
    candidates = encoded.drop(['views', 'trending_date', 'publish_time'], axis=1)
    selector = SelectKBest(score_func=f_regression, k='all').fit(candidates, encoded['views'])
    feat_scores = pd.DataFrame({'Features': candidates.columns, 'Score': selector.scores_})
    print(feat_scores.nlargest(20, 'Score'))

    # Select the kept features and the target in a single indexing operation;
    # assigning 'views' onto a previously sliced frame raises
    # SettingWithCopyWarning.
    kept = feat_scores.loc[feat_scores['Score'] > 0, 'Features'].tolist()
    selected = encoded[kept + ['views']]

    # Remove outliers with the 1.5 * IQR rule on every column.
    # NOTE(review): this also applies to boolean / binary-encoded columns,
    # where IQR is often 0, so rows holding the minority value are likely
    # discarded - confirm that this is intended.
    q1 = selected.quantile(0.25)
    q3 = selected.quantile(0.75)
    iqr = q3 - q1
    is_outlier = ((selected < (q1 - 1.5 * iqr)) | (selected > (q3 + 1.5 * iqr))).any(axis=1)
    inliers = selected[~is_outlier]

    # Modeling on log(views).
    X = inliers.drop('views', axis=1)
    y = np.log(inliers['views'])
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=101
    )

    # `degree` only affects the polynomial kernel, so it is not passed to the
    # RBF SVR (its default is 3 anyway).
    models = [
        ('Random forest: RMSE: ',
         RandomForestRegressor(n_estimators=200, max_depth=100, random_state=2)),
        ('linear Regression RMSE: ', LinearRegression()),
        ('Support Vector RMSE: ', SVR(kernel='rbf')),
    ]

    results = []
    for label, estimator in models:
        # The scaler lives inside the pipeline so that it is fitted on
        # training data only - both for the hold-out evaluation and inside
        # every cross-validation fold. Fitting it on the full dataset before
        # splitting leaks test statistics into training.
        pipeline = make_pipeline(StandardScaler(), estimator)
        pipeline.fit(train_X, train_y)
        rmse = mse(test_y, pipeline.predict(test_X)) ** (1 / 2)
        cv_scores = cross_val_score(pipeline, train_X, train_y, cv=5)
        results.append((label, rmse, cv_scores))

    # Report after all models have run. The "Accuracy" figure is the default
    # cross_val_score metric, i.e. the estimator's own score().
    for label, rmse, cv_scores in results:
        print(label, round(rmse, 3),
              'Accuracy: %0.2f (+/- %0.2f)' % (cv_scores.mean(), cv_scores.std() * 2))

In [None]:
# Each cell below runs the Process pipeline on one CSV under Data/Capstone/.
# The file-name prefixes look like two-letter country codes - presumably one
# dataset per country; verify against the data directory.
Process('Data/Capstone/CA-ok.csv')

In [None]:
Process('Data/Capstone/MX-ok.csv')

In [None]:
Process('Data/Capstone/GB-ok.csv')

In [None]:
Process('Data/Capstone/FR-ok.csv')

In [None]:
Process('Data/Capstone/KR-ok.csv')

In [None]:
Process('Data/Capstone/IN-ok.csv')

In [None]:
Process('Data/Capstone/JP-ok.csv')

In [None]:
Process('Data/Capstone/DE-ok.csv')

In [None]:
Process('Data/Capstone/RU-ok.csv')