In [12]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
filepath="youtube_channels_1M_clean.csv"
model = 'Gradient Boosting'
subscribers = 1  #  1: Predict subscrbers  ;  0: Predict total views

#df = pd.read_csv(filepath, sep="\t", encoding='utf-8')
df = pd.read_csv(filepath)
print("******* Loading Data ********")
print("No. of rows: {}".format(df.shape[0]))
print("No. of columns: {}".format(df.shape[1]))





******* Loading Data ********
No. of rows: 1095242
No. of columns: 16


In [14]:
print("\nSelecting columns needed for processing: channel_id, channel_name, subscriber_count, description, keywords, total_views")

df = df[['channel_id', 'channel_name', 'subscriber_count', 'description', 'keywords', 'total_views']]
df['description'] = df['description'].astype(str)

df["description"] = df["description"].apply(lambda x: x.strip())


df=df.dropna()
df['total_views'] = df['total_views'].astype(int)
print("No. of rows (After dropping null): {}".format(df.shape[0]))
print("No. of columns: {}".format(df.shape[1]))


df.drop_duplicates(subset=['channel_id'], keep='first', inplace=True)
print("No of rows (After removing duplicates): {}".format(df.shape[0]))


df.head()


Selecting columns needed for processing: channel_id, channel_name, subscriber_count, description, keywords, total_views
No. of rows (After dropping null): 546994
No. of columns: 6
No of rows (After removing duplicates): 546994


Unnamed: 0,channel_id,channel_name,subscriber_count,description,keywords,total_views
1,UC28mqg7IlYWEhrZwHb72IQA,Food 'n' Happiness,0,"Hello viewers.\n I am Veena from Mangalore, Ka...","food n happiness, food and happiness, food, Fo...",592961
2,UCoLwWY9zQ7Jp8aDtYUszmYg,Tim Shieff,166000,The journey of rediscovery.\n\nhttps://rdscvr.com,"Tim, shieff, timothy, health, human, spiritual...",27250763
3,UCAQOeJwsgBMC74-OjjcQcJA,Jerry & Julie Music,1090,Welcome to Jerry & Julie Music. We hope you w...,"jerryandjuliemusic, jerryspianobar, juliesguit...",339906
5,UCkcc9W34khoJeQA6Eypp0JA,Burhan & Zohan,617,Welcome To My Channel ..!!♥️ Hi I Am Mommy Of ...,"Burhan & Zohan, cooking and vlogging, lifestyl...",36995
6,UCmDLg3tp5998ODVPHv51aPg,Jenny taylor,278,Let's get healthy!\n\nWelcome to my channel! \...,"Disease proof, health",54293


In [None]:
df['total_views'].median()
df['subscriber_count'].median()

In [15]:
df["total_views"] = df["total_views"].apply(lambda x: 1 if x >232200.5 else 0)
df["subscriber_count"] = df["subscriber_count"].apply(lambda x: 1 if x >1190 else 0)


In [16]:
def fit_model(X, y, model):
    
    """
      Seceral fitting models to choose from:
      Decision Tree, SVM, Logistic Regression, Random Forest, Gradient Boosting
    """
    
    if model=='Decision Tree':
        model = DecisionTreeClassifier(max_depth=2).fit(X, y)
    elif model=='SVM':
        model = SVC(kernel='linear', probability=True).fit(X, y)
    elif model=='Logistic Regression':
        model = LogisticRegression().fit(X, y)   
    elif model=='Random Forest':
        model = RandomForestClassifier(max_depth=3).fit(X, y)
    elif model=='Gradient Boosting':
        model = GradientBoostingClassifier().fit(X, y)
    
    return model

In [17]:
def evaluate_model(y, y_pred):
    #evaluate model performance

    
    
    print('\n************* Model Evaluation *************\n')
    
    print('Confusion Matrix:\n')
    print(confusion_matrix(y, y_pred))
    print('\nClassification Report:\n')
    print(classification_report(y, y_pred))

In [18]:
def preprocess(X_data__raw):
    # Preprocess data

    X_data=X_data__raw.iloc[:, 3].astype(str)

    # Convert characters to lowercase
    X_data = X_data.map(lambda x: x.lower())
    X_data = X_data.str.replace('[^\w\s]', '')

    # Tokenize sentence
    X_data = X_data.apply(nltk.word_tokenize)

    # Remove stopwords
    stopword_list = stopwords.words("english")
    X_data = X_data.apply(lambda x: [word for word in x if word not in stopword_list])

    # Stemming
    stemmer = PorterStemmer()
    X_data = X_data.apply(lambda x: [stemmer.stem(y) for y in x])
    X_data = X_data.apply(lambda x: " ".join(x))

    return X_data

In [19]:
if subscribers:
    X, y = df.iloc[:, :-1], df.iloc[:, 2]
else:
    X, y = df.iloc[:, :-1], df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=69, stratify=y_test)

print("Train Data: {}".format(X_train.shape))
print("Validation Data: {}".format(X_val.shape))
print("Test Data: {}".format(X_test.shape))

print('\nClass Counts(label, row): Train')
print(y_train.value_counts())
print('\nClass Counts(label, row): Val')
print(y_val.value_counts())
print('\nClass Counts(label, row): Test')
print(y_test.value_counts())

print("\nData View: X Train")
print(X_train.head(3))
print("\nData View: X Val")
print(X_val.head(3))
print("\nData View: X Test")
print(X_test.head(3))

# Reset index   
X_train=X_train.reset_index(drop=True)
X_val=X_val.reset_index(drop=True)
X_test=X_test.reset_index(drop=True)

y_train=y_train.reset_index(drop=True)
y_val=y_val.reset_index(drop=True)
y_test=y_test.reset_index(drop=True)


# Preprocess data

X_train_processed = preprocess(X_train)

print("\n************* Pre-processed Data ***************")
print("\nTrain Data: ", X_train_processed.shape)
print("\nData View: X Train")
print(X_train_processed.head(3))




Train Data: (437595, 5)
Validation Data: (54699, 5)
Test Data: (54700, 5)

Class Counts(label, row): Train
0    219355
1    218240
Name: subscriber_count, dtype: int64

Class Counts(label, row): Val
0    27419
1    27280
Name: subscriber_count, dtype: int64

Class Counts(label, row): Test
0    27420
1    27280
Name: subscriber_count, dtype: int64

Data View: X Train
                      channel_id              channel_name  subscriber_count  \
92632   UC8wXC0ZCfGt3HaVLy_fdTQw            Digital Trends                 1   
772390  UCVEpo47Y0mIeOm0kJOgqrCg  Connecticut State Police                 1   
779043  UCV7W0JDXFLeaVU9ER-S4w6g         Internet e Coisas                 1   

                                              description  \
92632   Digital Trends was founded in 2006 with a simp...   
772390  This is the official YouTube Channel of the Co...   
779043  Você é Maker de Internet das Coisas? Então ess...   

                                                 keywords  
92632

  X_data = X_data.str.replace('[^\w\s]', '')



************* Pre-processed Data ***************

Train Data:  (437595,)

Data View: X Train
0    digit trend found 2006 simpl mission give read...
1    offici youtub channel connecticut state polic ...
2    você é maker de internet da coisa então ess é ...
Name: description, dtype: object


In [20]:
# transforming data

count_vect = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
counts = count_vect.fit_transform(X_train_processed)
transformer = TfidfTransformer(smooth_idf=True, use_idf=True).fit(counts)
X_train_transformed = transformer.transform(counts)

X_train_t = X_train_transformed
y_train_t = y_train





In [21]:
# model fitting


print("\n************** Training Model: " + model + " *************")
y_train_t = y_train_t.astype(int)
model = fit_model(X_train_t, y_train_t, model=model)

## prediction

print("\n\n*************** Getting predictions **************")

X_test_processed = preprocess(X_test)
counts_test = count_vect.transform(X_test_processed)
X_test_t = transformer.transform(counts_test)
y_pred = model.predict(X_test_t)

## 7. Evaluating model performance

print("\n************** Evaluating performance **************")

evaluate_model(y_test, y_pred)



************** Training Model: Gradient Boosting *************


*************** Getting predictions **************


  X_data = X_data.str.replace('[^\w\s]', '')



************** Evaluating performance **************

************* Model Evaluation *************

Confusion Matrix:

[[15360 12060]
 [11008 16272]]

Classification Report:

              precision    recall  f1-score   support

           0       0.58      0.56      0.57     27420
           1       0.57      0.60      0.59     27280

    accuracy                           0.58     54700
   macro avg       0.58      0.58      0.58     54700
weighted avg       0.58      0.58      0.58     54700

